Diffstat (limited to 'net'): 561 files changed, 21748 insertions, 7563 deletions
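A recurring pattern in the Bluetooth hunks below is the switch from peeking at kref internals with atomic_read(&obj->kref.refcount) to the kref_read() accessor (struct kref is now backed by refcount_t). The following is a minimal, hypothetical sketch of that accessor usage, not code taken from this merge; struct foo, foo_get()/foo_put() and the pr_debug message are made up for illustration.

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/printk.h>
#include <linux/slab.h>

struct foo {
	struct kref kref;
};

static void foo_release(struct kref *kref)
{
	struct foo *f = container_of(kref, struct foo, kref);

	kfree(f);
}

static void foo_get(struct foo *f)
{
	/* old style read was atomic_read(&f->kref.refcount); with kref
	 * backed by refcount_t, use the kref_read() accessor instead
	 */
	pr_debug("foo %p orig refcnt %u\n", f, kref_read(&f->kref));
	kref_get(&f->kref);
}

static void foo_put(struct foo *f)
{
	pr_debug("foo %p orig refcnt %u\n", f, kref_read(&f->kref));
	kref_put(&f->kref, foo_release);
}

The same substitution appears verbatim in the net/bluetooth/6lowpan.c, a2mp.c, amp.c and l2cap_core.c hunks of the diff below.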
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c index 7008d53e455c..4fa2fdda174d 100644 --- a/net/6lowpan/nhc.c +++ b/net/6lowpan/nhc.c @@ -27,8 +27,8 @@ static int lowpan_nhc_insert(struct lowpan_nhc *nhc) /* Figure out where to put new node */ while (*new) { - struct lowpan_nhc *this = container_of(*new, struct lowpan_nhc, - node); + struct lowpan_nhc *this = rb_entry(*new, struct lowpan_nhc, + node); int result, len_dif, len; len_dif = nhc->idlen - this->idlen; @@ -69,8 +69,8 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb) const u8 *nhcid_skb_ptr = skb->data; while (node) { - struct lowpan_nhc *nhc = container_of(node, struct lowpan_nhc, - node); + struct lowpan_nhc *nhc = rb_entry(node, struct lowpan_nhc, + node); u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN]; int result, i; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 10da6c588bf8..e97ab824e368 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -671,7 +671,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev, return 0; } -static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +static void vlan_dev_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct vlan_pcpu_stats *p; u32 rx_errors = 0, tx_dropped = 0; @@ -702,8 +703,6 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st } stats->rx_errors = rx_errors; stats->tx_dropped = tx_dropped; - - return stats; } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -792,8 +791,6 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, - .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, - .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_fdb_add = switchdev_port_fdb_add, .ndo_fdb_del = switchdev_port_fdb_del, .ndo_fdb_dump = switchdev_port_fdb_dump, diff --git a/net/Kconfig b/net/Kconfig index a1005007224c..102f781a0131 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -57,6 +57,7 @@ source "net/packet/Kconfig" source "net/unix/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" +source "net/smc/Kconfig" config INET bool "TCP/IP networking" @@ -258,10 +259,6 @@ config XPS config HWBM bool -config SOCK_CGROUP_DATA - bool - default n - config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS @@ -300,7 +297,8 @@ config BPF_JIT Note, admin should enable this feature changing: /proc/sys/net/core/bpf_jit_enable - /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_kallsyms (optional) config NET_FLOW_LIMIT bool @@ -393,6 +391,8 @@ source "net/9p/Kconfig" source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +source "net/psample/Kconfig" +source "net/ife/Kconfig" config LWTUNNEL bool "Network light weight tunnels" @@ -414,6 +414,10 @@ config DST_CACHE bool default n +config GRO_CELLS + bool + default n + config NET_DEVLINK tristate "Network physical/parent device Netlink interface" help diff --git a/net/Makefile b/net/Makefile index 4cafaa2b4667..9b681550e3a3 100644 --- a/net/Makefile +++ b/net/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211) += mac80211/ obj-$(CONFIG_TIPC) += tipc/ obj-$(CONFIG_NETLABEL) += netlabel/ obj-$(CONFIG_IUCV) += iucv/ +obj-$(CONFIG_SMC) += smc/ obj-$(CONFIG_RFKILL) += rfkill/ obj-$(CONFIG_NET_9P) += 9p/ obj-$(CONFIG_CAIF) += caif/ @@ -69,6 +70,8 @@ 
obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ obj-$(CONFIG_CEPH_LIB) += ceph/ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ +obj-$(CONFIG_PSAMPLE) += psample/ +obj-$(CONFIG_NET_IFE) += ife/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ obj-$(CONFIG_MPLS) += mpls/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 10d2bdce686e..465cc24b41e5 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1656,7 +1656,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) ddp->deh_dport = usat->sat_port; ddp->deh_sport = at->src_port; - SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len); + SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len); err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) { @@ -1720,7 +1720,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) */ aarp_send_ddp(dev, skb, &usat->sat_addr, NULL); } - SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len); + SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len); out: release_sock(sk); diff --git a/net/atm/lec.c b/net/atm/lec.c index 019557d0a11d..09cfe87f0a44 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1059,7 +1059,9 @@ static void __exit lane_module_cleanup(void) { int i; +#ifdef CONFIG_PROC_FS remove_proc_entry("lec", atm_proc_root); +#endif deregister_atm_ioctl(&lane_ioctl_ops); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 3b3b1a292ec8..a190800572bd 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -451,7 +451,7 @@ static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr, return; } if (end_of_tlvs - tlvs != 0) - pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n", + pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n", dev->name, end_of_tlvs - tlvs); } diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 4855d18a8511..038b109b2be7 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -264,7 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason) { ax25_clear_queues(ax25); - if (!sock_flag(ax25->sk, SOCK_DESTROY)) + if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) ax25_stop_heartbeat(ax25); ax25_stop_t1timer(ax25); ax25_stop_t2timer(ax25); diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index f724d3c98a81..915987bc6d29 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 623d04302aa2..44fd073b7546 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 3b5b69cdd12b..29f6312f9bf1 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f00f666e2ccd..7c3d994e90d8 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index b9f3550faaf7..ae2ab526bdb1 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 2ac612d7bab4..0acd081dd286 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index 83b77639729e..dd7c4b647e6b 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index f2fb2f05b6bf..b90c9903e246 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index be17c0b1369e..376ead280ab9 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 38b9aab83fc0..03a35c9f456d 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 4c4d45caa422..2068770b542d 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 032271421a20..2b070c7e31da 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 0e6e9d09078c..cc262c9d97e0 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index e7f690b571ea..ba8420d8a992 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich * @@ -449,7 +449,6 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); - soft_iface->last_rx = jiffies; netif_rx(skb); out: diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 1ae93e46fb98..e157986bd01c 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -20,6 +20,8 @@ #include "main.h" +#include <linux/compiler.h> +#include <linux/stddef.h> #include <linux/types.h> struct net_device; @@ -27,6 +29,22 @@ struct netlink_callback; struct seq_file; struct sk_buff; +/** + * batadv_bla_is_loopdetect_mac - check if the mac address is from a loop detect + * frame sent by bridge loop avoidance + * @mac: mac address to check + * + * Return: true if the it looks like a loop detect frame + * (mac starts with BA:BE), false otherwise + */ +static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac) +{ + if (mac[0] == 0xba && mac[1] == 0xbe) + return true; + + return false; +} + #ifdef CONFIG_BATMAN_ADV_BLA bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, bool is_bcast); diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 77925504379d..e32ad47c6efd 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -19,7 +19,7 @@ #include "main.h" #include <linux/debugfs.h> -#include <linux/device.h> +#include <linux/err.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/fs.h> diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index e49121ee55f6..9c5d4a65b98c 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 49576c5a3fe3..1bfd1dbc2feb 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: * * Antonio Quartulli * @@ -1050,7 +1050,6 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, bat_priv->soft_iface); bat_priv->stats.rx_packets++; bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size; - bat_priv->soft_iface->last_rx = jiffies; netif_rx(skb_new); batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n"); diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 813ecea96cf9..ec364a3c1c66 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2017 B.A.T.M.A.N. 
contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 11149e5be4e0..11a23fd6e1a0 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index b95f619606af..1a2d6c308745 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 52b8bd6ec431..de9955d5224d 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 859166d03561..3baa3d466e5e 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 21184810d89f..5db2e43e3775 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 8a5e1ddf1175..0a6a97d201f2 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 61a431a9772b..e348f76ea8c1 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index d6309a423629..9f9890ff7a22 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index a0a0fdb85805..b5f7e13918ac 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 557a7044cfbc..0c905e91c5e2 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2017 B.A.T.M.A.N. 
contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index b310f381ae02..6308c9f0fd96 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index e44a7da51431..f3fec40aae86 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index c73c31769aba..4ef4bde2cc2d 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 3284a7b0325d..7a2b9f4da078 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d46415edd3be..5000c540614d 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index a6cc8040a21d..57a8103dbce7 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.5" +#define BATADV_SOURCE_VERSION "2017.0" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 090a69fc342e..952ba81a565b 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 2cddaf52a21d..2a78cddab0e9 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 062738163bdc..ab13b4d58733 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 52eb16281aba..f1cd8c5da966 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2017 B.A.T.M.A.N. 
contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index ab5a3bf0765f..e1f6fc72fe3e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index d6d7fb4ec5d5..c66efb81d2f4 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8f3b2969cc4e..8e2a4b205257 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index ebc56183f358..d94220a6d21a 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 7a36bcfa0ba0..8e8a5db197cb 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 6713bdf414cd..7fd740b6e36d 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -719,20 +719,19 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, len = skb->len; res = batadv_send_skb_to_orig(skb, orig_node, recv_if); - if (res == NET_XMIT_SUCCESS) - ret = NET_RX_SUCCESS; - - /* skb was consumed */ - skb = NULL; /* translate transmit result into receive result */ if (res == NET_XMIT_SUCCESS) { + ret = NET_RX_SUCCESS; /* skb was transmitted and consumed */ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, len + ETH_HLEN); } + /* skb was consumed */ + skb = NULL; + put_orig_node: batadv_orig_node_put(orig_node); free_skb: diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 05c3ff42e181..5ede16c32f15 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 49021b7124f3..1489ec27daff 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -77,6 +77,7 @@ int batadv_send_skb_packet(struct sk_buff *skb, { struct batadv_priv *bat_priv; struct ethhdr *ethhdr; + int ret; bat_priv = netdev_priv(hard_iface->soft_iface); @@ -115,7 +116,8 @@ int batadv_send_skb_packet(struct sk_buff *skb, * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. */ - return dev_queue_xmit(skb); + ret = dev_queue_xmit(skb); + return net_xmit_eval(ret); send_skb_err: kfree_skb(skb); return NET_XMIT_DROP; diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index a94e1e8639ca..f21166d10323 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 7b3494ae6ad9..5d099b2e6cfc 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -258,7 +258,8 @@ static int batadv_interface_tx(struct sk_buff *skb, ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ - if (!is_multicast_ether_addr(ethhdr->h_source)) { + if (!is_multicast_ether_addr(ethhdr->h_source) && + !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); @@ -481,8 +482,6 @@ void batadv_interface_rx(struct net_device *soft_iface, batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); - soft_iface->last_rx = jiffies; - /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index ec303ddbf647..639c3abb214a 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 17c844196eb2..0ae8b30e4eaa 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index c76021b4e198..e487412e256b 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 981e8c5b07e9..c94ebdecdc3d 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2017 B.A.T.M.A.N. 
contributors: * * Edo Monticelli, Antonio Quartulli * @@ -23,7 +23,7 @@ #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> -#include <linux/device.h> +#include <linux/err.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/if_ether.h> diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index ba922c425e56..a8ada5c123bd 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 30ecbfb40adf..6077a87d46f0 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -3714,7 +3714,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags, { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common_entry; - u16 changed_num = 0; struct hlist_head *head; u32 i; @@ -3736,7 +3735,6 @@ static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags, continue; tt_common_entry->flags &= ~flags; } - changed_num++; if (!count) continue; diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 783fdba84db2..411d586191da 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index a783420356ae..1d9e267caec9 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index e4369b547b43..4d01400ada30 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index e913aee28c98..8f64a5c01345 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 1904a93f47d5..d491529332f4 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -920,7 +920,7 @@ static void chan_close_cb(struct l2cap_chan *chan) BT_DBG("dev %p removing %speer %p", dev, last ? 
"last " : "1 ", peer); BT_DBG("chan %p orig refcnt %d", chan, - atomic_read(&chan->kref.refcount)); + kref_read(&chan->kref)); l2cap_chan_put(chan); break; diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 5f123c3320a7..f0095fd79818 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -810,7 +810,7 @@ static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked) /* AMP Manager functions */ struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr) { - BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount)); + BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref)); kref_get(&mgr->kref); @@ -833,7 +833,7 @@ static void amp_mgr_destroy(struct kref *kref) int amp_mgr_put(struct amp_mgr *mgr) { - BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount)); + BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref)); return kref_put(&mgr->kref, &_mgr_destroy); } diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1aff2da9bc74..cfb2faba46de 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -245,7 +245,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (err == 0) { sock_recv_ts_and_drops(msg, sk, skb); - if (bt_sk(sk)->skb_msg_name) + if (msg->msg_name && bt_sk(sk)->skb_msg_name) bt_sk(sk)->skb_msg_name(skb, msg->msg_name, &msg->msg_namelen); } diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index e32f34189007..02a4ccc04e1e 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -24,7 +24,7 @@ void amp_ctrl_get(struct amp_ctrl *ctrl) { BT_DBG("ctrl %p orig refcnt %d", ctrl, - atomic_read(&ctrl->kref.refcount)); + kref_read(&ctrl->kref)); kref_get(&ctrl->kref); } @@ -42,7 +42,7 @@ static void amp_ctrl_destroy(struct kref *kref) int amp_ctrl_put(struct amp_ctrl *ctrl) { BT_DBG("ctrl %p orig refcnt %d", ctrl, - atomic_read(&ctrl->kref.refcount)); + kref_read(&ctrl->kref)); return kref_put(&ctrl->kref, &_ctrl_destroy); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e17aacbc5630..0b4dba08a14e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4749,7 +4749,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, case LE_ADV_SCAN_RSP: break; default: - BT_ERR_RATELIMITED("Unknown advetising packet type: 0x%02x", + BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", type); return; } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 48f9471e7c85..f64d6566021f 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -851,7 +851,7 @@ static int hci_sock_release(struct socket *sock) if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { - /* When releasing an user channel exclusive access, + /* When releasing a user channel exclusive access, * call hci_dev_do_close directly instead of calling * hci_dev_close to ensure the exclusive access will * be released and the controller brought back down. @@ -1172,7 +1172,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, /* In case the transport is already up and * running, clear the error here. * - * This can happen when opening an user + * This can happen when opening a user * channel and HCI_AUTO_OFF grace period * is still active. 
*/ @@ -1190,7 +1190,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * this socket will transition from a raw socket into - * an user channel socket. For a clean transition, send + * a user channel socket. For a clean transition, send * the close notification first. */ skb = create_monitor_ctrl_close(sk); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ce0b5dd01953..fc7f321a3823 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -481,14 +481,14 @@ static void l2cap_chan_destroy(struct kref *kref) void l2cap_chan_hold(struct l2cap_chan *c) { - BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount)); + BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref)); kref_get(&c->kref); } void l2cap_chan_put(struct l2cap_chan *c) { - BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount)); + BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref)); kref_put(&c->kref, l2cap_chan_destroy); } diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 0aefc011b668..40b1ede527ca 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o br_netlink.o + br_stp_if.o br_stp_timer.o br_netlink.o \ + br_netlink_tunnel.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o @@ -18,7 +19,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o -bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o +bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ed3b3192fb00..ea71513fca21 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -79,7 +79,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_multicast_flood(mdst, skb, false, true); else br_flood(br, skb, BR_PKT_MULTICAST, false, true); - } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) { + } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) { br_forward(dst->dst, skb, false, true); } else { br_flood(br, skb, BR_PKT_UNICAST, false, true); @@ -153,8 +153,8 @@ static int br_dev_stop(struct net_device *dev) return 0; } -static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) +static void br_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct net_bridge *br = netdev_priv(dev); struct pcpu_sw_netstats tmp, sum = { 0 }; @@ -178,8 +178,6 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, stats->tx_packets = sum.tx_packets; stats->rx_bytes = sum.rx_bytes; stats->rx_packets = sum.rx_packets; - - return stats; } static int br_change_mtu(struct net_device *dev, int new_mtu) @@ -349,8 +347,6 @@ static const struct net_device_ops br_netdev_ops = { .ndo_add_slave = br_add_slave, .ndo_del_slave = br_del_slave, .ndo_fix_features = br_fix_features, - .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, - .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, .ndo_fdb_dump = br_fdb_dump, @@ -415,4 +411,5 @@ void br_dev_setup(struct net_device *dev) br_netfilter_rtable_init(br); br_stp_timer_init(br); 
br_multicast_init(br); + INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup); } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e4a4176171c9..4f598dc2d916 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -28,9 +28,6 @@ #include "br_private.h" static struct kmem_cache *br_fdb_cache __read_mostly; -static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, - const unsigned char *addr, - __u16 vid); static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, @@ -68,7 +65,7 @@ static inline unsigned long hold_time(const struct net_bridge *br) static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { - return !fdb->is_static && + return !fdb->is_static && !fdb->added_by_external_learn && time_before_eq(fdb->updated + hold_time(br), jiffies); } @@ -86,6 +83,47 @@ static void fdb_rcu_free(struct rcu_head *head) kmem_cache_free(br_fdb_cache, ent); } +static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, + const unsigned char *addr, + __u16 vid) +{ + struct net_bridge_fdb_entry *f; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + hlist_for_each_entry_rcu(f, head, hlist) + if (ether_addr_equal(f->addr.addr, addr) && f->vlan_id == vid) + break; + + return f; +} + +/* requires bridge hash_lock */ +static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, + const unsigned char *addr, + __u16 vid) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; + struct net_bridge_fdb_entry *fdb; + + WARN_ON_ONCE(!br_hash_lock_held(br)); + + rcu_read_lock(); + fdb = fdb_find_rcu(head, addr, vid); + rcu_read_unlock(); + + return fdb; +} + +struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, + const unsigned char *addr, + __u16 vid) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; + + return fdb_find_rcu(head, addr, vid); +} + /* When a static FDB entry is added, the mac address from the entry is * added to the bridge private HW address list and all required ports * are then updated with the new information. @@ -154,7 +192,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) if (f->added_by_external_learn) fdb_del_external_learn(f); - hlist_del_rcu(&f->hlist); + hlist_del_init_rcu(&f->hlist); fdb_notify(br, f, RTM_DELNEIGH); call_rcu(&f->rcu, fdb_rcu_free); } @@ -198,11 +236,10 @@ void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid) { - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *f; spin_lock_bh(&br->hash_lock); - f = fdb_find(head, addr, vid); + f = br_fdb_find(br, addr, vid); if (f && f->is_local && !f->added_by_user && f->dst == p) fdb_delete_local(br, p, f); spin_unlock_bh(&br->hash_lock); @@ -266,7 +303,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) spin_lock_bh(&br->hash_lock); /* If old entry was unassociated with any port, then delete it. 
*/ - f = __br_fdb_get(br, br->dev->dev_addr, 0); + f = br_fdb_find(br, br->dev->dev_addr, 0); if (f && f->is_local && !f->dst && !f->added_by_user) fdb_delete_local(br, NULL, f); @@ -281,7 +318,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - f = __br_fdb_get(br, br->dev->dev_addr, v->vid); + f = br_fdb_find(br, br->dev->dev_addr, v->vid); if (f && f->is_local && !f->dst && !f->added_by_user) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, v->vid); @@ -290,34 +327,43 @@ out: spin_unlock_bh(&br->hash_lock); } -void br_fdb_cleanup(unsigned long _data) +void br_fdb_cleanup(struct work_struct *work) { - struct net_bridge *br = (struct net_bridge *)_data; + struct net_bridge *br = container_of(work, struct net_bridge, + gc_work.work); unsigned long delay = hold_time(br); - unsigned long next_timer = jiffies + br->ageing_time; + unsigned long work_delay = delay; + unsigned long now = jiffies; int i; - spin_lock(&br->hash_lock); for (i = 0; i < BR_HASH_SIZE; i++) { struct net_bridge_fdb_entry *f; struct hlist_node *n; + if (!br->hash[i].first) + continue; + + spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { unsigned long this_timer; + if (f->is_static) continue; if (f->added_by_external_learn) continue; this_timer = f->updated + delay; - if (time_before_eq(this_timer, jiffies)) + if (time_after(this_timer, now)) + work_delay = min(work_delay, this_timer - now); + else fdb_delete(br, f); - else if (time_before(this_timer, next_timer)) - next_timer = this_timer; } + spin_unlock_bh(&br->hash_lock); + cond_resched(); } - spin_unlock(&br->hash_lock); - mod_timer(&br->gc_timer, round_jiffies_up(next_timer)); + /* Cleanup minimum 10 milliseconds apart */ + work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10)); + mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } /* Completely flush all dynamic entries in forwarding database.*/ @@ -371,26 +417,6 @@ void br_fdb_delete_by_port(struct net_bridge *br, spin_unlock_bh(&br->hash_lock); } -/* No locking or refcounting, assumes caller has rcu_read_lock */ -struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, - const unsigned char *addr, - __u16 vid) -{ - struct net_bridge_fdb_entry *fdb; - - hlist_for_each_entry_rcu(fdb, - &br->hash[br_mac_hash(addr, vid)], hlist) { - if (ether_addr_equal(fdb->addr.addr, addr) && - fdb->vlan_id == vid) { - if (unlikely(has_expired(br, fdb))) - break; - return fdb; - } - } - - return NULL; -} - #if IS_ENABLED(CONFIG_ATM_LANE) /* Interface used by ATM LANE hook to test * if an addr is on some other bridge port */ @@ -405,7 +431,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) if (!port) ret = 0; else { - fdb = __br_fdb_get(port->br, addr, 0); + fdb = br_fdb_find_rcu(port->br, addr, 0); ret = fdb && fdb->dst && fdb->dst->dev != dev && fdb->dst->state == BR_STATE_FORWARDING; } @@ -467,34 +493,6 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, return num; } -static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, - const unsigned char *addr, - __u16 vid) -{ - struct net_bridge_fdb_entry *fdb; - - hlist_for_each_entry(fdb, head, hlist) { - if (ether_addr_equal(fdb->addr.addr, addr) && - fdb->vlan_id == vid) - return fdb; - } - return NULL; -} - -static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, - const unsigned char *addr, - __u16 vid) -{ - struct net_bridge_fdb_entry 
*fdb; - - hlist_for_each_entry_rcu(fdb, head, hlist) { - if (ether_addr_equal(fdb->addr.addr, addr) && - fdb->vlan_id == vid) - return fdb; - } - return NULL; -} - static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, struct net_bridge_port *source, const unsigned char *addr, @@ -528,7 +526,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, if (!is_valid_ether_addr(addr)) return -EINVAL; - fdb = fdb_find(head, addr, vid); + fdb = br_fdb_find(br, addr, vid); if (fdb) { /* it is okay to have multiple ports with same * address, just use the first one. @@ -585,12 +583,15 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", source->dev->name, addr, vid); } else { + unsigned long now = jiffies; + /* fastpath: update of existing entry */ if (unlikely(source != fdb->dst)) { fdb->dst = source; fdb_modified = true; } - fdb->updated = jiffies; + if (now != fdb->updated) + fdb->updated = now; if (unlikely(added_by_user)) fdb->added_by_user = 1; if (unlikely(fdb_modified)) @@ -598,7 +599,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } } else { spin_lock(&br->hash_lock); - if (likely(!fdb_find(head, addr, vid))) { + if (likely(!fdb_find_rcu(head, addr, vid))) { fdb = fdb_create(head, source, addr, vid, 0, 0); if (fdb) { if (unlikely(added_by_user)) @@ -782,7 +783,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, return -EINVAL; } - fdb = fdb_find(head, addr, vid); + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) return -ENOENT; @@ -929,55 +930,30 @@ out: return err; } -static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, - u16 vid) -{ - struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; - struct net_bridge_fdb_entry *fdb; - - fdb = fdb_find(head, addr, vid); - if (!fdb) - return -ENOENT; - - fdb_delete(br, fdb); - return 0; -} - -static int __br_fdb_delete_by_addr(struct net_bridge *br, - const unsigned char *addr, u16 vid) -{ - int err; - - spin_lock_bh(&br->hash_lock); - err = fdb_delete_by_addr(br, addr, vid); - spin_unlock_bh(&br->hash_lock); - - return err; -} - -static int fdb_delete_by_addr_and_port(struct net_bridge_port *p, +static int fdb_delete_by_addr_and_port(struct net_bridge *br, + const struct net_bridge_port *p, const u8 *addr, u16 vlan) { - struct net_bridge *br = p->br; - struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)]; struct net_bridge_fdb_entry *fdb; - fdb = fdb_find(head, addr, vlan); + fdb = br_fdb_find(br, addr, vlan); if (!fdb || fdb->dst != p) return -ENOENT; fdb_delete(br, fdb); + return 0; } -static int __br_fdb_delete(struct net_bridge_port *p, +static int __br_fdb_delete(struct net_bridge *br, + const struct net_bridge_port *p, const unsigned char *addr, u16 vid) { int err; - spin_lock_bh(&p->br->hash_lock); - err = fdb_delete_by_addr_and_port(p, addr, vid); - spin_unlock_bh(&p->br->hash_lock); + spin_lock_bh(&br->hash_lock); + err = fdb_delete_by_addr_and_port(br, p, addr, vid); + spin_unlock_bh(&br->hash_lock); return err; } @@ -990,7 +966,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; - struct net_bridge *br = NULL; + struct net_bridge *br; int err; if (dev->priv_flags & IFF_EBRIDGE) { @@ -1004,6 +980,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr 
*tb[], return -EINVAL; } vg = nbp_vlan_group(p); + br = p->br; } if (vid) { @@ -1013,30 +990,20 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (dev->priv_flags & IFF_EBRIDGE) - err = __br_fdb_delete_by_addr(br, addr, vid); - else - err = __br_fdb_delete(p, addr, vid); + err = __br_fdb_delete(br, p, addr, vid); } else { err = -ENOENT; - if (dev->priv_flags & IFF_EBRIDGE) - err = __br_fdb_delete_by_addr(br, addr, 0); - else - err &= __br_fdb_delete(p, addr, 0); - + err &= __br_fdb_delete(br, p, addr, 0); if (!vg || !vg->num_vlans) - goto out; + return err; list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - if (dev->priv_flags & IFF_EBRIDGE) - err = __br_fdb_delete_by_addr(br, addr, v->vid); - else - err &= __br_fdb_delete(p, addr, v->vid); + err &= __br_fdb_delete(br, p, addr, v->vid); } } -out: + return err; } @@ -1107,7 +1074,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, spin_lock_bh(&br->hash_lock); head = &br->hash[br_mac_hash(addr, vid)]; - fdb = fdb_find(head, addr, vid); + fdb = br_fdb_find(br, addr, vid); if (!fdb) { fdb = fdb_create(head, p, addr, vid, 0, 0); if (!fdb) { @@ -1135,15 +1102,13 @@ err_unlock: int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid) { - struct hlist_head *head; struct net_bridge_fdb_entry *fdb; int err = 0; ASSERT_RTNL(); spin_lock_bh(&br->hash_lock); - head = &br->hash[br_mac_hash(addr, vid)]; - fdb = fdb_find(head, addr, vid); + fdb = br_fdb_find(br, addr, vid); if (fdb && fdb->added_by_external_learn) fdb_delete(br, fdb); else diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 7cb41aee4c82..902af6ba481c 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -80,7 +80,7 @@ static void __br_forward(const struct net_bridge_port *to, int br_hook; vg = nbp_vlan_group_rcu(to); - skb = br_handle_vlan(to->br, vg, skb); + skb = br_handle_vlan(to->br, to, vg, skb); if (!skb) return; @@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood unicast traffic to ports that turn it off */ if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) continue; + /* Do not flood if mc off, except for traffic we originate */ if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD)) + !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) continue; /* Do not flood to ports that enable proxy ARP */ @@ -220,6 +221,31 @@ out: } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING +static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, + const unsigned char *addr, bool local_orig) +{ + struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; + const unsigned char *src = eth_hdr(skb)->h_source; + + if (!should_deliver(p, skb)) + return; + + /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */ + if (skb->dev == p->dev && ether_addr_equal(src, addr)) + return; + + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) { + dev->stats.tx_dropped++; + return; + } + + if (!is_broadcast_ether_addr(addr)) + memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN); + + __br_forward(p, skb, local_orig); +} + /* called with rcu_read_lock */ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, @@ -241,10 +267,20 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) : NULL; - port = (unsigned long)lport > (unsigned long)rport ? 
- lport : rport; + if ((unsigned long)lport > (unsigned long)rport) { + port = lport; + + if (port->flags & BR_MULTICAST_TO_UNICAST) { + maybe_deliver_addr(lport, skb, p->eth_addr, + local_orig); + goto delivered; + } + } else { + port = rport; + } prev = maybe_deliver(prev, port, skb, local_orig); +delivered: if (IS_ERR(prev)) goto out; if (prev == port) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ed0dd3340084..8ac1770aa222 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -313,7 +313,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) br_vlan_flush(br); br_multicast_dev_del(br); - del_timer_sync(&br->gc_timer); + cancel_delayed_work_sync(&br->gc_work); br_sysfs_delbr(br->dev); unregister_netdevice_queue(br->dev, head); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 855b72fbe1da..236f34244dbe 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -21,6 +21,7 @@ #include <linux/export.h> #include <linux/rculist.h> #include "br_private.h" +#include "br_private_tunnel.h" /* Hook for brouter */ br_should_route_hook_t __rcu *br_should_route_hook __read_mostly; @@ -57,7 +58,7 @@ static int br_pass_frame_up(struct sk_buff *skb) indev = skb->dev; skb->dev = brdev; - skb = br_handle_vlan(br, vg, skb); + skb = br_handle_vlan(br, NULL, vg, skb); if (!skb) return NET_RX_DROP; /* update the multicast stats if the packet is IGMP/MLD */ @@ -113,7 +114,7 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, return; } - f = __br_fdb_get(br, n->ha, vid); + f = br_fdb_find_rcu(br, n->ha, vid); if (f && ((p->flags & BR_PROXYARP) || (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, @@ -188,16 +189,19 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } break; case BR_PKT_UNICAST: - dst = __br_fdb_get(br, dest, vid); + dst = br_fdb_find_rcu(br, dest, vid); default: break; } if (dst) { + unsigned long now = jiffies; + if (dst->is_local) return br_pass_frame_up(skb); - dst->used = jiffies; + if (now != dst->used) + dst->used = now; br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) @@ -261,6 +265,11 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return RX_HANDLER_CONSUMED; p = br_port_get_rcu(skb->dev); + if (p->flags & BR_VLAN_TUNNEL) { + if (br_handle_ingress_vlan_tunnel(skb, p, + nbp_vlan_group_rcu(p))) + goto drop; + } if (unlikely(is_link_local_ether_addr(dest))) { u16 fwd_mask = p->br->group_fwd_mask_required; diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index da8157c57eb1..7970f8540cbb 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); - b.gc_timer_value = br_timer_value(&br->gc_timer); + b.gc_timer_value = br_timer_value(&br->gc_work.timer); rcu_read_unlock(); if (copy_to_user((void __user *)args[1], &b, sizeof(b))) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 7dbc80d01eb0..056e6ac49d8f 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -531,7 +531,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } - p = br_multicast_new_port_group(port, group, *pp, state); + p = br_multicast_new_port_group(port, group, *pp, state, NULL); 
if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b30e77e8427c..b760f2620abf 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -27,6 +27,7 @@ #include <linux/inetdevice.h> #include <linux/mroute.h> #include <net/ip.h> +#include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/mld.h> @@ -43,12 +44,15 @@ static void br_multicast_add_router(struct net_bridge *br, static void br_ip4_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, - __u16 vid); + __u16 vid, + const unsigned char *src); + +static void __del_port_router(struct net_bridge_port *p); #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, const struct in6_addr *group, - __u16 vid); + __u16 vid, const unsigned char *src); #endif unsigned int br_mdb_rehash_seq; @@ -540,7 +544,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, break; case 2: mld2q = (struct mld2_query *)icmp6_hdr(skb); - mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval)); + mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval)); mld2q->mld2q_type = ICMPV6_MGM_QUERY; mld2q->mld2q_code = 0; mld2q->mld2q_cksum = 0; @@ -711,7 +715,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char flags) + unsigned char flags, + const unsigned char *src) { struct net_bridge_port_group *p; @@ -726,12 +731,32 @@ struct net_bridge_port_group *br_multicast_new_port_group( hlist_add_head(&p->mglist, &port->mglist); setup_timer(&p->timer, br_multicast_port_group_expired, (unsigned long)p); + + if (src) + memcpy(p->eth_addr, src, ETH_ALEN); + else + memset(p->eth_addr, 0xff, ETH_ALEN); + return p; } +static bool br_port_group_equal(struct net_bridge_port_group *p, + struct net_bridge_port *port, + const unsigned char *src) +{ + if (p->port != port) + return false; + + if (!(port->flags & BR_MULTICAST_TO_UNICAST)) + return true; + + return ether_addr_equal(src, p->eth_addr); +} + static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group) + struct br_ip *group, + const unsigned char *src) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; @@ -758,13 +783,13 @@ static int br_multicast_add_group(struct net_bridge *br, for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { - if (p->port == port) + if (br_port_group_equal(p, port, src)) goto found; if ((unsigned long)p->port < (unsigned long)port) break; } - p = br_multicast_new_port_group(port, group, *pp, 0); + p = br_multicast_new_port_group(port, group, *pp, 0, src); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); @@ -783,7 +808,8 @@ err: static int br_ip4_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, - __u16 vid) + __u16 vid, + const unsigned char *src) { struct br_ip br_group; @@ -794,14 +820,15 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, br_group.proto = htons(ETH_P_IP); br_group.vid = vid; - return br_multicast_add_group(br, port, &br_group); + return br_multicast_add_group(br, port, &br_group, src); } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, const struct in6_addr *group, - __u16 vid) + __u16 vid, 
+ const unsigned char *src) { struct br_ip br_group; @@ -812,7 +839,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; - return br_multicast_add_group(br, port, &br_group); + return br_multicast_add_group(br, port, &br_group, src); } #endif @@ -824,16 +851,10 @@ static void br_multicast_router_expired(unsigned long data) spin_lock(&br->multicast_lock); if (port->multicast_router == MDB_RTR_TYPE_DISABLED || port->multicast_router == MDB_RTR_TYPE_PERM || - timer_pending(&port->multicast_router_timer) || - hlist_unhashed(&port->rlist)) + timer_pending(&port->multicast_router_timer)) goto out; - hlist_del_init_rcu(&port->rlist); - br_rtr_notify(br->dev, port, RTM_DELMDB); - /* Don't allow timer refresh if the router expired */ - if (port->multicast_router == MDB_RTR_TYPE_TEMP) - port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - + __del_port_router(port); out: spin_unlock(&br->multicast_lock); } @@ -982,6 +1003,18 @@ static void br_ip6_multicast_port_query_expired(unsigned long data) } #endif +static void br_mc_disabled_update(struct net_device *dev, bool value) +{ + struct switchdev_attr attr = { + .orig_dev = dev, + .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, + .flags = SWITCHDEV_F_DEFER, + .u.mc_disabled = value, + }; + + switchdev_port_attr_set(dev, &attr); +} + int br_multicast_add_port(struct net_bridge_port *port) { port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; @@ -994,6 +1027,8 @@ int br_multicast_add_port(struct net_bridge_port *port) setup_timer(&port->ip6_own_query.timer, br_ip6_multicast_port_query_expired, (unsigned long)port); #endif + br_mc_disabled_update(port->dev, port->br->multicast_disabled); + port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!port->mcast_stats) return -ENOMEM; @@ -1061,13 +1096,8 @@ void br_multicast_disable_port(struct net_bridge_port *port) if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_del_pg(br, pg); - if (!hlist_unhashed(&port->rlist)) { - hlist_del_init_rcu(&port->rlist); - br_rtr_notify(br->dev, port, RTM_DELMDB); - /* Don't allow timer refresh if disabling */ - if (port->multicast_router == MDB_RTR_TYPE_TEMP) - port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - } + __del_port_router(port); + del_timer(&port->multicast_router_timer); del_timer(&port->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) @@ -1081,6 +1111,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, struct sk_buff *skb, u16 vid) { + const unsigned char *src; struct igmpv3_report *ih; struct igmpv3_grec *grec; int i; @@ -1121,12 +1152,14 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, continue; } + src = eth_hdr(skb)->h_source; if ((type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE) && ntohs(grec->grec_nsrcs) == 0) { - br_ip4_multicast_leave_group(br, port, group, vid); + br_ip4_multicast_leave_group(br, port, group, vid, src); } else { - err = br_ip4_multicast_add_group(br, port, group, vid); + err = br_ip4_multicast_add_group(br, port, group, vid, + src); if (err) break; } @@ -1141,6 +1174,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, struct sk_buff *skb, u16 vid) { + const unsigned char *src; struct icmp6hdr *icmp6h; struct mld2_grec *grec; int i; @@ -1188,14 +1222,16 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, continue; } + src = eth_hdr(skb)->h_source; if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && ntohs(*nsrcs) == 0) { 
br_ip6_multicast_leave_group(br, port, &grec->grec_mca, - vid); + vid, src); } else { err = br_ip6_multicast_add_group(br, port, - &grec->grec_mca, vid); + &grec->grec_mca, vid, + src); if (err) break; } @@ -1281,6 +1317,19 @@ br_multicast_update_query_timer(struct net_bridge *br, mod_timer(&query->timer, jiffies + br->multicast_querier_interval); } +static void br_port_mc_router_state_change(struct net_bridge_port *p, + bool is_mc_router) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_PORT_MROUTER, + .flags = SWITCHDEV_F_DEFER, + .u.mrouter = is_mc_router, + }; + + switchdev_port_attr_set(p->dev, &attr); +} + /* * Add port to router_list * list is maintained ordered by pointer value @@ -1306,6 +1355,7 @@ static void br_multicast_add_router(struct net_bridge *br, else hlist_add_head_rcu(&port->rlist, &br->router_list); br_rtr_notify(br->dev, port, RTM_NEWMDB); + br_port_mc_router_state_change(port, true); } static void br_multicast_mark_router(struct net_bridge *br, @@ -1511,7 +1561,8 @@ br_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *group, struct bridge_mcast_other_query *other_query, - struct bridge_mcast_own_query *own_query) + struct bridge_mcast_own_query *own_query, + const unsigned char *src) { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; @@ -1535,7 +1586,7 @@ br_multicast_leave_group(struct net_bridge *br, for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { - if (p->port != port) + if (!br_port_group_equal(p, port, src)) continue; rcu_assign_pointer(*pp, p->next); @@ -1566,7 +1617,7 @@ br_multicast_leave_group(struct net_bridge *br, for (p = mlock_dereference(mp->ports, br); p != NULL; p = mlock_dereference(p->next, br)) { - if (p->port != port) + if (!br_port_group_equal(p, port, src)) continue; if (!hlist_unhashed(&p->mglist) && @@ -1617,7 +1668,8 @@ out: static void br_ip4_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, __be32 group, - __u16 vid) + __u16 vid, + const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; @@ -1632,14 +1684,15 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, br_group.vid = vid; br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query, - own_query); + own_query, src); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge *br, struct net_bridge_port *port, const struct in6_addr *group, - __u16 vid) + __u16 vid, + const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; @@ -1654,7 +1707,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, br_group.vid = vid; br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query, - own_query); + own_query, src); } #endif @@ -1712,6 +1765,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, u16 vid) { struct sk_buff *skb_trimmed = NULL; + const unsigned char *src; struct igmphdr *ih; int err; @@ -1731,13 +1785,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, } ih = igmp_hdr(skb); + src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->igmp = ih->type; switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip4_multicast_add_group(br, port, ih->group, vid); + err = br_ip4_multicast_add_group(br, port, ih->group, vid, src); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: err = 
br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); @@ -1746,7 +1801,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: - br_ip4_multicast_leave_group(br, port, ih->group, vid); + br_ip4_multicast_leave_group(br, port, ih->group, vid, src); break; } @@ -1766,6 +1821,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, u16 vid) { struct sk_buff *skb_trimmed = NULL; + const unsigned char *src; struct mld_msg *mld; int err; @@ -1785,8 +1841,10 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, switch (mld->mld_type) { case ICMPV6_MGM_REPORT: + src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); + err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid, + src); break; case ICMPV6_MLD2_REPORT: err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); @@ -1795,7 +1853,8 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); + src = eth_hdr(skb)->h_source; + br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src); break; } @@ -2004,6 +2063,11 @@ static void __del_port_router(struct net_bridge_port *p) return; hlist_del_init_rcu(&p->rlist); br_rtr_notify(p->br->dev, p, RTM_DELMDB); + br_port_mc_router_state_change(p, false); + + /* don't allow timer refresh */ + if (p->multicast_router == MDB_RTR_TYPE_TEMP) + p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) @@ -2081,6 +2145,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) if (br->multicast_disabled == !val) goto unlock; + br_mc_disabled_update(br->dev, !val); br->multicast_disabled = !val; if (br->multicast_disabled) goto unlock; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 8ca6a929bf12..95087e6e8258 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -399,7 +399,7 @@ bridged_dnat: br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish); + br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 71c7453268c1..a8f6acd23e30 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -20,6 +20,7 @@ #include "br_private.h" #include "br_private_stp.h" +#include "br_private_tunnel.h" static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg, u32 filter_mask) @@ -95,9 +96,10 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, u32 filter_mask) { struct net_bridge_vlan_group *vg = NULL; - struct net_bridge_port *p; + struct net_bridge_port *p = NULL; struct net_bridge *br; int num_vlan_infos; + size_t vinfo_sz = 0; rcu_read_lock(); if (br_port_exists(dev)) { @@ -110,8 +112,13 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask); rcu_read_unlock(); + if (p && (p->flags & BR_VLAN_TUNNEL)) + vinfo_sz += br_get_vlan_tunnel_info_size(vg); + /* Each VLAN is returned in bridge_vlan_info along with flags */ - return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); + vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct 
bridge_vlan_info)); + + return vinfo_sz; } static inline size_t br_port_info_size(void) @@ -123,10 +130,12 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -173,6 +182,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_ROOT_BLOCK)) || nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || + nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST, + !!(p->flags & BR_MULTICAST_TO_UNICAST)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || @@ -191,7 +202,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) || nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, p->topology_change_ack) || - nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending)) + nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || + nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & + BR_VLAN_TUNNEL))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -414,6 +427,9 @@ static int br_fill_ifinfo(struct sk_buff *skb, err = br_fill_ifvlaninfo_compressed(skb, vg); else err = br_fill_ifvlaninfo(skb, vg); + + if (port && (port->flags & BR_VLAN_TUNNEL)) + err = br_fill_vlan_tunnel_info(skb, vg); rcu_read_unlock(); if (err) goto nla_put_failure; @@ -514,60 +530,88 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p, return err; } +static int br_process_vlan_info(struct net_bridge *br, + struct net_bridge_port *p, int cmd, + struct bridge_vlan_info *vinfo_curr, + struct bridge_vlan_info **vinfo_last) +{ + if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK) + return -EINVAL; + + if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { + /* check if we are already processing a range */ + if (*vinfo_last) + return -EINVAL; + *vinfo_last = vinfo_curr; + /* don't allow range of pvids */ + if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID) + return -EINVAL; + return 0; + } + + if (*vinfo_last) { + struct bridge_vlan_info tmp_vinfo; + int v, err; + + if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END)) + return -EINVAL; + + if (vinfo_curr->vid <= (*vinfo_last)->vid) + return -EINVAL; + + memcpy(&tmp_vinfo, *vinfo_last, + sizeof(struct bridge_vlan_info)); + for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) { + tmp_vinfo.vid = v; + err = br_vlan_info(br, p, cmd, &tmp_vinfo); + if (err) + break; + } + *vinfo_last = NULL; + + return 0; + } + + return br_vlan_info(br, p, cmd, vinfo_curr); +} + static int br_afspec(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *af_spec, int cmd) { - struct bridge_vlan_info *vinfo_start = NULL; - struct bridge_vlan_info *vinfo = NULL; + struct bridge_vlan_info *vinfo_curr = NULL; + struct bridge_vlan_info *vinfo_last = NULL; struct nlattr *attr; - int err = 0; - int rem; + struct vtunnel_info 
tinfo_last = {}; + struct vtunnel_info tinfo_curr = {}; + int err = 0, rem; nla_for_each_nested(attr, af_spec, rem) { - if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO) - continue; - if (nla_len(attr) != sizeof(struct bridge_vlan_info)) - return -EINVAL; - vinfo = nla_data(attr); - if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK) - return -EINVAL; - if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - if (vinfo_start) + err = 0; + switch (nla_type(attr)) { + case IFLA_BRIDGE_VLAN_TUNNEL_INFO: + if (!(p->flags & BR_VLAN_TUNNEL)) return -EINVAL; - vinfo_start = vinfo; - /* don't allow range of pvids */ - if (vinfo_start->flags & BRIDGE_VLAN_INFO_PVID) - return -EINVAL; - continue; - } - - if (vinfo_start) { - struct bridge_vlan_info tmp_vinfo; - int v; - - if (!(vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END)) - return -EINVAL; - - if (vinfo->vid <= vinfo_start->vid) + err = br_parse_vlan_tunnel_info(attr, &tinfo_curr); + if (err) + return err; + err = br_process_vlan_tunnel_info(br, p, cmd, + &tinfo_curr, + &tinfo_last); + if (err) + return err; + break; + case IFLA_BRIDGE_VLAN_INFO: + if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; - - memcpy(&tmp_vinfo, vinfo_start, - sizeof(struct bridge_vlan_info)); - - for (v = vinfo_start->vid; v <= vinfo->vid; v++) { - tmp_vinfo.vid = v; - err = br_vlan_info(br, p, cmd, &tmp_vinfo); - if (err) - break; - } - vinfo_start = NULL; - } else { - err = br_vlan_info(br, p, cmd, vinfo); - } - if (err) + vinfo_curr = nla_data(attr); + err = br_process_vlan_info(br, p, cmd, vinfo_curr, + &vinfo_last); + if (err) + return err; break; + } } return err; @@ -586,6 +630,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 }, [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, + [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -626,8 +671,9 @@ static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[], /* Process bridge protocol info on port */ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) { - int err; unsigned long old_flags = p->flags; + bool br_vlan_tunnel_old = false; + int err; br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); @@ -636,9 +682,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); + br_vlan_tunnel_old = (p->flags & BR_VLAN_TUNNEL) ? 
true : false; + br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); + if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL)) + nbp_vlan_tunnel_info_flush(p); + if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); if (err) @@ -781,20 +833,6 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } -static int br_dev_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -{ - struct net_bridge *br = netdev_priv(dev); - - if (tb[IFLA_ADDRESS]) { - spin_lock_bh(&br->lock); - br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); - spin_unlock_bh(&br->lock); - } - - return register_netdevice(dev); -} - static int br_port_slave_changelink(struct net_device *brdev, struct net_device *dev, struct nlattr *tb[], @@ -1115,6 +1153,25 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return 0; } +static int br_dev_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_bridge *br = netdev_priv(dev); + int err; + + if (tb[IFLA_ADDRESS]) { + spin_lock_bh(&br->lock); + br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); + spin_unlock_bh(&br->lock); + } + + err = br_changelink(dev, tb, data); + if (err) + return err; + + return register_netdevice(dev); +} + static size_t br_get_size(const struct net_device *brdev) { return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ @@ -1190,7 +1247,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = br_timer_value(&br->gc_timer); + clockval = br_timer_value(&br->gc_work.timer); if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c new file mode 100644 index 000000000000..c913491495ab --- /dev/null +++ b/net/bridge/br_netlink_tunnel.c @@ -0,0 +1,294 @@ +/* + * Bridge per vlan tunnel port dst_metadata netlink control interface + * + * Authors: + * Roopa Prabhu <roopa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/etherdevice.h> +#include <net/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <uapi/linux/if_bridge.h> +#include <net/dst_metadata.h> + +#include "br_private.h" +#include "br_private_tunnel.h" + +static size_t __get_vlan_tinfo_size(void) +{ + return nla_total_size(0) + /* nest IFLA_BRIDGE_VLAN_TUNNEL_INFO */ + nla_total_size(sizeof(u32)) + /* IFLA_BRIDGE_VLAN_TUNNEL_ID */ + nla_total_size(sizeof(u16)) + /* IFLA_BRIDGE_VLAN_TUNNEL_VID */ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */ +} + +static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr, + struct net_bridge_vlan *v_last) +{ + __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id); + __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id); + + return (be32_to_cpu(tunid_curr) - be32_to_cpu(tunid_last)) == 1; +} + +static int __get_num_vlan_tunnel_infos(struct net_bridge_vlan_group *vg) +{ + struct net_bridge_vlan *v, *vtbegin = NULL, *vtend = NULL; + int num_tinfos = 0; + + /* Count number of vlan infos */ + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { + /* only a context, bridge vlan not activated */ + if (!br_vlan_should_use(v) || !v->tinfo.tunnel_id) + continue; + + if (!vtbegin) { + goto initvars; + } else if ((v->vid - vtend->vid) == 1 && + vlan_tunid_inrange(v, vtend)) { + vtend = v; + continue; + } else { + if ((vtend->vid - vtbegin->vid) > 0) + num_tinfos += 2; + else + num_tinfos += 1; + } +initvars: + vtbegin = v; + vtend = v; + } + + if (vtbegin && vtend) { + if ((vtend->vid - vtbegin->vid) > 0) + num_tinfos += 2; + else + num_tinfos += 1; + } + + return num_tinfos; +} + +int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg) +{ + int num_tinfos; + + if (!vg) + return 0; + + rcu_read_lock(); + num_tinfos = __get_num_vlan_tunnel_infos(vg); + rcu_read_unlock(); + + return num_tinfos * __get_vlan_tinfo_size(); +} + +static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid, + __be64 tunnel_id, u16 flags) +{ + __be32 tid = tunnel_id_to_key32(tunnel_id); + struct nlattr *tmap; + + tmap = nla_nest_start(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO); + if (!tmap) + return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID, + be32_to_cpu(tid))) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_VID, + vid)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_BRIDGE_VLAN_TUNNEL_FLAGS, + flags)) + goto nla_put_failure; + nla_nest_end(skb, tmap); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, tmap); + + return -EMSGSIZE; +} + +static int br_fill_vlan_tinfo_range(struct sk_buff *skb, + struct net_bridge_vlan *vtbegin, + struct net_bridge_vlan *vtend) +{ + int err; + + if (vtend && (vtend->vid - vtbegin->vid) > 0) { + /* add range to skb */ + err = br_fill_vlan_tinfo(skb, vtbegin->vid, + vtbegin->tinfo.tunnel_id, + BRIDGE_VLAN_INFO_RANGE_BEGIN); + if (err) + return err; + + err = br_fill_vlan_tinfo(skb, vtend->vid, + vtend->tinfo.tunnel_id, + BRIDGE_VLAN_INFO_RANGE_END); + if (err) + return err; + } else { + err = br_fill_vlan_tinfo(skb, vtbegin->vid, + vtbegin->tinfo.tunnel_id, + 0); + if (err) + return err; + } + + return 0; +} + +int br_fill_vlan_tunnel_info(struct sk_buff *skb, + struct net_bridge_vlan_group *vg) +{ + struct net_bridge_vlan *vtbegin = NULL; + struct net_bridge_vlan *vtend = NULL; + struct net_bridge_vlan *v; + int err; + + /* Count number of vlan infos */ + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { + /* only a 
context, bridge vlan not activated */ + if (!br_vlan_should_use(v)) + continue; + + if (!v->tinfo.tunnel_dst) + continue; + + if (!vtbegin) { + goto initvars; + } else if ((v->vid - vtend->vid) == 1 && + vlan_tunid_inrange(v, vtend)) { + vtend = v; + continue; + } else { + err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend); + if (err) + return err; + } +initvars: + vtbegin = v; + vtend = v; + } + + if (vtbegin) { + err = br_fill_vlan_tinfo_range(skb, vtbegin, vtend); + if (err) + return err; + } + + return 0; +} + +static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = { + [IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 }, + [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 }, +}; + +static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd, + u16 vid, u32 tun_id) +{ + int err = 0; + + if (!p) + return -EINVAL; + + switch (cmd) { + case RTM_SETLINK: + err = nbp_vlan_tunnel_info_add(p, vid, tun_id); + break; + case RTM_DELLINK: + nbp_vlan_tunnel_info_delete(p, vid); + break; + } + + return err; +} + +int br_parse_vlan_tunnel_info(struct nlattr *attr, + struct vtunnel_info *tinfo) +{ + struct nlattr *tb[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1]; + u32 tun_id; + u16 vid, flags = 0; + int err; + + memset(tinfo, 0, sizeof(*tinfo)); + + err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX, + attr, vlan_tunnel_policy); + if (err < 0) + return err; + + if (!tb[IFLA_BRIDGE_VLAN_TUNNEL_ID] || + !tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]) + return -EINVAL; + + tun_id = nla_get_u32(tb[IFLA_BRIDGE_VLAN_TUNNEL_ID]); + vid = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_VID]); + if (vid >= VLAN_VID_MASK) + return -ERANGE; + + if (tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]) + flags = nla_get_u16(tb[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS]); + + tinfo->tunid = tun_id; + tinfo->vid = vid; + tinfo->flags = flags; + + return 0; +} + +int br_process_vlan_tunnel_info(struct net_bridge *br, + struct net_bridge_port *p, int cmd, + struct vtunnel_info *tinfo_curr, + struct vtunnel_info *tinfo_last) +{ + int err; + + if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { + if (tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) + return -EINVAL; + memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info)); + } else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) { + int t, v; + + if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN)) + return -EINVAL; + if ((tinfo_curr->vid - tinfo_last->vid) != + (tinfo_curr->tunid - tinfo_last->tunid)) + return -EINVAL; + t = tinfo_last->tunid; + for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) { + err = br_vlan_tunnel_info(p, cmd, v, t); + if (err) + return err; + t++; + } + memset(tinfo_last, 0, sizeof(struct vtunnel_info)); + memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); + } else { + if (tinfo_last->flags) + return -EINVAL; + err = br_vlan_tunnel_info(p, cmd, tinfo_curr->vid, + tinfo_curr->tunid); + if (err) + return err; + memset(tinfo_last, 0, sizeof(struct vtunnel_info)); + memset(tinfo_curr, 0, sizeof(struct vtunnel_info)); + } + + return 0; +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8ce621e8345c..2288fca7756c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -91,6 +91,11 @@ struct br_vlan_stats { struct u64_stats_sync syncp; }; +struct br_tunnel_info { + __be64 tunnel_id; + struct metadata_dst *tunnel_dst; +}; + /** * struct net_bridge_vlan - per-vlan entry * @@ -113,6 +118,7 @@ struct br_vlan_stats { */ struct net_bridge_vlan { struct rhash_head vnode; + 
struct rhash_head tnode; u16 vid; u16 flags; struct br_vlan_stats __percpu *stats; @@ -124,6 +130,9 @@ struct net_bridge_vlan { atomic_t refcnt; struct net_bridge_vlan *brvlan; }; + + struct br_tunnel_info tinfo; + struct list_head vlist; struct rcu_head rcu; @@ -145,24 +154,27 @@ struct net_bridge_vlan { */ struct net_bridge_vlan_group { struct rhashtable vlan_hash; + struct rhashtable tunnel_hash; struct list_head vlan_list; u16 num_vlans; u16 pvid; }; -struct net_bridge_fdb_entry -{ +struct net_bridge_fdb_entry { struct hlist_node hlist; struct net_bridge_port *dst; - unsigned long updated; - unsigned long used; mac_addr addr; __u16 vlan_id; unsigned char is_local:1, is_static:1, added_by_user:1, added_by_external_learn:1; + + /* write-heavy members should not affect lookups */ + unsigned long updated ____cacheline_aligned_in_smp; + unsigned long used; + struct rcu_head rcu; }; @@ -177,6 +189,7 @@ struct net_bridge_port_group { struct timer_list timer; struct br_ip addr; unsigned char flags; + unsigned char eth_addr[ETH_ALEN]; }; struct net_bridge_mdb_entry @@ -201,12 +214,16 @@ struct net_bridge_mdb_htable u32 ver; }; -struct net_bridge_port -{ +struct net_bridge_port { struct net_bridge *br; struct net_device *dev; struct list_head list; + unsigned long flags; +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + struct net_bridge_vlan_group __rcu *vlgrp; +#endif + /* STP */ u8 priority; u8 state; @@ -227,8 +244,6 @@ struct net_bridge_port struct kobject kobj; struct rcu_head rcu; - unsigned long flags; - #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_own_query ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) @@ -248,9 +263,6 @@ struct net_bridge_port #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif -#ifdef CONFIG_BRIDGE_VLAN_FILTERING - struct net_bridge_vlan_group __rcu *vlgrp; -#endif #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif @@ -272,14 +284,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device * rtnl_dereference(dev->rx_handler_data) : NULL; } -struct net_bridge -{ +struct net_bridge { spinlock_t lock; + spinlock_t hash_lock; struct list_head port_list; struct net_device *dev; - struct pcpu_sw_netstats __percpu *stats; - spinlock_t hash_lock; + /* These fields are accessed on each packet */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + u8 vlan_enabled; + u8 vlan_stats_enabled; + __be16 vlan_proto; + u16 default_pvid; + struct net_bridge_vlan_group __rcu *vlgrp; +#endif + struct hlist_head hash[BR_HASH_SIZE]; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) union { @@ -297,6 +316,9 @@ struct net_bridge bridge_id designated_root; bridge_id bridge_id; u32 root_path_cost; + unsigned char topology_change; + unsigned char topology_change_detected; + u16 root_port; unsigned long max_age; unsigned long hello_time; unsigned long forward_delay; @@ -308,7 +330,6 @@ struct net_bridge u8 group_addr[ETH_ALEN]; bool group_addr_set; - u16 root_port; enum { BR_NO_STP, /* no spanning tree */ @@ -316,9 +337,6 @@ struct net_bridge BR_USER_STP, /* new RSTP in userspace */ } stp_enabled; - unsigned char topology_change; - unsigned char topology_change_detected; - #ifdef CONFIG_BRIDGE_IGMP_SNOOPING unsigned char multicast_router; @@ -363,21 +381,13 @@ struct net_bridge struct timer_list hello_timer; struct timer_list tcn_timer; struct timer_list topology_change_timer; - struct timer_list gc_timer; + struct delayed_work gc_work; struct kobject *ifobj; u32 auto_cnt; #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif - -#ifdef CONFIG_BRIDGE_VLAN_FILTERING - struct 
net_bridge_vlan_group __rcu *vlgrp; - u8 vlan_enabled; - u8 vlan_stats_enabled; - __be16 vlan_proto; - u16 default_pvid; -#endif }; struct br_input_skb_cb { @@ -494,11 +504,12 @@ void br_fdb_find_delete_local(struct net_bridge *br, const unsigned char *addr, u16 vid); void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); -void br_fdb_cleanup(unsigned long arg); +void br_fdb_cleanup(struct work_struct *work); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); -struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, - const unsigned char *addr, __u16 vid); +struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, + const unsigned char *addr, + __u16 vid); int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, unsigned long off); @@ -520,6 +531,15 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); +static inline bool br_hash_lock_held(struct net_bridge *br) +{ +#ifdef CONFIG_LOCKDEP + return lockdep_is_held(&br->hash_lock); +#else + return true; +#endif +} + /* br_forward.c */ enum br_pkt_type { BR_PKT_UNICAST, @@ -599,7 +619,7 @@ void br_multicast_free_pg(struct rcu_head *head); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, - unsigned char flags); + unsigned char flags, const unsigned char *src); void br_mdb_init(void); void br_mdb_uninit(void); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, @@ -764,6 +784,7 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); struct sk_buff *br_handle_vlan(struct net_bridge *br, + const struct net_bridge_port *port, struct net_bridge_vlan_group *vg, struct sk_buff *skb); int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags); @@ -863,6 +884,7 @@ static inline bool br_should_learn(struct net_bridge_port *p, } static inline struct sk_buff *br_handle_vlan(struct net_bridge *br, + const struct net_bridge_port *port, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h new file mode 100644 index 000000000000..4a447a378ab3 --- /dev/null +++ b/net/bridge/br_private_tunnel.h @@ -0,0 +1,83 @@ +/* + * Bridge per vlan tunnels + * + * Authors: + * Roopa Prabhu <roopa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _BR_PRIVATE_TUNNEL_H +#define _BR_PRIVATE_TUNNEL_H + +struct vtunnel_info { + u32 tunid; + u16 vid; + u16 flags; +}; + +/* br_netlink_tunnel.c */ +int br_parse_vlan_tunnel_info(struct nlattr *attr, + struct vtunnel_info *tinfo); +int br_process_vlan_tunnel_info(struct net_bridge *br, + struct net_bridge_port *p, + int cmd, + struct vtunnel_info *tinfo_curr, + struct vtunnel_info *tinfo_last); +int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg); +int br_fill_vlan_tunnel_info(struct sk_buff *skb, + struct net_bridge_vlan_group *vg); + +#ifdef CONFIG_BRIDGE_VLAN_FILTERING +/* br_vlan_tunnel.c */ +int vlan_tunnel_init(struct net_bridge_vlan_group *vg); +void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg); +int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid); +int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id); +void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port); +void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *vlan); +int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_port *p, + struct net_bridge_vlan_group *vg); +int br_handle_egress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_vlan *vlan); +#else +static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg) +{ + return 0; +} + +static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, + u16 vid) +{ + return 0; +} + +static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, + u16 vid, u32 tun_id) +{ + return 0; +} + +static inline void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port) +{ +} + +static inline void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *vlan) +{ +} + +static inline int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_port *p, + struct net_bridge_vlan_group *vg) +{ + return 0; +} +#endif + +#endif diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 71fd1a4e63cc..8f56c2d1f1a7 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -602,7 +602,7 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) br->ageing_time = t; spin_unlock_bh(&br->lock); - mod_timer(&br->gc_timer, jiffies); + mod_delayed_work(system_long_wq, &br->gc_work, 0); return 0; } diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 6c1e21411125..08341d2aa9c9 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -57,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br) spin_lock_bh(&br->lock); if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, jiffies + br->hello_time); - mod_timer(&br->gc_timer, jiffies + HZ/10); + mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10); br_config_bpdu_generation(br); @@ -88,7 +88,7 @@ void br_stp_disable_bridge(struct net_bridge *br) del_timer_sync(&br->hello_timer); del_timer_sync(&br->topology_change_timer); del_timer_sync(&br->tcn_timer); - del_timer_sync(&br->gc_timer); + cancel_delayed_work_sync(&br->gc_work); } /* called under bridge lock */ diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 7ddb38e0a06e..c98b3e5c140a 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br) setup_timer(&br->topology_change_timer, br_topology_change_timer_expired, (unsigned long) br); - - setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br); } void br_stp_port_timer_init(struct 
net_bridge_port *p) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index a18148213b08..0f4034934d56 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -263,7 +263,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer)); + return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 8bd569695e76..05e8946ccc03 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -188,6 +188,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, store_multicast_router); BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); +BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UNICAST); #endif static const struct brport_attribute *brport_attrs[] = { @@ -214,6 +215,7 @@ static const struct brport_attribute *brport_attrs[] = { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, + &brport_attr_multicast_to_unicast, #endif &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index b6de4f457161..b838213c408e 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -5,6 +5,7 @@ #include <net/switchdev.h> #include "br_private.h" +#include "br_private_tunnel.h" static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, const void *ptr) @@ -310,6 +311,7 @@ static int __vlan_del(struct net_bridge_vlan *v) } if (masterv != v) { + vlan_tunnel_info_del(vg, v); rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); @@ -325,6 +327,7 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg) { WARN_ON(!list_empty(&vg->vlan_list)); rhashtable_destroy(&vg->vlan_hash); + vlan_tunnel_deinit(vg); kfree(vg); } @@ -338,6 +341,7 @@ static void __vlan_flush(struct net_bridge_vlan_group *vg) } struct sk_buff *br_handle_vlan(struct net_bridge *br, + const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { @@ -378,6 +382,12 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) skb->vlan_tci = 0; + + if (p && (p->flags & BR_VLAN_TUNNEL) && + br_handle_egress_vlan_tunnel(skb, v)) { + kfree_skb(skb); + return NULL; + } out: return skb; } @@ -613,6 +623,8 @@ int br_vlan_delete(struct net_bridge *br, u16 vid) br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); br_fdb_delete_by_port(br, NULL, vid, 0); + vlan_tunnel_info_del(vg, v); + return __vlan_del(v); } @@ -918,6 +930,9 @@ int br_vlan_init(struct net_bridge *br) ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; + ret = vlan_tunnel_init(vg); + if (ret) + goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; @@ -932,6 +947,8 @@ out: return ret; err_vlan_add: + vlan_tunnel_deinit(vg); +err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: kfree(vg); @@ -961,6 +978,9 @@ int nbp_vlan_init(struct net_bridge_port *p) ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; + ret = vlan_tunnel_init(vg); + if (ret) + goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); rcu_assign_pointer(p->vlgrp, vg); if (p->br->default_pvid) { @@ -976,9 +996,11 @@ out: err_vlan_add: 
RCU_INIT_POINTER(p->vlgrp, NULL); synchronize_rcu(); + vlan_tunnel_deinit(vg); +err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); -err_vlan_enabled: err_rhtbl: +err_vlan_enabled: kfree(vg); goto out; diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c new file mode 100644 index 000000000000..6d2c4eed2dc8 --- /dev/null +++ b/net/bridge/br_vlan_tunnel.c @@ -0,0 +1,205 @@ +/* + * Bridge per vlan tunnel port dst_metadata handling code + * + * Authors: + * Roopa Prabhu <roopa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> +#include <net/switchdev.h> +#include <net/dst_metadata.h> + +#include "br_private.h" +#include "br_private_tunnel.h" + +static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct net_bridge_vlan *vle = ptr; + __be64 tunid = *(__be64 *)arg->key; + + return vle->tinfo.tunnel_id != tunid; +} + +static const struct rhashtable_params br_vlan_tunnel_rht_params = { + .head_offset = offsetof(struct net_bridge_vlan, tnode), + .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id), + .key_len = sizeof(__be64), + .nelem_hint = 3, + .locks_mul = 1, + .obj_cmpfn = br_vlan_tunid_cmp, + .automatic_shrinking = true, +}; + +static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl, + u64 tunnel_id) +{ + return rhashtable_lookup_fast(tbl, &tunnel_id, + br_vlan_tunnel_rht_params); +} + +void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *vlan) +{ + if (!vlan->tinfo.tunnel_dst) + return; + rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode, + br_vlan_tunnel_rht_params); + vlan->tinfo.tunnel_id = 0; + dst_release(&vlan->tinfo.tunnel_dst->dst); + vlan->tinfo.tunnel_dst = NULL; +} + +static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *vlan, u32 tun_id) +{ + struct metadata_dst *metadata = NULL; + __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); + int err; + + if (vlan->tinfo.tunnel_dst) + return -EEXIST; + + metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, + key, 0); + if (!metadata) + return -EINVAL; + + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; + vlan->tinfo.tunnel_dst = metadata; + vlan->tinfo.tunnel_id = key; + + err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode, + br_vlan_tunnel_rht_params); + if (err) + goto out; + + return 0; +out: + dst_release(&vlan->tinfo.tunnel_dst->dst); + vlan->tinfo.tunnel_dst = NULL; + vlan->tinfo.tunnel_id = 0; + + return err; +} + +/* Must be protected by RTNL. + * Must be called with vid in range from 1 to 4094 inclusive. + */ +int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + ASSERT_RTNL(); + + vg = nbp_vlan_group(port); + vlan = br_vlan_find(vg, vid); + if (!vlan) + return -EINVAL; + + return __vlan_tunnel_info_add(vg, vlan, tun_id); +} + +/* Must be protected by RTNL. + * Must be called with vid in range from 1 to 4094 inclusive. 
+ */ +int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + + ASSERT_RTNL(); + + vg = nbp_vlan_group(port); + v = br_vlan_find(vg, vid); + if (!v) + return -ENOENT; + + vlan_tunnel_info_del(vg, v); + + return 0; +} + +static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg) +{ + struct net_bridge_vlan *vlan, *tmp; + + list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) + vlan_tunnel_info_del(vg, vlan); +} + +void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port) +{ + struct net_bridge_vlan_group *vg; + + ASSERT_RTNL(); + + vg = nbp_vlan_group(port); + __vlan_tunnel_info_flush(vg); +} + +int vlan_tunnel_init(struct net_bridge_vlan_group *vg) +{ + return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params); +} + +void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg) +{ + rhashtable_destroy(&vg->tunnel_hash); +} + +int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_port *p, + struct net_bridge_vlan_group *vg) +{ + struct ip_tunnel_info *tinfo = skb_tunnel_info(skb); + struct net_bridge_vlan *vlan; + + if (!vg || !tinfo) + return 0; + + /* if already tagged, ignore */ + if (skb_vlan_tagged(skb)) + return 0; + + /* lookup vid, given tunnel id */ + vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id); + if (!vlan) + return 0; + + skb_dst_drop(skb); + + __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid); + + return 0; +} + +int br_handle_egress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_vlan *vlan) +{ + int err; + + if (!vlan || !vlan->tinfo.tunnel_id) + return 0; + + if (unlikely(!skb_vlan_tag_present(skb))) + return 0; + + skb_dst_drop(skb); + err = skb_vlan_pop(skb); + if (err) + return err; + + skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst)); + + return 0; +} diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c index 9024283d2bca..279527f8b1fe 100644 --- a/net/bridge/netfilter/ebt_among.c +++ b/net/bridge/netfilter/ebt_among.c @@ -187,7 +187,7 @@ static int ebt_among_mt_check(const struct xt_mtchk_param *par) expected_length += ebt_mac_wormhash_size(wh_src); if (em->match_size != EBT_ALIGN(expected_length)) { - pr_info("wrong size: %d against expected %d, rounded to %Zd\n", + pr_info("wrong size: %d against expected %d, rounded to %zd\n", em->match_size, expected_length, EBT_ALIGN(expected_length)); return -EINVAL; diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c index 517e78befcb2..61a9f1be1263 100644 --- a/net/bridge/netfilter/ebt_limit.c +++ b/net/bridge/netfilter/ebt_limit.c @@ -105,6 +105,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = { .match = ebt_limit_mt, .checkentry = ebt_limit_mt_check, .matchsize = sizeof(struct ebt_limit_info), + .usersize = offsetof(struct ebt_limit_info, prev), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct ebt_compat_limit_info), #endif diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index e88bd4827ac1..98b9c8e8615e 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -78,7 +78,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, unsigned int bitmask; /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net)) + if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) return; spin_lock_bh(&ebt_log_lock); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c 
index 537e3d506fc2..79b69917f521 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1346,56 +1346,72 @@ static int update_counters(struct net *net, const void __user *user, hlp.num_counters, user, len); } -static inline int ebt_make_matchname(const struct ebt_entry_match *m, - const char *base, char __user *ubase) +static inline int ebt_obj_to_user(char __user *um, const char *_name, + const char *data, int entrysize, + int usersize, int datasize) { - char __user *hlp = ubase + ((char *)m - base); - char name[EBT_FUNCTION_MAXNAMELEN] = {}; + char name[EBT_FUNCTION_MAXNAMELEN] = {0}; /* ebtables expects 32 bytes long names but xt_match names are 29 bytes * long. Copy 29 bytes and fill remaining bytes with zeroes. */ - strlcpy(name, m->u.match->name, sizeof(name)); - if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) + strlcpy(name, _name, sizeof(name)); + if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) || + put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) || + xt_data_to_user(um + entrysize, data, usersize, datasize)) return -EFAULT; + return 0; } -static inline int ebt_make_watchername(const struct ebt_entry_watcher *w, - const char *base, char __user *ubase) +static inline int ebt_match_to_user(const struct ebt_entry_match *m, + const char *base, char __user *ubase) { - char __user *hlp = ubase + ((char *)w - base); - char name[EBT_FUNCTION_MAXNAMELEN] = {}; + return ebt_obj_to_user(ubase + ((char *)m - base), + m->u.match->name, m->data, sizeof(*m), + m->u.match->usersize, m->match_size); +} - strlcpy(name, w->u.watcher->name, sizeof(name)); - if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) - return -EFAULT; - return 0; +static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w, + const char *base, char __user *ubase) +{ + return ebt_obj_to_user(ubase + ((char *)w - base), + w->u.watcher->name, w->data, sizeof(*w), + w->u.watcher->usersize, w->watcher_size); } -static inline int ebt_make_names(struct ebt_entry *e, const char *base, - char __user *ubase) +static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, + char __user *ubase) { int ret; char __user *hlp; const struct ebt_entry_target *t; - char name[EBT_FUNCTION_MAXNAMELEN] = {}; - if (e->bitmask == 0) + if (e->bitmask == 0) { + /* special case !EBT_ENTRY_OR_ENTRIES */ + if (copy_to_user(ubase + ((char *)e - base), e, + sizeof(struct ebt_entries))) + return -EFAULT; return 0; + } + + if (copy_to_user(ubase + ((char *)e - base), e, sizeof(*e))) + return -EFAULT; hlp = ubase + (((char *)e + e->target_offset) - base); t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); - ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase); + ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase); if (ret != 0) return ret; - ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); + ret = EBT_WATCHER_ITERATE(e, ebt_watcher_to_user, base, ubase); if (ret != 0) return ret; - strlcpy(name, t->u.target->name, sizeof(name)); - if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) - return -EFAULT; + ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t), + t->u.target->usersize, t->target_size); + if (ret != 0) + return ret; + return 0; } @@ -1475,13 +1491,9 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user, if (ret) return ret; - if (copy_to_user(tmp.entries, entries, entries_size)) { - BUGPRINT("Couldn't copy entries to userspace\n"); - return -EFAULT; - } /* set the match/watcher/target 
names right */ return EBT_ENTRY_ITERATE(entries, entries_size, - ebt_make_names, entries, tmp.entries); + ebt_entry_to_user, entries, tmp.entries); } static int do_ebt_set_ctl(struct sock *sk, @@ -1630,8 +1642,10 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, if (match->compat_to_user) { if (match->compat_to_user(cm->data, m->data)) return -EFAULT; - } else if (copy_to_user(cm->data, m->data, msize)) + } else { + if (xt_data_to_user(cm->data, m->data, match->usersize, msize)) return -EFAULT; + } *size -= ebt_compat_entry_padsize() + off; *dstptr = cm->data; @@ -1657,8 +1671,10 @@ static int compat_target_to_user(struct ebt_entry_target *t, if (target->compat_to_user) { if (target->compat_to_user(cm->data, t->data)) return -EFAULT; - } else if (copy_to_user(cm->data, t->data, tsize)) - return -EFAULT; + } else { + if (xt_data_to_user(cm->data, t->data, target->usersize, tsize)) + return -EFAULT; + } *size -= ebt_compat_entry_padsize() + off; *dstptr = cm->data; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 3408ed51b611..1816fc9f1ee7 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -44,7 +44,6 @@ enum caif_states { struct chnl_net { struct cflayer chnl; - struct net_device_stats stats; struct caif_connect_request conn_req; struct list_head list_field; struct net_device *netdev; diff --git a/net/can/af_can.c b/net/can/af_can.c index 1108079d934f..5488e4a6ccd0 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -445,6 +445,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, * @func: callback function on filter match * @data: returned parameter for callback function * @ident: string for calling module identification + * @sk: socket pointer (might be NULL) * * Description: * Invokes the callback function with the received sk_buff and the given @@ -468,7 +469,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, */ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, - char *ident) + char *ident, struct sock *sk) { struct receiver *r; struct hlist_head *rl; @@ -496,6 +497,7 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, r->func = func; r->data = data; r->ident = ident; + r->sk = sk; hlist_add_head_rcu(&r->list, rl); d->entries++; @@ -520,8 +522,11 @@ EXPORT_SYMBOL(can_rx_register); static void can_rx_delete_receiver(struct rcu_head *rp) { struct receiver *r = container_of(rp, struct receiver, rcu); + struct sock *sk = r->sk; kmem_cache_free(rcv_cache, r); + if (sk) + sock_put(sk); } /** @@ -596,8 +601,11 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, spin_unlock(&can_rcvlists_lock); /* schedule the receiver item for deletion */ - if (r) + if (r) { + if (r->sk) + sock_hold(r->sk); call_rcu(&r->rcu, can_rx_delete_receiver); + } } EXPORT_SYMBOL(can_rx_unregister); diff --git a/net/can/af_can.h b/net/can/af_can.h index fca0fe9fc45a..b86f5129e838 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -50,13 +50,14 @@ struct receiver { struct hlist_node list; - struct rcu_head rcu; canid_t can_id; canid_t mask; unsigned long matches; void (*func)(struct sk_buff *, void *); void *data; char *ident; + struct sock *sk; + struct rcu_head rcu; }; #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) diff --git a/net/can/bcm.c b/net/can/bcm.c index 21ac75390e3d..95d13b233c65 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -734,14 +734,23 @@ static struct 
bcm_op *bcm_find_op(struct list_head *ops, static void bcm_remove_op(struct bcm_op *op) { - hrtimer_cancel(&op->timer); - hrtimer_cancel(&op->thrtimer); - - if (op->tsklet.func) - tasklet_kill(&op->tsklet); + if (op->tsklet.func) { + while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) || + test_bit(TASKLET_STATE_RUN, &op->tsklet.state) || + hrtimer_active(&op->timer)) { + hrtimer_cancel(&op->timer); + tasklet_kill(&op->tsklet); + } + } - if (op->thrtsklet.func) - tasklet_kill(&op->thrtsklet); + if (op->thrtsklet.func) { + while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) || + test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) || + hrtimer_active(&op->thrtimer)) { + hrtimer_cancel(&op->thrtimer); + tasklet_kill(&op->thrtsklet); + } + } if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); @@ -1216,7 +1225,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, err = can_rx_register(dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, - "bcm"); + "bcm", sk); op->rx_reg_dev = dev; dev_put(dev); @@ -1225,7 +1234,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } else err = can_rx_register(NULL, op->can_id, REGMASK(op->can_id), - bcm_rx_handler, op, "bcm"); + bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ list_del(&op->list); diff --git a/net/can/gw.c b/net/can/gw.c index a54ab0c82104..7056a1a2bb70 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -442,7 +442,7 @@ static inline int cgw_register_filter(struct cgw_job *gwj) { return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, - gwj, "gw"); + gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct cgw_job *gwj) diff --git a/net/can/raw.c b/net/can/raw.c index b075f028d7e2..6dc546a06673 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -190,7 +190,7 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk, for (i = 0; i < count; i++) { err = can_rx_register(dev, filter[i].can_id, filter[i].can_mask, - raw_rcv, sk, "raw"); + raw_rcv, sk, "raw", sk); if (err) { /* clean up successfully registered filters */ while (--i >= 0) @@ -211,7 +211,7 @@ static int raw_enable_errfilter(struct net_device *dev, struct sock *sk, if (err_mask) err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG, - raw_rcv, sk, "raw"); + raw_rcv, sk, "raw", sk); return err; } diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 50f040fdb2a9..b9233b990399 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc, dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", __func__, lock_name, type, cookie, tag, desc, flags); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - lock_op_page, lock_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, lock_op_page, + lock_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(lock_op_page); @@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc, dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - unlock_op_page, unlock_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, unlock_op_page, + unlock_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(unlock_op_page); @@ -170,8 +170,8 @@ int 
ceph_cls_break_lock(struct ceph_osd_client *osdc, dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, cookie, ENTITY_NAME(*locker)); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - break_op_page, break_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, break_op_page, + break_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(break_op_page); @@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc, int get_info_op_buf_size; int name_len = strlen(lock_name); struct page *get_info_op_page, *reply_page; - size_t reply_len; + size_t reply_len = PAGE_SIZE; void *p, *end; int ret; diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 80d7c3a97cb8..5bf94c04f645 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) { kfree(b->item_weights); kfree(b->sum_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b->node_weights); kfree(b); @@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) { kfree(b->straws); kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 130ab407c5ec..b5cd8c21bfdf 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size return -1; } - /* * bucket choose methods * @@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size * Since this is expensive, we optimize for the r=0 case, which * captures the vast majority of calls. 
*/ -static int bucket_perm_choose(struct crush_bucket *bucket, +static int bucket_perm_choose(const struct crush_bucket *bucket, + struct crush_work_bucket *work, int x, int r) { unsigned int pr = r % bucket->size; unsigned int i, s; /* start a new permutation if @x has changed */ - if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { + if (work->perm_x != (__u32)x || work->perm_n == 0) { dprintk("bucket %d new x=%d\n", bucket->id, x); - bucket->perm_x = x; + work->perm_x = x; /* optimize common r=0 case */ if (pr == 0) { s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % bucket->size; - bucket->perm[0] = s; - bucket->perm_n = 0xffff; /* magic value, see below */ + work->perm[0] = s; + work->perm_n = 0xffff; /* magic value, see below */ goto out; } for (i = 0; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm_n = 0; - } else if (bucket->perm_n == 0xffff) { + work->perm[i] = i; + work->perm_n = 0; + } else if (work->perm_n == 0xffff) { /* clean up after the r=0 case above */ for (i = 1; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm[bucket->perm[0]] = 0; - bucket->perm_n = 1; + work->perm[i] = i; + work->perm[work->perm[0]] = 0; + work->perm_n = 1; } /* calculate permutation up to pr */ - for (i = 0; i < bucket->perm_n; i++) - dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); - while (bucket->perm_n <= pr) { - unsigned int p = bucket->perm_n; + for (i = 0; i < work->perm_n; i++) + dprintk(" perm_choose have %d: %d\n", i, work->perm[i]); + while (work->perm_n <= pr) { + unsigned int p = work->perm_n; /* no point in swapping the final entry */ if (p < bucket->size - 1) { i = crush_hash32_3(bucket->hash, x, bucket->id, p) % (bucket->size - p); if (i) { - unsigned int t = bucket->perm[p + i]; - bucket->perm[p + i] = bucket->perm[p]; - bucket->perm[p] = t; + unsigned int t = work->perm[p + i]; + work->perm[p + i] = work->perm[p]; + work->perm[p] = t; } dprintk(" perm_choose swap %d with %d\n", p, p+i); } - bucket->perm_n++; + work->perm_n++; } for (i = 0; i < bucket->size; i++) - dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); + dprintk(" perm_choose %d: %d\n", i, work->perm[i]); - s = bucket->perm[pr]; + s = work->perm[pr]; out: dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, bucket->size, x, r, pr, s); @@ -132,14 +132,14 @@ out: } /* uniform */ -static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, - int x, int r) +static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket, + struct crush_work_bucket *work, int x, int r) { - return bucket_perm_choose(&bucket->h, x, r); + return bucket_perm_choose(&bucket->h, work, x, r); } /* list */ -static int bucket_list_choose(struct crush_bucket_list *bucket, +static int bucket_list_choose(const struct crush_bucket_list *bucket, int x, int r) { int i; @@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, w *= bucket->sum_weights[i]; w = w >> 16; /*dprintk(" scaled %llx\n", w);*/ - if (w < bucket->item_weights[i]) + if (w < bucket->item_weights[i]) { return bucket->h.items[i]; + } } dprintk("bad list sums for bucket %d\n", bucket->h.id); @@ -192,7 +193,7 @@ static int terminal(int x) return x & 1; } -static int bucket_tree_choose(struct crush_bucket_tree *bucket, +static int bucket_tree_choose(const struct crush_bucket_tree *bucket, int x, int r) { int n; @@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, /* straw */ -static int bucket_straw_choose(struct crush_bucket_straw *bucket, +static int 
bucket_straw_choose(const struct crush_bucket_straw *bucket, int x, int r) { __u32 i; @@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin) * */ -static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, +static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, int x, int r) { unsigned int i, high = 0; @@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, high_draw = draw; } } + return bucket->h.items[high]; } -static int crush_bucket_choose(struct crush_bucket *in, int x, int r) +static int crush_bucket_choose(const struct crush_bucket *in, + struct crush_work_bucket *work, + int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); BUG_ON(in->size == 0); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: - return bucket_uniform_choose((struct crush_bucket_uniform *)in, - x, r); + return bucket_uniform_choose( + (const struct crush_bucket_uniform *)in, + work, x, r); case CRUSH_BUCKET_LIST: - return bucket_list_choose((struct crush_bucket_list *)in, + return bucket_list_choose((const struct crush_bucket_list *)in, x, r); case CRUSH_BUCKET_TREE: - return bucket_tree_choose((struct crush_bucket_tree *)in, + return bucket_tree_choose((const struct crush_bucket_tree *)in, x, r); case CRUSH_BUCKET_STRAW: - return bucket_straw_choose((struct crush_bucket_straw *)in, - x, r); + return bucket_straw_choose( + (const struct crush_bucket_straw *)in, + x, r); case CRUSH_BUCKET_STRAW2: - return bucket_straw2_choose((struct crush_bucket_straw2 *)in, - x, r); + return bucket_straw2_choose( + (const struct crush_bucket_straw2 *)in, + x, r); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } - /* * true if device is marked "out" (failed, fully offloaded) * of the cluster @@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map, * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, @@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map, int rep; unsigned int ftotal, flocal; int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int r; int i; int item = 0; @@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map, if (local_fallback_retries > 0 && flocal >= (in->size>>1) && flocal > local_fallback_retries) - item = bucket_perm_choose(in, x, r); + item = bucket_perm_choose( + in, work->work[-1-in->id], + x, r); else - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); skip_rep = 1; @@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map, sub_r = r >> (vary_r-1); else sub_r = 0; - if (crush_choose_firstn(map, - map->buckets[-1-item], - weight, weight_max, - x, stable ? 1 : outpos+1, 0, - out2, outpos, count, - recurse_tries, 0, - local_retries, - local_fallback_retries, - 0, - vary_r, - stable, - NULL, - sub_r) <= outpos) + if (crush_choose_firstn( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, stable ? 
1 : outpos+1, 0, + out2, outpos, count, + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + vary_r, + stable, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map, } } - if (!reject) { + if (!reject && !collide) { /* out? */ if (itemtype == 0) reject = is_out(map, weight, weight_max, item, x); - else - reject = 0; } reject: @@ -600,7 +611,8 @@ reject: * */ static void crush_choose_indep(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int left, int numrep, int type, int *out, int outpos, @@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map, int *out2, int parent_r) { - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int endpos = outpos + left; int rep; unsigned int ftotal; @@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map, break; } - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); out[rep] = CRUSH_ITEM_NONE; @@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map, if (recurse_to_leaf) { if (item < 0) { - crush_choose_indep(map, - map->buckets[-1-item], - weight, weight_max, - x, 1, numrep, 0, - out2, rep, - recurse_tries, 0, - 0, NULL, r); + crush_choose_indep( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map, #endif } + +/* + * This takes a chunk of memory and sets it up to be a shiny new + * working area for a CRUSH placement computation. It must be called + * on any newly allocated memory before passing it in to + * crush_do_rule. It may be used repeatedly after that, so long as the + * map has not changed. If the map /has/ changed, you must make sure + * the working size is no smaller than what was allocated and re-run + * crush_init_workspace. + * + * If you do retain the working space between calls to crush, make it + * thread-local. + */ +void crush_init_workspace(const struct crush_map *map, void *v) +{ + struct crush_work *w = v; + __s32 b; + + /* + * We work by moving through the available space and setting + * values and pointers as we go. + * + * It's a bit like Forth's use of the 'allot' word since we + * set the pointer first and then reserve the space for it to + * point to by incrementing the point. 
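[Editor's aside, not part of the patch] The "allot"-style carving described in the comment above -- plant a pointer into one flat allocation, then bump the cursor past the space that pointer owns -- is easier to see stripped of the CRUSH details. The structs and names below are simplified stand-ins invented for illustration; only the walking pattern matches what crush_init_workspace() does with crush_work / crush_work_bucket.

struct item_ws {                        /* stand-in for crush_work_bucket */
	unsigned int perm_x, perm_n;
	unsigned int *perm;
};

struct flat_ws {                        /* stand-in for crush_work */
	struct item_ws **items;
};

/* Carve one flat buffer into a header, a pointer array and per-item
 * blocks, in exactly the order the sizes were summed up front.
 */
static void flat_ws_init(void *v, int nitems, int perm_len)
{
	struct flat_ws *w = v;
	int i;

	v += sizeof(*w);                          /* header */
	w->items = v;
	v += nitems * sizeof(struct item_ws *);   /* array of per-item pointers */
	for (i = 0; i < nitems; i++) {
		w->items[i] = v;                  /* per-item state ... */
		v += sizeof(struct item_ws);
		w->items[i]->perm_x = 0;
		w->items[i]->perm_n = 0;
		w->items[i]->perm = v;            /* ... followed by its own array */
		v += perm_len * sizeof(unsigned int);
	}
}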
+ */ + v += sizeof(struct crush_work *); + w->work = v; + v += map->max_buckets * sizeof(struct crush_work_bucket *); + for (b = 0; b < map->max_buckets; ++b) { + if (!map->buckets[b]) + continue; + + w->work[b] = v; + switch (map->buckets[b]->alg) { + default: + v += sizeof(struct crush_work_bucket); + break; + } + w->work[b]->perm_x = 0; + w->work[b]->perm_n = 0; + w->work[b]->perm = v; + v += map->buckets[b]->size * sizeof(__u32); + } + BUG_ON(v - (void *)w != map->working_size); +} + /** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map @@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map, * @result_max: maximum result size * @weight: weight vector (for map leaves) * @weight_max: size of weight vector - * @scratch: scratch vector for private use; must be >= 3 * result_max + * @cwin: pointer to at least crush_work_size() bytes of memory */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - int *scratch) + void *cwin) { int result_len; - int *a = scratch; - int *b = scratch + result_max; - int *c = scratch + result_max*2; + struct crush_work *cw = cwin; + int *a = cwin + map->working_size; + int *b = a + result_max; + int *c = b + result_max; + int *w = a; + int *o = b; int recurse_to_leaf; - int *w; int wsize = 0; - int *o; int osize; int *tmp; - struct crush_rule *rule; + const struct crush_rule *rule; __u32 step; int i, j; int numrep; @@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map, rule = map->rules[ruleno]; result_len = 0; - w = a; - o = b; for (step = 0; step < rule->len; step++) { int firstn = 0; - struct crush_rule_step *curstep = &rule->steps[step]; + const struct crush_rule_step *curstep = &rule->steps[step]; switch (curstep->op) { case CRUSH_RULE_TAKE: @@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, + cw, map->buckets[bno], weight, weight_max, x, numrep, @@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, + cw, map->buckets[bno], weight, weight_max, x, out_size, numrep, @@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map, break; } } + return result_len; } diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 3949ce70be07..85747b7f91a9 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -3,6 +3,7 @@ #include <linux/err.h> #include <linux/scatterlist.h> +#include <linux/sched.h> #include <linux/slab.h> #include <crypto/aes.h> #include <crypto/skcipher.h> @@ -214,7 +215,7 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, SKCIPHER_REQUEST_ON_STACK(req, key->tfm); struct sg_table sgt; struct scatterlist prealloc_sg; - char iv[AES_BLOCK_SIZE]; + char iv[AES_BLOCK_SIZE] __aligned(8); int pad_byte = AES_BLOCK_SIZE - (in_len & (AES_BLOCK_SIZE - 1)); int crypt_len = encrypt ? 
in_len + pad_byte : in_len; int ret; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 770c52701efa..bad3d4ae43f6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3425,7 +3425,7 @@ static void ceph_msg_release(struct kref *kref) struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, - atomic_read(&msg->kref.refcount)); + kref_read(&msg->kref)); kref_get(&msg->kref); return msg; } @@ -3434,7 +3434,7 @@ EXPORT_SYMBOL(ceph_msg_get); void ceph_msg_put(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, - atomic_read(&msg->kref.refcount)); + kref_read(&msg->kref)); kref_put(&msg->kref, ceph_msg_release); } EXPORT_SYMBOL(ceph_msg_put); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 842f049abb86..b65bbf9f45eb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -438,7 +438,7 @@ static void ceph_osdc_release_request(struct kref *kref) void ceph_osdc_get_request(struct ceph_osd_request *req) { dout("%s %p (was %d)\n", __func__, req, - atomic_read(&req->r_kref.refcount)); + kref_read(&req->r_kref)); kref_get(&req->r_kref); } EXPORT_SYMBOL(ceph_osdc_get_request); @@ -447,7 +447,7 @@ void ceph_osdc_put_request(struct ceph_osd_request *req) { if (req) { dout("%s %p (was %d)\n", __func__, req, - atomic_read(&req->r_kref.refcount)); + kref_read(&req->r_kref)); kref_put(&req->r_kref, ceph_osdc_release_request); } } @@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req) kref_init(&req->r_kref); init_completion(&req->r_completion); - init_completion(&req->r_done_completion); RB_CLEAR_NODE(&req->r_node); RB_CLEAR_NODE(&req->r_mc_node); INIT_LIST_HEAD(&req->r_unsafe_item); @@ -487,11 +486,11 @@ static void request_reinit(struct ceph_osd_request *req) struct ceph_msg *reply_msg = req->r_reply; dout("%s req %p\n", __func__, req); - WARN_ON(atomic_read(&req->r_kref.refcount) != 1); + WARN_ON(kref_read(&req->r_kref) != 1); request_release_checks(req); - WARN_ON(atomic_read(&request_msg->kref.refcount) != 1); - WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1); + WARN_ON(kref_read(&request_msg->kref) != 1); + WARN_ON(kref_read(&reply_msg->kref) != 1); target_destroy(&req->r_t); request_init(req); @@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, BUG_ON(length > previous); op->extent.length = length; - op->indata_len -= previous - length; + if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) + op->indata_len -= previous - length; } EXPORT_SYMBOL(osd_req_op_extent_update); @@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) bool need_send = false; bool promoted = false; - WARN_ON(req->r_tid || req->r_got_reply); + WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: @@ -1704,17 +1704,10 @@ promote: static void account_request(struct ceph_osd_request *req) { - unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); + WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); - if (req->r_flags & CEPH_OSD_FLAG_READ) { - WARN_ON(req->r_flags & mask); - req->r_flags |= CEPH_OSD_FLAG_ACK; - } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) - WARN_ON(!(req->r_flags & mask)); - else - WARN_ON(1); - - WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); + req->r_flags |= CEPH_OSD_FLAG_ONDISK; atomic_inc(&req->r_osdc->num_requests); } @@ -1749,15 +1742,15 @@ static void 
finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - if (req->r_callback) + if (req->r_callback) { + dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + req->r_tid, req->r_callback, req->r_result); req->r_callback(req); - else - complete_all(&req->r_completion); + } } /* - * Note that this is open-coded in handle_reply(), which has to deal - * with ack vs commit, dup acks, etc. + * This is open-coded in handle_reply(). */ static void complete_request(struct ceph_osd_request *req, int err) { @@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err) req->r_result = err; finish_request(req); __complete_request(req); - complete_all(&req->r_done_completion); + complete_all(&req->r_completion); ceph_osdc_put_request(req); } @@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req) cancel_map_check(req); finish_request(req); - complete_all(&req->r_done_completion); + complete_all(&req->r_completion); ceph_osdc_put_request(req); } @@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req) mutex_lock(&lreq->lock); dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); - WARN_ON(!__linger_registered(lreq)); linger_reg_commit_complete(lreq, req->r_result); lreq->committed = true; @@ -2785,31 +2777,8 @@ e_inval: } /* - * We are done with @req if - * - @m is a safe reply, or - * - @m is an unsafe reply and we didn't want a safe one - */ -static bool done_request(const struct ceph_osd_request *req, - const struct MOSDOpReply *m) -{ - return (m->result < 0 || - (m->flags & CEPH_OSD_FLAG_ONDISK) || - !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); -} - -/* - * handle osd op reply. either call the callback if it is specified, - * or do the completion to wake up the waiting thread. - * - * ->r_unsafe_callback is set? yes no - * - * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, - * any or needed/got safe) r_done_completion r_done_completion - * - * first reply is unsafe r_unsafe_cb(true) (nothing) - * - * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, - * r_done_completion r_done_completion + * Handle MOSDOpReply. Set ->r_result and call the callback if it is + * specified. 
*/ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) { @@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) struct MOSDOpReply m; u64 tid = le64_to_cpu(msg->hdr.tid); u32 data_len = 0; - bool already_acked; int ret; int i; @@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) le32_to_cpu(msg->hdr.data_len), req->r_tid); goto fail_request; } - dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, - req, req->r_tid, req->r_got_reply, m.result, data_len); - - already_acked = req->r_got_reply; - if (!already_acked) { - req->r_result = m.result ?: data_len; - req->r_replay_version = m.replay_version; /* struct */ - req->r_got_reply = true; - } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { - dout("req %p tid %llu dup ack\n", req, req->r_tid); - goto out_unlock_session; - } - - if (done_request(req, &m)) { - finish_request(req); - if (req->r_linger) { - WARN_ON(req->r_unsafe_callback); - dout("req %p tid %llu cb (locked)\n", req, req->r_tid); - __complete_request(req); - } - } + dout("%s req %p tid %llu result %d data_len %u\n", __func__, + req, req->r_tid, m.result, data_len); + /* + * Since we only ever request ONDISK, we should only ever get + * one (type of) reply back. + */ + WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); + req->r_result = m.result ?: data_len; + finish_request(req); mutex_unlock(&osd->lock); up_read(&osdc->lock); - if (done_request(req, &m)) { - if (already_acked && req->r_unsafe_callback) { - dout("req %p tid %llu safe-cb\n", req, req->r_tid); - req->r_unsafe_callback(req, false); - } else if (!req->r_linger) { - dout("req %p tid %llu cb\n", req, req->r_tid); - __complete_request(req); - } - complete_all(&req->r_done_completion); - ceph_osdc_put_request(req); - } else { - if (req->r_unsafe_callback) { - dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); - req->r_unsafe_callback(req, true); - } else { - WARN_ON(1); - } - } - + __complete_request(req); + complete_all(&req->r_completion); + ceph_osdc_put_request(req); return; fail_request: @@ -3540,7 +3480,7 @@ again: up_read(&osdc->lock); dout("%s waiting on req %p tid %llu last_tid %llu\n", __func__, req, req->r_tid, last_tid); - wait_for_completion(&req->r_done_completion); + wait_for_completion(&req->r_completion); ceph_osdc_put_request(req); goto again; } @@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); - lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + lreq->t.flags = CEPH_OSD_FLAG_WRITE; lreq->mtime = CURRENT_TIME; lreq->reg_req = alloc_linger_request(lreq); @@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + req->r_flags = CEPH_OSD_FLAG_WRITE; req->r_mtime = CURRENT_TIME; osd_req_op_watch_init(req, 0, lreq->linger_id, CEPH_OSD_WATCH_OP_UNWATCH); @@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map); * Execute an OSD class method on an object. 
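[Editor's aside, not part of the patch] With the @resp_len change and the new -E2BIG guard in the ceph_osdc_call() hunk just below, *resp_len becomes an in/out parameter: callers prime it with the size of the reply buffer and read back the number of bytes actually returned. A minimal calling sketch, modelled on the ceph_cls_lock_info() hunk earlier in this patch; the "get_info" method, the CEPH_OSD_FLAG_READ flag and the helper name are assumptions for illustration only.

static int example_cls_call(struct ceph_osd_client *osdc,
			    struct ceph_object_id *oid,
			    struct ceph_object_locator *oloc,
			    struct page *req_page, size_t req_len)
{
	struct page *reply_page;
	size_t reply_len = PAGE_SIZE;	/* in: reply buffer size */
	int ret;

	reply_page = alloc_page(GFP_NOIO);
	if (!reply_page)
		return -ENOMEM;

	ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
			     CEPH_OSD_FLAG_READ, req_page, req_len,
			     reply_page, &reply_len);
	if (!ret)
		pr_debug("got %zu reply bytes\n", reply_len);	/* out: bytes returned */

	__free_page(reply_page);
	return ret;
}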
* * @flags: CEPH_OSD_FLAG_* - * @resp_len: out param for reply length + * @resp_len: in/out param for reply length */ int ceph_osdc_call(struct ceph_osd_client *osdc, struct ceph_object_id *oid, @@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, struct ceph_osd_request *req; int ret; + if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE)) + return -E2BIG; + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); if (!req) return -ENOMEM; @@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, 0, false, false); if (resp_page) osd_req_op_cls_response_data_pages(req, 0, &resp_page, - PAGE_SIZE, 0, false, false); + *resp_len, 0, false, false); ceph_osdc_start_request(osdc, req, false); ret = ceph_osdc_wait_request(osdc, req); @@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int page_align = off & ~PAGE_MASK; req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, true); if (IS_ERR(req)) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d2436880b305..6824c0ec8373 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -153,6 +153,32 @@ bad: return -EINVAL; } +static void crush_finalize(struct crush_map *c) +{ + __s32 b; + + /* Space for the array of pointers to per-bucket workspace */ + c->working_size = sizeof(struct crush_work) + + c->max_buckets * sizeof(struct crush_work_bucket *); + + for (b = 0; b < c->max_buckets; b++) { + if (!c->buckets[b]) + continue; + + switch (c->buckets[b]->alg) { + default: + /* + * The base case, permutation variables and + * the pointer to the permutation array. + */ + c->working_size += sizeof(struct crush_work_bucket); + break; + } + /* Every bucket has a permutation array. 
*/ + c->working_size += c->buckets[b]->size * sizeof(__u32); + } +} + static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; @@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); if (b->items == NULL) goto badmem; - b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); - if (b->perm == NULL) - goto badmem; - b->perm_n = 0; ceph_decode_need(p, end, b->size*sizeof(u32), bad); for (j = 0; j < b->size; j++) @@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable chooseleaf_stable = %d\n", c->chooseleaf_stable); + crush_finalize(c); + done: dout("crush_decode success\n"); return c; @@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); + mutex_init(&map->crush_workspace_mutex); return map; } @@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_weight); kfree(map->osd_addr); kfree(map->osd_primary_affinity); + kfree(map->crush_workspace); kfree(map); } @@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } +static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) +{ + void *workspace; + size_t work_size; + + if (IS_ERR(crush)) + return PTR_ERR(crush); + + work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); + dout("%s work_size %zu bytes\n", __func__, work_size); + workspace = kmalloc(work_size, GFP_NOIO); + if (!workspace) { + crush_destroy(crush); + return -ENOMEM; + } + crush_init_workspace(crush, workspace); + + if (map->crush) + crush_destroy(map->crush); + kfree(map->crush_workspace); + map->crush = crush; + map->crush_workspace = workspace; + return 0; +} + #define OSDMAP_WRAPPER_COMPAT_VER 7 #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 @@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) /* crush */ ceph_decode_32_safe(p, end, len, e_inval); - map->crush = crush_decode(*p, min(*p + len, end)); - if (IS_ERR(map->crush)) { - err = PTR_ERR(map->crush); - map->crush = NULL; + err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); + if (err) goto bad; - } - *p += len; /* ignore the rest */ *p = end; @@ -1375,7 +1421,6 @@ e_inval: struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map) { - struct crush_map *newcrush = NULL; struct ceph_fsid fsid; u32 epoch = 0; struct ceph_timespec modified; @@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new crush? 
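[Editor's aside, not part of the patch] Taken together, the crush_finalize() and crush_init_workspace() hunks in this patch agree on one layout, so the precomputed size works out to:

	working_size = sizeof(struct crush_work)
	             + max_buckets * sizeof(struct crush_work_bucket *)
	             + sum over non-empty buckets b of
	                   sizeof(struct crush_work_bucket)
	                 + buckets[b]->size * sizeof(__u32)

crush_work_size() itself is not shown in this patch, but it presumably has to return at least this working_size plus 3 * result_max * sizeof(int), since crush_do_rule() above carves its a/b/c scratch arrays out of the same buffer immediately after the crush_work area, and osdmap_set_crush() sizes the buffer with result_max = CEPH_PG_MAX_SIZE.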
*/ ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { - newcrush = crush_decode(*p, min(*p+len, end)); - if (IS_ERR(newcrush)) { - err = PTR_ERR(newcrush); - newcrush = NULL; + err = osdmap_set_crush(map, + crush_decode(*p, min(*p + len, end))); + if (err) goto bad; - } *p += len; } @@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, map->epoch++; map->modified = modified; - if (newcrush) { - if (map->crush) - crush_destroy(map->crush); - map->crush = newcrush; - newcrush = NULL; - } /* new_pools */ err = decode_new_pools(p, end, map); @@ -1505,8 +1542,6 @@ bad: print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); - if (newcrush) - crush_destroy(newcrush); return ERR_PTR(err); } @@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, BUG_ON(result_max > CEPH_PG_MAX_SIZE); - mutex_lock(&map->crush_scratch_mutex); + mutex_lock(&map->crush_workspace_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, - weight, weight_max, map->crush_scratch_ary); - mutex_unlock(&map->crush_scratch_mutex); + weight, weight_max, map->crush_workspace); + mutex_unlock(&map->crush_workspace_mutex); return r; } @@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, return; } - len = do_crush(osdmap, ruleno, pps, raw->osds, - min_t(int, pi->size, ARRAY_SIZE(raw->osds)), + if (pi->size > ARRAY_SIZE(raw->osds)) { + pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n", + pi->id, pi->crush_ruleset, pi->type, pi->size, + ARRAY_SIZE(raw->osds)); + return; + } + + len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, osdmap->osd_weight, osdmap->max_osd); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c index 154683f5f14c..705414e78ae0 100644 --- a/net/ceph/snapshot.c +++ b/net/ceph/snapshot.c @@ -18,8 +18,6 @@ * 02110-1301, USA. */ -#include <stddef.h> - #include <linux/types.h> #include <linux/export.h> #include <linux/ceph/libceph.h> diff --git a/net/compat.c b/net/compat.c index 96c544b05b15..aba929e5250f 100644 --- a/net/compat.c +++ b/net/compat.c @@ -22,6 +22,7 @@ #include <linux/filter.h> #include <linux/compat.h> #include <linux/security.h> +#include <linux/audit.h> #include <linux/export.h> #include <net/scm.h> @@ -90,11 +91,11 @@ int get_compat_msghdr(struct msghdr *kmsg, #define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32)) #define CMSG_COMPAT_DATA(cmsg) \ - ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)))) + ((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr))) #define CMSG_COMPAT_SPACE(len) \ - (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len)) + (sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len)) #define CMSG_COMPAT_LEN(len) \ - (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len)) + (sizeof(struct compat_cmsghdr) + (len)) #define CMSG_COMPAT_FIRSTHDR(msg) \ (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? 
\ @@ -130,6 +131,9 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, __kernel_size_t kcmlen, tmp; int err = -EFAULT; + BUILD_BUG_ON(sizeof(struct compat_cmsghdr) != + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))); + kcmlen = 0; kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf; ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); @@ -141,8 +145,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) return -EINVAL; - tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + - CMSG_ALIGN(sizeof(struct cmsghdr))); + tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); tmp = CMSG_ALIGN(tmp); kcmlen += tmp; ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); @@ -168,8 +171,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, goto Efault; if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) goto Einval; - tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + - CMSG_ALIGN(sizeof(struct cmsghdr))); + tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) goto Einval; kcmsg->cmsg_len = tmp; @@ -178,7 +180,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || copy_from_user(CMSG_DATA(kcmsg), CMSG_COMPAT_DATA(ucmsg), - (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))))) + (ucmlen - sizeof(*ucmsg)))) goto Efault; /* Advance. */ @@ -781,14 +783,24 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) { - int ret; - u32 a[6]; + u32 a[AUDITSC_ARGS]; + unsigned int len; u32 a0, a1; + int ret; if (call < SYS_SOCKET || call > SYS_SENDMMSG) return -EINVAL; - if (copy_from_user(a, args, nas[call])) + len = nas[call]; + if (len > sizeof(a)) + return -EINVAL; + + if (copy_from_user(a, args, len)) return -EFAULT; + + ret = audit_socketcall_compat(len / sizeof(a[0]), a); + if (ret) + return ret; + a0 = a[0]; a1 = a[1]; diff --git a/net/core/Makefile b/net/core/Makefile index f6761b6e3b29..79f9479e9658 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o +obj-$(CONFIG_GRO_CELLS) += gro_cells.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 662bea587165..ea633342ab0d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -332,7 +332,9 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) EXPORT_SYMBOL(__skb_free_datagram_locked); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, - unsigned int flags) + unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb)) { int err = 0; @@ -342,6 +344,8 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, if (skb == skb_peek(&sk->sk_receive_queue)) { __skb_unlink(skb, &sk->sk_receive_queue); atomic_dec(&skb->users); + if (destructor) + destructor(sk, skb); err = 0; } spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -375,7 +379,7 @@ EXPORT_SYMBOL(__sk_queue_drop_skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { - int err = __sk_queue_drop_skb(sk, skb, flags); + int err = __sk_queue_drop_skb(sk, skb, flags, NULL); kfree_skb(skb); sk_mem_reclaim_partial(sk); diff --git a/net/core/dev.c b/net/core/dev.c index 8db5a0b4b520..8637b2b71f3d 100644 --- 
a/net/core/dev.c +++ b/net/core/dev.c @@ -1,5 +1,5 @@ /* - * NET3 Protocol independent device support routines. + * NET3 Protocol independent device support routines. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -7,7 +7,7 @@ * 2 of the License, or (at your option) any later version. * * Derived from the non IP parts of dev.c 1.0.19 - * Authors: Ross Biro + * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * @@ -21,9 +21,9 @@ * * Changes: * D.J. Barrow : Fixed bug where dev->refcnt gets set - * to 2 if register_netdev gets called - * before net_dev_init & also removed a - * few lines of code in the process. + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. * Alan Cox : device private ioctl copies fields back. * Alan Cox : Transmit queue code does relevant * stunts to keep the queue safe. @@ -36,7 +36,7 @@ * Alan Cox : 100 backlog just doesn't cut it when * you start doing multicast video 8) * Alan Cox : Rewrote net_bh and list manager. - * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Fix ETH_P_ALL echoback lengths. * Alan Cox : Took out transmit every packet pass * Saved a few bytes in the ioctl handler * Alan Cox : Network driver sets packet type before @@ -46,7 +46,7 @@ * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR * Alan Cox : Device lock protection. - * Alan Cox : Fixed nasty side effect of device close + * Alan Cox : Fixed nasty side effect of device close * changes. * Rudi Cilibrasi : Pass the right thing to * set_mac_address() @@ -67,8 +67,8 @@ * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait - * indefinitely on dev->refcnt - * J Hadi Salim : - Backlog queue sampling + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ @@ -192,7 +192,8 @@ static seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) { - while (++net->dev_base_seq == 0); + while (++net->dev_base_seq == 0) + ; } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) @@ -274,8 +275,8 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ -static const unsigned short netdev_lock_type[] = - {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, @@ -291,22 +292,22 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; -static const char *const netdev_lock_name[] = - {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", - "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", - "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", - "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", - "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", - "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", - "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", - 
"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", - "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", - "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", - "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", - "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", - "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", - "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", - "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; @@ -352,10 +353,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) #endif /******************************************************************************* + * + * Protocol management and registration routines + * + *******************************************************************************/ - Protocol management and registration routines - -*******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is @@ -538,10 +540,10 @@ void dev_remove_offload(struct packet_offload *po) EXPORT_SYMBOL(dev_remove_offload); /****************************************************************************** - - Device Boot-time Settings Routines - -*******************************************************************************/ + * + * Device Boot-time Settings Routines + * + ******************************************************************************/ /* Boot time configuration table */ static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; @@ -574,13 +576,13 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map) } /** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice + * netdev_boot_setup_check - check boot time settings + * @dev: the netdevice * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. + * Check boot time settings for the device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found, 1 if they are. 
*/ int netdev_boot_setup_check(struct net_device *dev) { @@ -590,10 +592,10 @@ int netdev_boot_setup_check(struct net_device *dev) for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && !strcmp(dev->name, s[i].name)) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; + dev->irq = s[i].map.irq; + dev->base_addr = s[i].map.base_addr; + dev->mem_start = s[i].map.mem_start; + dev->mem_end = s[i].map.mem_end; return 1; } } @@ -603,14 +605,14 @@ EXPORT_SYMBOL(netdev_boot_setup_check); /** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device + * netdev_boot_base - get address from boot time settings + * @prefix: prefix for network device + * @unit: id for network device * - * Check boot time settings for the base address of device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. + * Check boot time settings for the base address of device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found. */ unsigned long netdev_boot_base(const char *prefix, int unit) { @@ -663,10 +665,10 @@ int __init netdev_boot_setup(char *str) __setup("netdev=", netdev_boot_setup); /******************************************************************************* - - Device Interface Subroutines - -*******************************************************************************/ + * + * Device Interface Subroutines + * + *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface @@ -737,15 +739,15 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name) EXPORT_SYMBOL(__dev_get_by_name); /** - * dev_get_by_name_rcu - find a device by its name - * @net: the applicable net namespace - * @name: name to find + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find * - * Find an interface by name. - * If the name is found a pointer to the device is returned. - * If the name is not found then %NULL is returned. - * The reference counters are not incremented so the caller must be - * careful with locks. The caller must hold RCU lock. + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) @@ -1289,8 +1291,8 @@ void netdev_state_change(struct net_device *dev) EXPORT_SYMBOL(netdev_state_change); /** - * netdev_notify_peers - notify network peers about existence of @dev - * @dev: network device + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. 
This may be used when @@ -1518,17 +1520,17 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, static int dev_boot_phase = 1; /** - * register_netdevice_notifier - register a network notifier block - * @nb: notifier + * register_netdevice_notifier - register a network notifier block + * @nb: notifier * - * Register a notifier to be called when network device events occur. - * The notifier passed is linked into the kernel structures and must - * not be reused until it has been unregistered. A negative errno code - * is returned on a failure. + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. * - * When registered all registration and up events are replayed - * to the new notifier to allow device to have a race free - * view of the network device list. + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) @@ -1585,17 +1587,17 @@ outroll: EXPORT_SYMBOL(register_netdevice_notifier); /** - * unregister_netdevice_notifier - unregister a network notifier block - * @nb: notifier + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier * - * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the - * kernel structures and may then be reused. A negative errno code - * is returned on a failure. + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. * - * After unregistering unregister and down device events are synthesized - * for all devices on the device list to the removed notifier to remove - * the need for special case cleanup code. + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) @@ -1695,37 +1697,59 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL -/* We are not allowed to call static_key_slow_dec() from irq context - * If net_disable_timestamp() is called from irq context, defer the - * static_key_slow_dec() calls. 
- */ static atomic_t netstamp_needed_deferred; +static atomic_t netstamp_wanted; +static void netstamp_clear(struct work_struct *work) +{ + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + int wanted; + + wanted = atomic_add_return(deferred, &netstamp_wanted); + if (wanted > 0) + static_key_enable(&netstamp_needed); + else + static_key_disable(&netstamp_needed); +} +static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { #ifdef HAVE_JUMP_LABEL - int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + int wanted; - if (deferred) { - while (--deferred) - static_key_slow_dec(&netstamp_needed); - return; + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 0) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) + return; } -#endif + atomic_inc(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else static_key_slow_inc(&netstamp_needed); +#endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef HAVE_JUMP_LABEL - if (in_interrupt()) { - atomic_inc(&netstamp_needed_deferred); - return; + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 1) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) + return; } -#endif + atomic_dec(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else static_key_slow_dec(&netstamp_needed); +#endif } EXPORT_SYMBOL(net_disable_timestamp); @@ -2408,28 +2432,6 @@ void netif_schedule_queue(struct netdev_queue *txq) } EXPORT_SYMBOL(netif_schedule_queue); -/** - * netif_wake_subqueue - allow sending packets on subqueue - * @dev: network device - * @queue_index: sub queue index - * - * Resume individual transmit queue of a device with multiple transmit queues. 
- */ -void netif_wake_subqueue(struct net_device *dev, u16 queue_index) -{ - struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); - - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { - struct Qdisc *q; - - rcu_read_lock(); - q = rcu_dereference(txq->qdisc); - __netif_schedule(q); - rcu_read_unlock(); - } -} -EXPORT_SYMBOL(netif_wake_subqueue); - void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { @@ -2523,6 +2525,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; qcount = dev->tc_to_txq[tc].count; } @@ -2659,9 +2662,10 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL; - else - return skb->ip_summed == CHECKSUM_NONE; + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_NONE; + + return skb->ip_summed == CHECKSUM_NONE; } /** @@ -2680,11 +2684,12 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) { + struct sk_buff *segs; + if (unlikely(skb_needs_check(skb, tx_path))) { int err; - skb_warn_bad_offload(skb); - + /* We're going to init ->check field in TCP or UDP header */ err = skb_cow_head(skb, 0); if (err < 0) return ERR_PTR(err); @@ -2712,7 +2717,12 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_reset_mac_len(skb); - return skb_mac_gso_segment(skb, features); + segs = skb_mac_gso_segment(skb, features); + + if (unlikely(skb_needs_check(skb, tx_path))) + skb_warn_bad_offload(skb); + + return segs; } EXPORT_SYMBOL(__skb_gso_segment); @@ -2737,9 +2747,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (PageHighMem(skb_frag_page(frag))) return 1; } @@ -2753,6 +2765,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; dma_addr_t addr = page_to_phys(skb_frag_page(frag)); + if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) return 1; } @@ -2795,9 +2808,9 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); - } else if (illegal_highdma(skb->dev, skb)) { - features &= ~NETIF_F_SG; } + if (illegal_highdma(skb->dev, skb)) + features &= ~NETIF_F_SG; return features; } @@ -3153,9 +3166,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) if (!cl) return skb; - /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set - * earlier by the caller. - */ + /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. 
*/ qdisc_bstats_cpu_update(cl->q, skb); switch (tc_classify(skb, cl, &cl_res, false)) { @@ -3230,6 +3241,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, skb); + if (new_index < 0) new_index = skb_tx_hash(dev, skb); @@ -3259,6 +3271,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, accel_priv, __netdev_pick_tx); @@ -3320,7 +3333,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); + skb->tc_at_ingress = 0; # ifdef CONFIG_NET_EGRESS if (static_key_false(&egress_needed)) { skb = sch_handle_egress(skb, &rc, dev); @@ -3347,16 +3360,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) } /* The device has no queue. Common case for software devices: - loopback, all the sorts of tunnels... + * loopback, all the sorts of tunnels... - Really, it is unlikely that netif_tx_lock protection is necessary - here. (f.e. loopback and IP tunnels are clean ignoring statistics - counters.) - However, it is possible, that they rely on protection - made by us here. + * Really, it is unlikely that netif_tx_lock protection is necessary + * here. (f.e. loopback and IP tunnels are clean ignoring statistics + * counters.) + * However, it is possible, that they rely on protection + * made by us here. - Check this and shot the lock. It is not prone from deadlocks. - Either shot noqueue qdisc, it is even simpler 8) + * Check this and shot the lock. It is not prone from deadlocks. 
+ *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ @@ -3418,16 +3431,20 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) EXPORT_SYMBOL(dev_queue_xmit_accel); -/*======================================================================= - Receiver routines - =======================================================================*/ +/************************************************************************* + * Receiver routines + *************************************************************************/ int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; -int weight_p __read_mostly = 64; /* old backlog weight */ +int weight_p __read_mostly = 64; /* old backlog weight */ +int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ +int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ +int dev_rx_weight __read_mostly = 64; +int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -3784,6 +3801,7 @@ static int netif_rx_internal(struct sk_buff *skb) #endif { unsigned int qtail; + ret = enqueue_to_backlog(skb, get_cpu(), &qtail); put_cpu(); } @@ -3843,6 +3861,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) while (clist) { struct sk_buff *skb = clist; + clist = clist->next; WARN_ON(atomic_read(&skb->users)); @@ -3916,7 +3935,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + skb->tc_at_ingress = 1; qdisc_bstats_cpu_update(cl->q, skb); switch (tc_classify(skb, cl, &cl_res, false)) { @@ -3981,9 +4000,7 @@ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { - ASSERT_RTNL(); - - if (dev->rx_handler) + if (netdev_is_rx_handler_busy(dev)) return -EBUSY; /* Note: rx_handler_data must be set before rx_handler */ @@ -4089,12 +4106,8 @@ another_round: goto out; } -#ifdef CONFIG_NET_CLS_ACT - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - goto ncls; - } -#endif + if (skb_skip_tc_classify(skb)) + goto skip_classify; if (pfmemalloc) goto skip_taps; @@ -4122,10 +4135,8 @@ skip_taps: goto out; } #endif -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; -ncls: -#endif + skb_reset_tc(skb); +skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; @@ -4441,7 +4452,9 @@ static void skb_gro_reset_offset(struct sk_buff *skb) pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); } } @@ -4524,6 +4537,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (&ptype->list == head) goto normal; + if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) { + ret = GRO_CONSUMED; + goto ok; + } + same_flow = NAPI_GRO_CB(skb)->same_flow; ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; @@ -4619,6 +4637,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { skb_dst_drop(skb); + secpath_reset(skb); kmem_cache_free(skbuff_head_cache, skb); } else { __kfree_skb(skb); @@ -4627,6 +4646,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) case GRO_HELD: case GRO_MERGED: + case GRO_CONSUMED: break; } @@ -4659,6 +4679,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + secpath_reset(skb); napi->skb = skb; } @@ -4697,6 +4718,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, break; case GRO_MERGED: + case GRO_CONSUMED: break; } @@ -4833,7 +4855,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = weight_p; + napi->weight = dev_rx_weight; while (again) { struct sk_buff *skb; @@ -4889,6 +4911,39 @@ void __napi_schedule(struct napi_struct *n) EXPORT_SYMBOL(__napi_schedule); /** + * napi_schedule_prep - check if napi can be scheduled + * @n: napi context + * + * Test if NAPI routine is already running, and if not mark + * it as running. This is used as a condition variable + * insure only one NAPI poll instance runs. We also make + * sure there is no pending NAPI disable. + */ +bool napi_schedule_prep(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (unlikely(val & NAPIF_STATE_DISABLE)) + return false; + new = val | NAPIF_STATE_SCHED; + + /* Sets STATE_MISSED bit if STATE_SCHED was already set + * This was suggested by Alexander Duyck, as compiler + * emits better code than : + * if (val & NAPIF_STATE_SCHED) + * new |= NAPIF_STATE_MISSED; + */ + new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * + NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return !(val & NAPIF_STATE_SCHED); +} +EXPORT_SYMBOL(napi_schedule_prep); + +/** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * @@ -4900,26 +4955,9 @@ void __napi_schedule_irqoff(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule_irqoff); -bool __napi_complete(struct napi_struct *n) -{ - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - - /* Some drivers call us directly, instead of calling - * napi_complete_done(). 
- */ - if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) - return false; - - list_del_init(&n->poll_list); - smp_mb__before_atomic(); - clear_bit(NAPI_STATE_SCHED, &n->state); - return true; -} -EXPORT_SYMBOL(__napi_complete); - bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags; + unsigned long flags, val, new; /* * 1) Don't let napi dequeue from the cpu poll list @@ -4943,14 +4981,33 @@ bool napi_complete_done(struct napi_struct *n, int work_done) else napi_gro_flush(n, false); } - if (likely(list_empty(&n->poll_list))) { - WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); - } else { + if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); - __napi_complete(n); + list_del_init(&n->poll_list); local_irq_restore(flags); } + + do { + val = READ_ONCE(n->state); + + WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); + + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + + /* If STATE_MISSED was set, leave STATE_SCHED set, + * because we will call napi->poll() one more time. + * This C code was suggested by Alexander Duyck to help gcc. + */ + new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * + NAPIF_STATE_SCHED; + } while (cmpxchg(&n->state, val, new) != val); + + if (unlikely(val & NAPIF_STATE_MISSED)) { + __napi_schedule(n); + return false; + } + return true; } EXPORT_SYMBOL(napi_complete_done); @@ -4976,6 +5033,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) { int rc; + /* Busy polling means there is a high chance device driver hard irq + * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was + * set in napi_schedule_prep(). + * Since we are about to call napi->poll() once more, we can safely + * clear NAPI_STATE_MISSED. + * + * Note: x86 could use a single "lock and ..." instruction + * to perform these two clear_bit() + */ + clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); @@ -4996,7 +5063,6 @@ bool sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; int (*napi_poll)(struct napi_struct *napi, int budget); - int (*busy_poll)(struct napi_struct *dev); void *have_poll_lock = NULL; struct napi_struct *napi; int rc; @@ -5011,17 +5077,10 @@ restart: if (!napi) goto out; - /* Note: ndo_busy_poll method is optional in linux-4.5 */ - busy_poll = napi->dev->netdev_ops->ndo_busy_poll; - preempt_disable(); for (;;) { rc = 0; local_bh_disable(); - if (busy_poll) { - rc = busy_poll(napi); - goto count; - } if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@ -5046,9 +5105,6 @@ count: LINUX_MIB_BUSYPOLLRXPACKETS, rc); local_bh_enable(); - if (rc == LL_FLUSH_FAILED) - break; /* permanent failure */ - if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || busy_loop_timeout(end_time)) break; @@ -5122,8 +5178,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); - if (napi->gro_list) - napi_schedule(napi); + + /* Note : we use a relaxed variant of napi_schedule_prep() not setting + * NAPI_STATE_MISSED, since we do not react to a device IRQ. 
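The napi_schedule_prep()/napi_complete_done() hunks above retire __napi_complete() in favour of a lock-free state machine: SCHED and MISSED are updated inside a cmpxchg loop, and "set MISSED only if SCHED was already set" is written as a divide-and-multiply so the compiler emits branchless code. A minimal userspace model of both directions using C11 atomics; the flag values are made up, not the NAPIF_* bits:

#include <stdatomic.h>
#include <stdbool.h>

#define ST_SCHED   (1u << 0)   /* illustrative values only */
#define ST_MISSED  (1u << 1)
#define ST_DISABLE (1u << 2)

/* model of napi_schedule_prep(): refuse while disabling, otherwise set
 * SCHED and record a missed wakeup if SCHED was already set */
static bool sched_prep(atomic_uint *state)
{
    unsigned int val = atomic_load(state);
    unsigned int new;

    do {
        if (val & ST_DISABLE)
            return false;
        new = val | ST_SCHED;
        /* branchless: adds ST_MISSED exactly when ST_SCHED was set */
        new |= (val & ST_SCHED) / ST_SCHED * ST_MISSED;
    } while (!atomic_compare_exchange_weak(state, &val, new));

    return !(val & ST_SCHED);
}

/* model of the tail of napi_complete_done(): clear SCHED unless a
 * wakeup was missed, in which case keep SCHED and poll once more */
static bool complete_done(atomic_uint *state)
{
    unsigned int val = atomic_load(state);
    unsigned int new;

    do {
        new = val & ~(ST_MISSED | ST_SCHED);
        new |= (val & ST_MISSED) / ST_MISSED * ST_SCHED;
    } while (!atomic_compare_exchange_weak(state, &val, new));

    return !(val & ST_MISSED);   /* false: caller must reschedule */
}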
+ */ + if (napi->gro_list && !napi_disable_pending(napi) && + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + __napi_schedule_irqoff(napi); return HRTIMER_NORESTART; } @@ -5709,6 +5770,7 @@ static int netdev_adjacent_sysfs_add(struct net_device *dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), @@ -5719,6 +5781,7 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); @@ -5988,6 +6051,7 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_notifier_changeupper_info changeupper_info; + ASSERT_RTNL(); changeupper_info.upper_dev = upper_dev; @@ -6154,50 +6218,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev, } EXPORT_SYMBOL(netdev_lower_state_changed); -int netdev_default_l2upper_neigh_construct(struct net_device *dev, - struct neighbour *n) -{ - struct net_device *lower_dev, *stop_dev; - struct list_head *iter; - int err; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - if (!lower_dev->netdev_ops->ndo_neigh_construct) - continue; - err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n); - if (err) { - stop_dev = lower_dev; - goto rollback; - } - } - return 0; - -rollback: - netdev_for_each_lower_dev(dev, lower_dev, iter) { - if (lower_dev == stop_dev) - break; - if (!lower_dev->netdev_ops->ndo_neigh_destroy) - continue; - lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); - } - return err; -} -EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct); - -void netdev_default_l2upper_neigh_destroy(struct net_device *dev, - struct neighbour *n) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - if (!lower_dev->netdev_ops->ndo_neigh_destroy) - continue; - lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); - } -} -EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy); - static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -6450,8 +6470,8 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI - is important. Some (broken) drivers set IFF_PROMISC, when - IFF_ALLMULTI is requested not asking us and not reporting. + * is important. Some (broken) drivers set IFF_PROMISC, when + * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; @@ -6749,6 +6769,7 @@ EXPORT_SYMBOL(dev_change_xdp_fd); static int dev_new_index(struct net *net) { int ifindex = net->ifindex; + for (;;) { if (++ifindex <= 0) ifindex = 1; @@ -6815,8 +6836,8 @@ static void rollback_registered_many(struct list_head *head) /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ + * this device. They should clean all the things. 
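The dev_new_index() context shown above allocates interface indexes by walking forward and wrapping back to 1, skipping any index that is still in use. A portable sketch of the same loop; the kernel relies on wrapping signed arithmetic here, so the userspace version checks INT_MAX explicitly, and the in_use() predicate stands in for __dev_get_by_index():

#include <limits.h>

static int new_index(int last, int (*in_use)(int index))
{
    int index = last;

    for (;;) {
        /* advance, never hand out 0 or a negative index */
        index = (index <= 0 || index == INT_MAX) ? 1 : index + 1;
        if (!in_use(index))
            return index;
    }
}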
+ */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (!dev->rtnl_link_ops || @@ -6974,13 +6995,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~dev->gso_partial_features; } -#ifdef CONFIG_NET_RX_BUSY_POLL - if (dev->netdev_ops->ndo_busy_poll) - features |= NETIF_F_BUSY_POLL; - else -#endif - features &= ~NETIF_F_BUSY_POLL; - return features; } @@ -7169,6 +7183,7 @@ void netif_tx_stop_all_queues(struct net_device *dev) for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + netif_tx_stop_queue(txq); } } @@ -7643,17 +7658,17 @@ void netdev_freemem(struct net_device *dev) } /** - * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @name_assign_type: origin of device name - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate - * - * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. Also allocates subqueue structs - * for each queue on the device. + * alloc_netdev_mqs - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. Also allocates subqueue structs + * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, @@ -7765,13 +7780,13 @@ free_dev: EXPORT_SYMBOL(alloc_netdev_mqs); /** - * free_netdev - free network device - * @dev: device + * free_netdev - free network device + * @dev: device * - * This function does the last stage of destroying an allocated device - * interface. The reference to the device object is released. - * If this is the last reference then it will be freed. - * Must be called in process context. + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. If this + * is the last reference then it will be freed.Must be called in process + * context. */ void free_netdev(struct net_device *dev) { @@ -7953,12 +7968,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_shutdown(dev); /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - - Note that dev->reg_state stays at NETREG_REGISTERED. - This is wanted because this way 8021q and macvlan know - the device is just moving and can keep their slaves up. - */ + * this device. They should clean all the things. + * + * Note that dev->reg_state stays at NETREG_REGISTERED. + * This is wanted because this way 8021q and macvlan know + * the device is just moving and can keep their slaves up. 
+ */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); diff --git a/net/core/devlink.c b/net/core/devlink.c index 2b5bf9efa720..e9c1e6acfb6d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1392,9 +1392,9 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, return -EOPNOTSUPP; } -static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) +static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; void *hdr; @@ -1408,50 +1408,52 @@ static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_put_handle(msg, devlink); if (err) - goto out; + goto nla_put_failure; - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - goto out; - err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); - if (err) - goto out; + if (ops->eswitch_mode_get) { + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + goto nla_put_failure; + err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); + if (err) + goto nla_put_failure; + } if (ops->eswitch_inline_mode_get) { err = ops->eswitch_inline_mode_get(devlink, &inline_mode); if (err) - goto out; + goto nla_put_failure; err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, inline_mode); if (err) - goto out; + goto nla_put_failure; } genlmsg_end(msg, hdr); return 0; -out: +nla_put_failure: genlmsg_cancel(msg, hdr); return err; } -static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; struct sk_buff *msg; int err; - if (!ops || !ops->eswitch_mode_get) + if (!ops) return -EOPNOTSUPP; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; - err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, - info->snd_portid, info->snd_seq, 0); + err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, + info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); @@ -1461,8 +1463,8 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; @@ -1629,15 +1631,15 @@ static const struct genl_ops devlink_nl_ops[] = { DEVLINK_NL_FLAG_LOCK_PORTS, }, { - .cmd = DEVLINK_CMD_ESWITCH_MODE_GET, - .doit = devlink_nl_cmd_eswitch_mode_get_doit, + .cmd = DEVLINK_CMD_ESWITCH_GET, + .doit = devlink_nl_cmd_eswitch_get_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { - .cmd = DEVLINK_CMD_ESWITCH_MODE_SET, - .doit = devlink_nl_cmd_eswitch_mode_set_doit, + .cmd = DEVLINK_CMD_ESWITCH_SET, + .doit = devlink_nl_cmd_eswitch_set_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 8e0c0635ee97..fb55327dcfea 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c 
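The reworked devlink_nl_eswitch_fill() above emits the mode attribute only when the driver implements eswitch_mode_get() and unwinds the partially built message with genlmsg_cancel() on any failure. A userspace sketch of that optional-attribute-with-rollback pattern; the TLV helper and attribute type are hypothetical, not the netlink API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct msg { uint8_t buf[128]; size_t len; };

/* hypothetical TLV helper standing in for nla_put_u16() */
static int put_u16(struct msg *m, uint8_t type, uint16_t val)
{
    if (m->len + 2 + sizeof(val) > sizeof(m->buf))
        return -1;
    m->buf[m->len++] = type;
    m->buf[m->len++] = sizeof(val);
    memcpy(m->buf + m->len, &val, sizeof(val));
    m->len += sizeof(val);
    return 0;
}

struct eswitch_ops { int (*mode_get)(uint16_t *mode); };

/* mirrors the fill function above: a missing getter is simply skipped,
 * and the message is rolled back (genlmsg_cancel()-style) on failure */
static int eswitch_fill(struct msg *m, const struct eswitch_ops *ops)
{
    size_t start = m->len;
    uint16_t mode;

    if (ops->mode_get) {
        if (ops->mode_get(&mode) || put_u16(m, 1 /* MODE */, mode))
            goto nla_put_failure;
    }
    return 0;

nla_put_failure:
    m->len = start;
    return -1;
}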
@@ -75,6 +75,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) struct nlattr *nla; struct sk_buff *skb; unsigned long flags; + void *msg_header; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); @@ -82,21 +83,41 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) skb = genlmsg_new(al, GFP_KERNEL); - if (skb) { - genlmsg_put(skb, 0, 0, &net_drop_monitor_family, - 0, NET_DM_CMD_ALERT); - nla = nla_reserve(skb, NLA_UNSPEC, - sizeof(struct net_dm_alert_msg)); - msg = nla_data(nla); - memset(msg, 0, al); - } else { - mod_timer(&data->send_timer, jiffies + HZ / 10); + if (!skb) + goto err; + + msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + if (!msg_header) { + nlmsg_free(skb); + skb = NULL; + goto err; + } + nla = nla_reserve(skb, NLA_UNSPEC, + sizeof(struct net_dm_alert_msg)); + if (!nla) { + nlmsg_free(skb); + skb = NULL; + goto err; } + msg = nla_data(nla); + memset(msg, 0, al); + goto out; +err: + mod_timer(&data->send_timer, jiffies + HZ / 10); +out: spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); spin_unlock_irqrestore(&data->lock, flags); + if (skb) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; + struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh); + + genlmsg_end(skb, genlmsg_data(gnlh)); + } + return skb; } diff --git a/net/core/dst.c b/net/core/dst.c index b5cbbe07f786..960e503b5a52 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->pending_confirm = 0; dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e23766c7e3ba..be7bab1adcde 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -102,7 +102,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", - [NETIF_F_BUSY_POLL_BIT] = "busy-poll", [NETIF_F_HW_TC_BIT] = "hw-tc-offload", }; @@ -1405,9 +1404,12 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) if (regs.len > reglen) regs.len = reglen; - regbuf = vzalloc(reglen); - if (reglen && !regbuf) - return -ENOMEM; + regbuf = NULL; + if (reglen) { + regbuf = vzalloc(reglen); + if (!regbuf) + return -ENOMEM; + } ops->get_regs(dev, ®s, regbuf); @@ -1712,7 +1714,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels, max; + struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; u32 max_rx_in_use = 0; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) @@ -1817,11 +1819,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) ret = __ethtool_get_sset_count(dev, gstrings.string_set); if (ret < 0) return ret; + if (ret > S32_MAX / ETH_GSTRING_LEN) + return -ENOMEM; + WARN_ON_ONCE(!ret); gstrings.len = ret; - - data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER); - if (!data) + data = vzalloc(gstrings.len * ETH_GSTRING_LEN); + if (gstrings.len && !data) return -ENOMEM; __ethtool_get_strings(dev, gstrings.string_set, data); @@ -1830,12 +1834,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user 
*useraddr) if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) goto out; useraddr += sizeof(gstrings); - if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + if (gstrings.len && + copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) goto out; ret = 0; out: - kfree(data); + vfree(data); return ret; } @@ -1912,14 +1917,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; - WARN_ON(n_stats == 0); - + if (n_stats > S32_MAX / sizeof(u64)) + return -ENOMEM; + WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; - data = kmalloc(n_stats * sizeof(u64), GFP_USER); - if (!data) + data = vzalloc(n_stats * sizeof(u64)); + if (n_stats && !data) return -ENOMEM; ops->get_ethtool_stats(dev, &stats, data); @@ -1928,12 +1934,12 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) goto out; ret = 0; out: - kfree(data); + vfree(data); return ret; } @@ -1948,17 +1954,18 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) return -EOPNOTSUPP; n_stats = phy_get_sset_count(phydev); - if (n_stats < 0) return n_stats; - WARN_ON(n_stats == 0); + if (n_stats > S32_MAX / sizeof(u64)) + return -ENOMEM; + WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; - data = kmalloc_array(n_stats, sizeof(u64), GFP_USER); - if (!data) + data = vzalloc(n_stats * sizeof(u64)); + if (n_stats && !data) return -ENOMEM; mutex_lock(&phydev->lock); @@ -1969,12 +1976,12 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) goto out; ret = 0; out: - kfree(data); + vfree(data); return ret; } diff --git a/net/core/filter.c b/net/core/filter.c index e6c412b94dec..ebaeaf2e46e8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -76,9 +76,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ - if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) + if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return -ENOMEM; - + } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) return err; @@ -1416,8 +1417,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1447,8 +1448,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_RAW_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, 
u32, len) @@ -1522,10 +1523,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; + bool do_mforce = flags & BPF_F_MARK_ENFORCE; __sum16 *ptr; - if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | - BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1533,7 +1535,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, return -EFAULT; ptr = (__sum16 *)(skb->data + offset); - if (is_mmzero && !*ptr) + if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { @@ -1601,10 +1603,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2306,8 +2308,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -2377,8 +2379,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_RAW_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2412,8 +2414,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_RAW_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; @@ -2483,8 +2485,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2509,8 +2511,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * @@ -2582,8 +2584,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) return -EFAULT; - return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, - bpf_xdp_copy); + return bpf_event_output(map, flags, meta, meta_size, xdp->data, + xdp_size, bpf_xdp_copy); } static const struct bpf_func_proto bpf_xdp_event_output_proto = { @@ -2593,12 +2595,12 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = 
ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -2626,6 +2628,17 @@ sk_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * +sk_filter_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -2680,7 +2693,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return sk_filter_func_proto(func_id); + return bpf_base_func_proto(func_id); } } @@ -2695,7 +2708,7 @@ xdp_func_proto(enum bpf_func_id func_id) case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; default: - return sk_filter_func_proto(func_id); + return bpf_base_func_proto(func_id); } } @@ -2706,7 +2719,7 @@ cg_skb_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; default: - return sk_filter_func_proto(func_id); + return bpf_base_func_proto(func_id); } } @@ -2733,7 +2746,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return sk_filter_func_proto(func_id); + return bpf_base_func_proto(func_id); } } @@ -2776,11 +2789,22 @@ static bool __is_valid_access(int off, int size) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + if (off + size > + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32)) + return false; + break; + default: + if (size != sizeof(__u32)) + return false; + } return true; } @@ -2799,7 +2823,7 @@ static bool sk_filter_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: break; default: return false; @@ -2823,7 +2847,7 @@ static bool lwt_is_valid_access(int off, int size, case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: break; default: return false; @@ -2915,7 +2939,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case offsetof(struct __sk_buff, tc_index): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: case offsetof(struct __sk_buff, tc_classid): break; default: @@ -2972,38 +2996,33 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -void bpf_warn_invalid_xdp_buffer(void) -{ - WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput degradation\n"); -} -EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer); - -static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn_buf, - struct bpf_prog *prog) +static u32 bpf_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; + int off; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, len): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct __sk_buff, protocol): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, protocol)); break; case offsetof(struct __sk_buff, vlan_proto): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, vlan_proto)); break; @@ -3011,17 +3030,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, priority)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, priority)); break; case offsetof(struct __sk_buff, ingress_ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, skb_iif)); break; @@ -3029,17 +3048,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct __sk_buff, hash): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, hash)); break; @@ -3047,63 +3066,77 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, mark)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, 
si->src_reg, offsetof(struct sk_buff, mark)); break; case offsetof(struct __sk_buff, pkt_type): - return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, + si->src_reg, insn); case offsetof(struct __sk_buff, queue_mapping): - return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, + si->src_reg, insn); case offsetof(struct __sk_buff, vlan_present): return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, - dst_reg, src_reg, insn); + si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, vlan_tci): return convert_skb_access(SKF_AD_VLAN_TAG, - dst_reg, src_reg, insn); + si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct qdisc_skb_cb, data)) % + sizeof(__u64)); prog->cb_access = 1; - ctx_off -= offsetof(struct __sk_buff, cb[0]); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, data); + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): - ctx_off -= offsetof(struct __sk_buff, tc_classid); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); + + off = si->off; + off -= offsetof(struct __sk_buff, tc_classid); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, tc_classid); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_end): - ctx_off -= offsetof(struct __sk_buff, data_end); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, - ctx_off); + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): @@ -3111,110 +3144,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, tc_index)); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, 
si->src_reg, offsetof(struct sk_buff, tc_index)); - break; #else if (type == BPF_WRITE) - *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else - *insn++ = BPF_MOV64_IMM(dst_reg, 0); - break; + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif + break; } return insn - insn_buf; } static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - int dst_reg, int src_reg, - int ctx_off, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sock, sk_family)); break; case offsetof(struct bpf_sock, type): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); break; case offsetof(struct bpf_sock, protocol): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; } return insn - insn_buf; } -static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; default: - return sk_filter_convert_ctx_access(type, dst_reg, src_reg, - ctx_off, insn_buf, prog); + return bpf_convert_ctx_access(type, si, insn_buf, prog); } return insn - insn_buf; } -static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 xdp_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, 
data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; } @@ -3225,7 +3255,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { @@ -3244,69 +3274,69 @@ static const struct bpf_verifier_ops xdp_ops = { static const struct bpf_verifier_ops cg_skb_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_convert_ctx_access, }; static const struct bpf_verifier_ops lwt_inout_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_convert_ctx_access, }; static const struct bpf_verifier_ops lwt_xmit_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops cg_sock_ops = { - .get_func_proto = sk_filter_func_proto, + .get_func_proto = bpf_base_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -static struct bpf_prog_type_list sk_filter_type __read_mostly = { +static struct bpf_prog_type_list sk_filter_type __ro_after_init = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; -static struct bpf_prog_type_list sched_cls_type __read_mostly = { +static struct bpf_prog_type_list sched_cls_type __ro_after_init = { .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_CLS, }; -static struct bpf_prog_type_list sched_act_type __read_mostly = { +static struct bpf_prog_type_list sched_act_type __ro_after_init = { .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_ACT, }; -static struct bpf_prog_type_list xdp_type __read_mostly = { +static struct bpf_prog_type_list xdp_type __ro_after_init = { .ops = &xdp_ops, .type = BPF_PROG_TYPE_XDP, }; -static struct bpf_prog_type_list cg_skb_type __read_mostly = { +static struct bpf_prog_type_list cg_skb_type __ro_after_init = { .ops = &cg_skb_ops, .type = BPF_PROG_TYPE_CGROUP_SKB, }; -static struct bpf_prog_type_list lwt_in_type __read_mostly = { +static struct bpf_prog_type_list lwt_in_type __ro_after_init = { .ops = &lwt_inout_ops, .type = BPF_PROG_TYPE_LWT_IN, }; -static struct bpf_prog_type_list lwt_out_type __read_mostly = { +static struct bpf_prog_type_list lwt_out_type __ro_after_init = { .ops = &lwt_inout_ops, .type = BPF_PROG_TYPE_LWT_OUT, }; -static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { +static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = { .ops = &lwt_xmit_ops, .type = BPF_PROG_TYPE_LWT_XMIT, }; -static struct bpf_prog_type_list cg_sock_type __read_mostly = { +static struct bpf_prog_type_list cg_sock_type __ro_after_init = { .ops = &cg_sock_ops, .type = BPF_PROG_TYPE_CGROUP_SOCK }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 
d6447dc10371..c35aae13c8d2 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -67,8 +67,8 @@ EXPORT_SYMBOL(skb_flow_dissector_init); * The function will try to retrieve a be32 entity at * offset poff */ -__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data, - int hlen) +static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, + void *data, int hlen) { __be16 *u, _u; @@ -138,6 +138,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_arp *key_arp; struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; @@ -379,6 +380,62 @@ mpls: nhoff += FCOE_HEADER_LEN; goto out_good; + + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): { + struct { + unsigned char ar_sha[ETH_ALEN]; + unsigned char ar_sip[4]; + unsigned char ar_tha[ETH_ALEN]; + unsigned char ar_tip[4]; + } *arp_eth, _arp_eth; + const struct arphdr *arp; + struct arphdr *_arp; + + arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, + hlen, &_arp); + if (!arp) + goto out_bad; + + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_hln != ETH_ALEN || + arp->ar_pln != 4 || + (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST))) + goto out_bad; + + arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), + sizeof(_arp_eth), data, + hlen, + &_arp_eth); + if (!arp_eth) + goto out_bad; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ARP)) { + + key_arp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ARP, + target_container); + + memcpy(&key_arp->sip, arp_eth->ar_sip, + sizeof(key_arp->sip)); + memcpy(&key_arp->tip, arp_eth->ar_tip, + sizeof(key_arp->tip)); + + /* Only store the lower byte of the opcode; + * this covers ARPOP_REPLY and ARPOP_REQUEST. 
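The ARP/RARP branch added to __skb_flow_dissect() above validates the header before copying addresses into the flow key: Ethernet hardware type, IPv4 protocol, 6/4 address lengths, and only the request and reply opcodes. A self-contained userspace version of the same checks, with the relevant constants written out locally in place of the uapi defines:

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct arp_pkt {
    uint16_t ar_hrd, ar_pro;
    uint8_t  ar_hln, ar_pln;
    uint16_t ar_op;
    uint8_t  sha[6], sip[4], tha[6], tip[4];
} __attribute__((packed));

/* minimal ARP-over-Ethernet sanity check, same constraints as the
 * dissector hunk above */
static int parse_arp(const uint8_t *data, size_t len, struct arp_pkt *out)
{
    if (len < sizeof(*out))              /* bounds check before copying */
        return -1;
    memcpy(out, data, sizeof(*out));

    if (ntohs(out->ar_hrd) != 1 ||       /* ARPHRD_ETHER */
        ntohs(out->ar_pro) != 0x0800 ||  /* ETH_P_IP */
        out->ar_hln != 6 || out->ar_pln != 4)
        return -1;
    if (ntohs(out->ar_op) != 1 &&        /* ARPOP_REQUEST */
        ntohs(out->ar_op) != 2)          /* ARPOP_REPLY */
        return -1;
    return 0;
}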
+ */ + key_arp->op = ntohs(arp->ar_op) & 0xff; + + ether_addr_copy(key_arp->sha, arp_eth->ar_sha); + ether_addr_copy(key_arp->tha, arp_eth->ar_tha); + } + + goto out_good; + } + default: goto out_bad; } @@ -468,8 +525,9 @@ ip_proto_again: if (hdr->flags & GRE_ACK) offset += sizeof(((struct pptp_gre_header *)0)->ack); - ppp_hdr = skb_header_pointer(skb, nhoff + offset, - sizeof(_ppp_hdr), _ppp_hdr); + ppp_hdr = __skb_header_pointer(skb, nhoff + offset, + sizeof(_ppp_hdr), + data, hlen, _ppp_hdr); if (!ppp_hdr) goto out_bad; diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c new file mode 100644 index 000000000000..c98bbfbd26b8 --- /dev/null +++ b/net/core/gro_cells.c @@ -0,0 +1,92 @@ +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <net/gro_cells.h> + +struct gro_cell { + struct sk_buff_head napi_skbs; + struct napi_struct napi; +}; + +int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct gro_cell *cell; + + if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) + return netif_rx(skb); + + cell = this_cpu_ptr(gcells->cells); + + if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + __skb_queue_tail(&cell->napi_skbs, skb); + if (skb_queue_len(&cell->napi_skbs) == 1) + napi_schedule(&cell->napi); + return NET_RX_SUCCESS; +} +EXPORT_SYMBOL(gro_cells_receive); + +/* called under BH context */ +static int gro_cell_poll(struct napi_struct *napi, int budget) +{ + struct gro_cell *cell = container_of(napi, struct gro_cell, napi); + struct sk_buff *skb; + int work_done = 0; + + while (work_done < budget) { + skb = __skb_dequeue(&cell->napi_skbs); + if (!skb) + break; + napi_gro_receive(napi, skb); + work_done++; + } + + if (work_done < budget) + napi_complete_done(napi, work_done); + return work_done; +} + +int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) +{ + int i; + + gcells->cells = alloc_percpu(struct gro_cell); + if (!gcells->cells) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + + __skb_queue_head_init(&cell->napi_skbs); + + set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); + + netif_napi_add(dev, &cell->napi, gro_cell_poll, + NAPI_POLL_WEIGHT); + napi_enable(&cell->napi); + } + return 0; +} +EXPORT_SYMBOL(gro_cells_init); + +void gro_cells_destroy(struct gro_cells *gcells) +{ + int i; + + if (!gcells->cells) + return; + for_each_possible_cpu(i) { + struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + + netif_napi_del(&cell->napi); + __skb_queue_purge(&cell->napi_skbs); + } + free_percpu(gcells->cells); + gcells->cells = NULL; +} +EXPORT_SYMBOL(gro_cells_destroy); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 71bb3e2eca08..0cfe7b0216c3 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -237,7 +237,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, }; -static int bpf_build_state(struct net_device *dev, struct nlattr *nla, +static int bpf_build_state(struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts) { @@ -352,7 +352,7 @@ static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) 0; } -int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) { /* FIXME: * The LWT state is 
currently rebuilt for delete requests which @@ -386,6 +386,7 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = { .fill_encap = bpf_fill_encap_info, .get_encap_size = bpf_encap_nlsize, .cmp_encap = bpf_encap_cmp, + .owner = THIS_MODULE, }; static int __init bpf_lwt_init(void) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index a5d4e866ce88..6df9f8fabf0c 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -26,6 +26,7 @@ #include <net/lwtunnel.h> #include <net/rtnetlink.h> #include <net/ip6_fib.h> +#include <net/nexthop.h> #ifdef CONFIG_MODULES @@ -100,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, } EXPORT_SYMBOL(lwtunnel_encap_del_ops); -int lwtunnel_build_state(struct net_device *dev, u16 encap_type, +int lwtunnel_build_state(u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws) { @@ -114,25 +115,77 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state && try_module_get(ops->owner))) { + ret = ops->build_state(encap, family, cfg, lws); + if (ret) + module_put(ops->owner); + } + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_valid_encap_type(u16 encap_type) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + rcu_read_unlock(); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - rcu_read_unlock(); + __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); + rtnl_lock(); + rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); + rcu_read_unlock(); } } #endif - if (likely(ops && ops->build_state)) - ret = ops->build_state(dev, encap, family, cfg, lws); - rcu_read_unlock(); + return ops ? 
0 : -EOPNOTSUPP; +} +EXPORT_SYMBOL(lwtunnel_valid_encap_type); - return ret; +int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) +{ + struct rtnexthop *rtnh = (struct rtnexthop *)attr; + struct nlattr *nla_entype; + struct nlattr *attrs; + struct nlattr *nla; + u16 encap_type; + int attrlen; + + while (rtnh_ok(rtnh, remaining)) { + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + attrs = rtnh_attrs(rtnh); + nla = nla_find(attrs, attrlen, RTA_ENCAP); + nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + + if (nla_entype) { + encap_type = nla_get_u16(nla_entype); + + if (lwtunnel_valid_encap_type(encap_type) != 0) + return -EOPNOTSUPP; + } + } + rtnh = rtnh_next(rtnh, &remaining); + } + + return 0; } -EXPORT_SYMBOL(lwtunnel_build_state); +EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr); void lwtstate_free(struct lwtunnel_state *lws) { @@ -144,6 +197,7 @@ void lwtstate_free(struct lwtunnel_state *lws) } else { kfree(lws); } + module_put(ops->owner); } EXPORT_SYMBOL(lwtstate_free); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7bb12e07ffef..e7c12caa20c8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2923,7 +2923,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) return; set_bit(index, p->data_state); - call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); + if (index == NEIGH_VAR_DELAY_PROBE_TIME) + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 2ec86fc552df..756637dc7a57 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/types.h> +#include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8e69ce472236..96947f5d41e4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; /* reset reclass/redir ttl */ -#endif + skb_reset_tc(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 5d26056b6d8f..9b8727c67b58 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -34,8 +34,6 @@ * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. 
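The lwtunnel changes above add an ownership rule: lwtunnel_build_state() pins ops->owner with try_module_get() before calling build_state(), drops the reference on a failed build, and lwtstate_free() drops it when the state finally dies. A small refcount sketch of that rule; a plain int counter stands in for the module refcount:

#include <stdlib.h>

struct encap_ops {
    int (*build_state)(void **state);
    int *owner_refs;               /* stand-in for ops->owner's refcount */
};

/* mirrors lwtunnel_build_state(): keep the owner pinned for as long as
 * any state it built is alive, unwind on failure */
static int build_state(const struct encap_ops *ops, void **state)
{
    int ret;

    ++*ops->owner_refs;            /* try_module_get(ops->owner) */
    ret = ops->build_state(state);
    if (ret)
        --*ops->owner_refs;        /* module_put() on failure */
    return ret;
}

/* mirrors lwtstate_free(): the reference is only dropped here */
static void state_free(const struct encap_ops *ops, void *state)
{
    free(state);
    --*ops->owner_refs;            /* module_put() when the state dies */
}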
*/ -int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); void reqsk_queue_alloc(struct request_sock_queue *queue) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 18b5aae99bec..c4e84c558240 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -837,8 +837,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { - if (dev->dev.parent && dev_is_pci(dev->dev.parent) && - (ext_filter_mask & RTEXT_FILTER_VF)) { + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * @@ -877,8 +876,6 @@ static size_t rtnl_port_size(const struct net_device *dev, { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ - + nla_total_size(sizeof(struct ifla_port_vsi)) - /* PORT_VSI_TYPE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ @@ -1492,14 +1489,19 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, - [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_port_vsi)}, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, + + /* Unused, but we need to keep it here since user space could + * fill it. It's also broken with regard to NLA_BINARY use in + * combination with structs. 
+ */ + [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { @@ -2356,7 +2358,6 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { - int err; struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; @@ -2371,11 +2372,10 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - err = -ENOMEM; dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); if (!dev) - goto err; + return ERR_PTR(-ENOMEM); dev_net_set(dev, net); dev->rtnl_link_ops = ops; @@ -2401,9 +2401,6 @@ struct net_device *rtnl_create_link(struct net *net, dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); return dev; - -err: - return ERR_PTR(err); } EXPORT_SYMBOL(rtnl_create_link); @@ -2571,7 +2568,7 @@ replay: return -ENODEV; } - if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; if (!ops) { @@ -2653,6 +2650,11 @@ replay: if (err < 0) goto out_unregister; } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + if (err) + goto out_unregister; + } out: if (link_net) put_net(link_net); @@ -3829,6 +3831,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = 0; } + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { + struct rtnl_af_ops *af_ops; + + *idxattr = IFLA_STATS_AF_SPEC; + attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC); + if (!attr) + goto nla_put_failure; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_stats_af) { + struct nlattr *af; + int err; + + af = nla_nest_start(skb, af_ops->family); + if (!af) + goto nla_put_failure; + + err = af_ops->fill_stats_af(skb, dev); + + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, af); + } + } + + nla_nest_end(skb, attr); + + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3885,6 +3920,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) size += rtnl_get_offload_stats_size(dev); + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { + struct rtnl_af_ops *af_ops; + + /* for IFLA_STATS_AF_SPEC */ + size += nla_total_size(0); + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_stats_af_size) { + size += nla_total_size( + af_ops->get_stats_af_size(dev)); + + /* for AF_* */ + size += nla_total_size(0); + } + } + } + return size; } @@ -3898,6 +3950,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh) u32 filter_mask; int err; + if (nlmsg_len(nlh) < sizeof(*ifsm)) + return -EINVAL; + ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); @@ -3947,6 +4002,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = net->dev_base_seq; + if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) + return -EINVAL; + ifsm = nlmsg_data(cb->nlh); filter_mask = ifsm->filter_mask; if (!filter_mask) diff --git a/net/core/scm.c b/net/core/scm.c index d8820438ba37..b6d83686e149 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -71,7 +71,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list 
**fplp) struct file **fpp; int i, num; - num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int); if (num <= 0) return 0; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 88a8e429fc3e..758f140b6bed 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -1,3 +1,7 @@ +/* + * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + #include <linux/kernel.h> #include <linux/init.h> #include <linux/cryptohash.h> @@ -8,18 +12,18 @@ #include <linux/ktime.h> #include <linux/string.h> #include <linux/net.h> - +#include <linux/siphash.h> #include <net/secure_seq.h> #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#include <linux/in6.h> #include <net/tcp.h> -#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) -static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; +static siphash_key_t net_secret __read_mostly; static __always_inline void net_secret_init(void) { - net_get_random_once(net_secret, sizeof(net_secret)); + net_get_random_once(&net_secret, sizeof(net_secret)); } #endif @@ -44,80 +48,70 @@ static u32 seq_scale(u32 seq) u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport, u32 *tsoff) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u32 i; - + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; + u64 hash; net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32)daddr[i]; - secret[4] = net_secret[4] + - (((__force u16)sport << 16) + (__force u16)dport); - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; - return seq_scale(hash[0]); + hash = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + *tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0; + return seq_scale(hash); } EXPORT_SYMBOL(secure_tcpv6_sequence_number); u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u32 i; - + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .dport = dport + }; net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32) daddr[i]; - secret[4] = net_secret[4] + (__force u32)dport; - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - return hash[0]; + return siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); } EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET +/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), + * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, + * it would be easy enough to have the former function use siphash_4u32, passing + * the arguments as separate u32. 
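The pattern used throughout this file is the same in every helper: gather the connection identifiers into a SIPHASH_ALIGNMENT-aligned struct, then run keyed siphash over exactly the bytes up to the last field. A condensed sketch of that pattern, with demo_* names that are illustrative only (assuming <linux/siphash.h> and the net_get_random_once() helper):

        static siphash_key_t demo_secret __read_mostly;

        static u32 demo_port_hash(const struct in6_addr *saddr,
                                  const struct in6_addr *daddr, __be16 dport)
        {
                const struct {
                        struct in6_addr saddr;
                        struct in6_addr daddr;
                        __be16 dport;
                } __aligned(SIPHASH_ALIGNMENT) combined = {
                        .saddr = *saddr,
                        .daddr = *daddr,
                        .dport = dport,
                };

                net_get_random_once(&demo_secret, sizeof(demo_secret));
                /* offsetofend() stops at the last real field, so trailing
                 * struct padding never enters the hash. */
                return siphash(&combined, offsetofend(typeof(combined), dport),
                               &demo_secret);
        }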
+ */ + u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 *tsoff) { - u32 hash[MD5_DIGEST_WORDS]; - + u64 hash; net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = ((__force u16)sport << 16) + (__force u16)dport; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; - return seq_scale(hash[0]); + hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); + *tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0; + return seq_scale(hash); } u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { - u32 hash[MD5_DIGEST_WORDS]; - net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = (__force u32)dport ^ net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; + return siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u16)dport, &net_secret); } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif @@ -126,21 +120,13 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) { - u32 hash[MD5_DIGEST_WORDS]; u64 seq; - net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = ((__force u16)sport << 16) + (__force u16)dport; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - seq = hash[0] | (((u64)hash[1]) << 32); + seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; - return seq; } EXPORT_SYMBOL(secure_dccp_sequence_number); @@ -149,26 +135,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number); u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, __be16 sport, __be16 dport) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; u64 seq; - u32 i; - net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32)daddr[i]; - secret[4] = net_secret[4] + - (((__force u16)sport << 16) + (__force u16)dport); - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - seq = hash[0] | (((u64)hash[1]) << 32); + seq = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; - return seq; } EXPORT_SYMBOL(secure_dccpv6_sequence_number); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a03730fbc1a..f3557958e9bf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_set(&fclones->fclone_ref, 1); fclones->skb2.fclone = SKB_FCLONE_CLONE; - fclones->skb2.pfmemalloc = pfmemalloc; } out: return skb; @@ -369,7 +368,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, fragsz, gfp_mask); + data = page_frag_alloc(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -391,7 
+390,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(&nc->page, fragsz, gfp_mask); + return page_frag_alloc(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -441,7 +440,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, len, gfp_mask); + data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; local_irq_restore(flags); @@ -505,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(&nc->page, len, gfp_mask); + data = page_frag_alloc(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; @@ -655,7 +654,7 @@ static void skb_release_head_state(struct sk_buff *skb) skb->destructor(skb); } #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb->nfct); + nf_conntrack_put(skb_nfct(skb)); #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); @@ -878,9 +877,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); -#ifdef CONFIG_NET_CLS_ACT - CHECK_SKB_FIELD(tc_verd); -#endif #endif } @@ -1195,10 +1191,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone); int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { - int i; - u8 *data; - int size = nhead + skb_end_offset(skb) + ntail; + int i, osize = skb_end_offset(skb); + int size = osize + nhead + ntail; long off; + u8 *data; BUG_ON(nhead < 0); @@ -1260,6 +1256,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + + /* It is not generally safe to change skb->truesize. + * For the moment, we really care of rx path, or + * when skb is orphaned (not attached to a socket). 
+ */ + if (!skb->sk || skb->destructor == sock_edemux) + skb->truesize += size - osize; + return 0; nofrags: diff --git a/net/core/sock.c b/net/core/sock.c index f560e0826009..e7d74940e863 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , - "sk_lock-AF_MAX" + "sk_lock-AF_QIPCRTR", "sk_lock-AF_SMC" , "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , - "slock-AF_MAX" + "slock-AF_QIPCRTR", "slock-AF_SMC" , "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , - "clock-AF_MAX" + "clock-AF_QIPCRTR", "clock-AF_SMC" , "clock-AF_MAX" }; /* @@ -367,7 +367,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) if (tv.tv_sec == 0 && tv.tv_usec == 0) return 0; if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) - *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); + *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ); return 0; } @@ -502,6 +502,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; @@ -762,11 +763,8 @@ set_rcvbuf: goto set_rcvbuf; case SO_KEEPALIVE: -#ifdef CONFIG_INET - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) - tcp_set_keepalive(sk, valbool); -#endif + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; @@ -1148,7 +1146,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.tm.tv_usec = 0; } else { v.tm.tv_sec = sk->sk_rcvtimeo / HZ; - v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; + v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; } break; @@ -1159,7 +1157,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.tm.tv_usec = 0; } else { v.tm.tv_sec = sk->sk_sndtimeo / HZ; - v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; } break; @@ -1522,6 +1520,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) af_family_clock_key_strings[newsk->sk_family]); newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; atomic_set(&newsk->sk_drops, 0); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 2a46e4009f62..4ead336e14ea 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, 
int write, } #endif +static int proc_do_dev_weight(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret != 0) + return ret; + + dev_rx_weight = weight_p * dev_weight_rx_bias; + dev_tx_weight = weight_p * dev_weight_tx_bias; + + return ret; +} + static int proc_do_rss_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = { .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "dev_weight_rx_bias", + .data = &dev_weight_rx_bias, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "dev_weight_tx_bias", + .data = &dev_weight_tx_bias, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, }, { .procname = "netdev_max_backlog", @@ -305,6 +334,13 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dointvec, }, + { + .procname = "bpf_jit_kallsyms", + .data = &bpf_jit_kallsyms, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, # endif #endif { diff --git a/net/dccp/input.c b/net/dccp/input.c index ba347184bda9..4a05d7876850 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const int old_state = sk->sk_state; + bool acceptable; int queued = 0; /* @@ -603,10 +604,16 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (inet_csk(sk)->icsk_af_ops->conn_request(sk, - skb) < 0) + /* It is possible that we process SYN packets from backlog, + * so we need to make sure to disable BH right there. 
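The point of that comment is that ->conn_request() style handlers assume softirq (BH-disabled) context, while packets taken off a listening socket's backlog are processed in process context. Disabling BHs around the call restores the expected environment. A condensed sketch of the pattern (demo_listen_rcv is a hypothetical wrapper, not a function added by this patch):

        static int demo_listen_rcv(struct sock *sk, struct sk_buff *skb)
        {
                bool acceptable;

                /* Process-context caller (socket backlog), but the request
                 * handler expects BH-disabled context as in softirq. */
                local_bh_disable();
                acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
                local_bh_enable();

                if (!acceptable)
                        return 1;

                consume_skb(skb);
                return 0;
        }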
+ */ + local_bh_disable(); + acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + if (!acceptable) return 1; - goto discard; + consume_skb(skb); + return 0; } if (dh->dccph_type == DCCP_PKT_RESET) goto discard; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d859a5c36e70..409d0cfd3447 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -904,7 +904,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), - .bind_conflict = inet_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, @@ -1018,9 +1017,15 @@ static void __net_exit dccp_v4_exit_net(struct net *net) inet_ctl_sock_destroy(net->dccp.v4_ctl_sk); } +static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&dccp_hashinfo, AF_INET); +} + static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, + .exit_batch = dccp_v4_exit_batch, }; static int __init dccp_v4_init(void) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index adfc790f7193..233b57367758 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -227,7 +227,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); + err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -281,7 +281,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, NULL, 0); + ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; @@ -937,7 +937,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), - .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, @@ -958,7 +957,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), - .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, @@ -1077,9 +1075,15 @@ static void __net_exit dccp_v6_exit_net(struct net *net) inet_ctl_sock_destroy(net->dccp.v6_ctl_sk); } +static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&dccp_hashinfo, AF_INET6); +} + static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, + .exit_batch = dccp_v6_exit_batch, }; static int __init dccp_v6_init(void) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 96e47c539bee..9649238eef40 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -1,12 +1,13 @@ config HAVE_NET_DSA def_bool y - depends on NETDEVICES && !S390 + depends on INET && NETDEVICES && !S390 # Drivers must select NET_DSA and the appropriate tagging format config NET_DSA tristate "Distributed Switch Architecture" - depends on HAVE_NET_DSA && NET_SWITCHDEV + depends 
on HAVE_NET_DSA + select NET_SWITCHDEV select PHYLIB ---help--- Say Y if you want to enable support for the hardware switches supported @@ -14,17 +15,6 @@ config NET_DSA if NET_DSA -config NET_DSA_HWMON - bool "Distributed Switch Architecture HWMON support" - default y - depends on HWMON && !(NET_DSA=y && HWMON=m) - ---help--- - Say Y if you want to expose thermal sensor data on switches supported - by the Distributed Switch Architecture. - - Some of those switches contain thermal sensors. This data is available - via the hwmon sysfs interface and exposes the onboard sensors. - # tagging formats config NET_DSA_TAG_BRCM bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index a3380ed0e0be..31d343796251 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o slave.o dsa2.o +dsa_core-y += dsa.o slave.o dsa2.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 7899919cd9f0..b6d4f6a23f06 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -9,9 +9,7 @@ * (at your option) any later version. */ -#include <linux/ctype.h> #include <linux/device.h> -#include <linux/hwmon.h> #include <linux/list.h> #include <linux/platform_device.h> #include <linux/slab.h> @@ -27,8 +25,6 @@ #include <linux/gpio/consumer.h> #include "dsa_priv.h" -char dsa_driver_version[] = "0.1"; - static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -64,27 +60,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { static DEFINE_MUTEX(dsa_switch_drivers_mutex); static LIST_HEAD(dsa_switch_drivers); -void register_switch_driver(struct dsa_switch_ops *ops) +void register_switch_driver(struct dsa_switch_driver *drv) { mutex_lock(&dsa_switch_drivers_mutex); - list_add_tail(&ops->list, &dsa_switch_drivers); + list_add_tail(&drv->list, &dsa_switch_drivers); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(register_switch_driver); -void unregister_switch_driver(struct dsa_switch_ops *ops) +void unregister_switch_driver(struct dsa_switch_driver *drv) { mutex_lock(&dsa_switch_drivers_mutex); - list_del_init(&ops->list); + list_del_init(&drv->list); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(unregister_switch_driver); -static struct dsa_switch_ops * +static const struct dsa_switch_ops * dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, const char **_name, void **priv) { - struct dsa_switch_ops *ret; + const struct dsa_switch_ops *ret; struct list_head *list; const char *name; @@ -93,9 +89,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, mutex_lock(&dsa_switch_drivers_mutex); list_for_each(list, &dsa_switch_drivers) { - struct dsa_switch_ops *ops; + const struct dsa_switch_ops *ops; + struct dsa_switch_driver *drv; - ops = list_entry(list, struct dsa_switch_ops, list); + drv = list_entry(list, struct dsa_switch_driver, list); + ops = drv->ops; name = ops->probe(parent, host_dev, sw_addr, priv); if (name != NULL) { @@ -110,109 +108,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, return ret; } -/* hwmon support ************************************************************/ - -#ifdef CONFIG_NET_DSA_HWMON - -static ssize_t temp1_input_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dsa_switch *ds = dev_get_drvdata(dev); - int temp, ret; - - ret = ds->ops->get_temp(ds, &temp); - if 
(ret < 0) - return ret; - - return sprintf(buf, "%d\n", temp * 1000); -} -static DEVICE_ATTR_RO(temp1_input); - -static ssize_t temp1_max_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dsa_switch *ds = dev_get_drvdata(dev); - int temp, ret; - - ret = ds->ops->get_temp_limit(ds, &temp); - if (ret < 0) - return ret; - - return sprintf(buf, "%d\n", temp * 1000); -} - -static ssize_t temp1_max_store(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct dsa_switch *ds = dev_get_drvdata(dev); - int temp, ret; - - ret = kstrtoint(buf, 0, &temp); - if (ret < 0) - return ret; - - ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); - if (ret < 0) - return ret; - - return count; -} -static DEVICE_ATTR_RW(temp1_max); - -static ssize_t temp1_max_alarm_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dsa_switch *ds = dev_get_drvdata(dev); - bool alarm; - int ret; - - ret = ds->ops->get_temp_alarm(ds, &alarm); - if (ret < 0) - return ret; - - return sprintf(buf, "%d\n", alarm); -} -static DEVICE_ATTR_RO(temp1_max_alarm); - -static struct attribute *dsa_hwmon_attrs[] = { - &dev_attr_temp1_input.attr, /* 0 */ - &dev_attr_temp1_max.attr, /* 1 */ - &dev_attr_temp1_max_alarm.attr, /* 2 */ - NULL -}; - -static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, - struct attribute *attr, int index) -{ - struct device *dev = container_of(kobj, struct device, kobj); - struct dsa_switch *ds = dev_get_drvdata(dev); - struct dsa_switch_ops *ops = ds->ops; - umode_t mode = attr->mode; - - if (index == 1) { - if (!ops->get_temp_limit) - mode = 0; - else if (!ops->set_temp_limit) - mode &= ~S_IWUSR; - } else if (index == 2 && !ops->get_temp_alarm) { - mode = 0; - } - return mode; -} - -static const struct attribute_group dsa_hwmon_group = { - .attrs = dsa_hwmon_attrs, - .is_visible = dsa_hwmon_attrs_visible, -}; -__ATTRIBUTE_GROUPS(dsa_hwmon); - -#endif /* CONFIG_NET_DSA_HWMON */ - /* basic switch operations **************************************************/ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, - struct device_node *port_dn, int port) + struct dsa_port *dport, int port) { + struct device_node *port_dn = dport->dn; struct phy_device *phydev; int ret, mode; @@ -242,15 +142,15 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) { - struct device_node *port_dn; + struct dsa_port *dport; int ret, port; - for (port = 0; port < DSA_MAX_PORTS; port++) { + for (port = 0; port < ds->num_ports; port++) { if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - port_dn = ds->ports[port].dn; - ret = dsa_cpu_dsa_setup(ds, dev, port_dn, port); + dport = &ds->ports[port]; + ret = dsa_cpu_dsa_setup(ds, dev, dport, port); if (ret) return ret; } @@ -308,7 +208,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { - struct dsa_switch_ops *ops = ds->ops; + const struct dsa_switch_ops *ops = ds->ops; struct dsa_switch_tree *dst = ds->dst; struct dsa_chip_data *cd = ds->cd; bool valid_name_found = false; @@ -318,7 +218,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) /* * Validate supplied switch configuration. 
*/ - for (i = 0; i < DSA_MAX_PORTS; i++) { + for (i = 0; i < ds->num_ports; i++) { char *name; name = cd->port_names[i]; @@ -326,13 +226,12 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) continue; if (!strcmp(name, "cpu")) { - if (dst->cpu_switch != -1) { + if (dst->cpu_switch) { netdev_err(dst->master_netdev, "multiple cpu ports?!\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } - dst->cpu_switch = index; + dst->cpu_switch = ds; dst->cpu_port = i; ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { @@ -343,10 +242,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) valid_name_found = true; } - if (!valid_name_found && i == DSA_MAX_PORTS) { - ret = -EINVAL; - goto out; - } + if (!valid_name_found && i == ds->num_ports) + return -EINVAL; /* Make the built-in MII bus mask match the number of ports, * switch drivers can override this later @@ -358,15 +255,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * tagging protocol to the preferred tagging format of this * switch. */ - if (dst->cpu_switch == index) { + if (dst->cpu_switch == ds) { enum dsa_tag_protocol tag_protocol; tag_protocol = ops->get_tag_protocol(ds); dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) { - ret = PTR_ERR(dst->tag_ops); - goto out; - } + if (IS_ERR(dst->tag_ops)) + return PTR_ERR(dst->tag_ops); dst->rcv = dst->tag_ops->rcv; } @@ -378,85 +273,55 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) */ ret = ops->setup(ds); if (ret < 0) - goto out; + return ret; + + ret = dsa_switch_register_notifier(ds); + if (ret) + return ret; if (ops->set_addr) { ret = ops->set_addr(ds, dst->master_netdev->dev_addr); if (ret < 0) - goto out; + return ret; } if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(parent); - if (!ds->slave_mii_bus) { - ret = -ENOMEM; - goto out; - } + if (!ds->slave_mii_bus) + return -ENOMEM; dsa_slave_mii_bus_init(ds); ret = mdiobus_register(ds->slave_mii_bus); if (ret < 0) - goto out; + return ret; } /* * Create network devices for physical switch ports. */ - for (i = 0; i < DSA_MAX_PORTS; i++) { + for (i = 0; i < ds->num_ports; i++) { ds->ports[i].dn = cd->port_dn[i]; if (!(ds->enabled_port_mask & (1 << i))) continue; ret = dsa_slave_create(ds, parent, i, cd->port_names[i]); - if (ret < 0) { + if (ret < 0) netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", index, i, cd->port_names[i], ret); - ret = 0; - } } /* Perform configuration of the CPU and DSA ports */ ret = dsa_cpu_dsa_setups(ds, parent); - if (ret < 0) { + if (ret < 0) netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", index); - ret = 0; - } ret = dsa_cpu_port_ethtool_setup(ds); if (ret) return ret; -#ifdef CONFIG_NET_DSA_HWMON - /* If the switch provides a temperature sensor, - * register with hardware monitoring subsystem. - * Treat registration error as non-fatal and ignore it. 
- */ - if (ops->get_temp) { - const char *netname = netdev_name(dst->master_netdev); - char hname[IFNAMSIZ + 1]; - int i, j; - - /* Create valid hwmon 'name' attribute */ - for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) { - if (isalnum(netname[i])) - hname[j++] = netname[i]; - } - hname[j] = '\0'; - scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d", - hname, index); - ds->hwmon_dev = hwmon_device_register_with_groups(NULL, - ds->hwmon_name, ds, dsa_hwmon_groups); - if (IS_ERR(ds->hwmon_dev)) - ds->hwmon_dev = NULL; - } -#endif /* CONFIG_NET_DSA_HWMON */ - - return ret; - -out: - return ret; + return 0; } static struct dsa_switch * @@ -464,7 +329,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, struct device *parent, struct device *host_dev) { struct dsa_chip_data *cd = dst->pd->chip + index; - struct dsa_switch_ops *ops; + const struct dsa_switch_ops *ops; struct dsa_switch *ds; int ret; const char *name; @@ -486,8 +351,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Allocate and initialise switch state. */ - ds = devm_kzalloc(parent, sizeof(*ds), GFP_KERNEL); - if (ds == NULL) + ds = dsa_switch_alloc(parent, DSA_MAX_PORTS); + if (!ds) return ERR_PTR(-ENOMEM); ds->dst = dst; @@ -495,7 +360,6 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->cd = cd; ds->ops = ops; ds->priv = priv; - ds->dev = parent; ret = dsa_switch_setup_one(ds, parent); if (ret) @@ -504,8 +368,10 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, return ds; } -void dsa_cpu_dsa_destroy(struct device_node *port_dn) +void dsa_cpu_dsa_destroy(struct dsa_port *port) { + struct device_node *port_dn = port->dn; + if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); } @@ -514,13 +380,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { int port; -#ifdef CONFIG_NET_DSA_HWMON - if (ds->hwmon_dev) - hwmon_device_unregister(ds->hwmon_dev); -#endif - /* Destroy network devices for physical switch ports. 
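As dsa_cpu_dsa_destroy() above shows, a fixed link registered for a DSA or CPU port at setup time has to be deregistered on teardown. A rough sketch of that pairing, assuming the mainline of_* fixed-link helpers (the demo_* wrappers are hypothetical):

        static int demo_port_fixed_link_init(struct device_node *port_dn)
        {
                if (!of_phy_is_fixed_link(port_dn))
                        return 0;

                return of_phy_register_fixed_link(port_dn);
        }

        static void demo_port_fixed_link_exit(struct device_node *port_dn)
        {
                if (of_phy_is_fixed_link(port_dn))
                        of_phy_deregister_fixed_link(port_dn);
        }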
*/ - for (port = 0; port < DSA_MAX_PORTS; port++) { + for (port = 0; port < ds->num_ports; port++) { if (!(ds->enabled_port_mask & (1 << port))) continue; @@ -531,10 +392,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds) } /* Disable configuration of the CPU and DSA ports */ - for (port = 0; port < DSA_MAX_PORTS; port++) { + for (port = 0; port < ds->num_ports; port++) { if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - dsa_cpu_dsa_destroy(ds->ports[port].dn); + dsa_cpu_dsa_destroy(&ds->ports[port]); /* Clearing a bit which is not set does no harm */ ds->cpu_port_mask |= ~(1 << port); @@ -543,6 +404,8 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); + + dsa_switch_unregister_notifier(ds); } #ifdef CONFIG_PM_SLEEP @@ -551,7 +414,7 @@ int dsa_switch_suspend(struct dsa_switch *ds) int i, ret = 0; /* Suspend slave network devices */ - for (i = 0; i < DSA_MAX_PORTS; i++) { + for (i = 0; i < ds->num_ports; i++) { if (!dsa_is_port_initialized(ds, i)) continue; @@ -578,7 +441,7 @@ int dsa_switch_resume(struct dsa_switch *ds) return ret; /* Resume slave network devices */ - for (i = 0; i < DSA_MAX_PORTS; i++) { + for (i = 0; i < ds->num_ports; i++) { if (!dsa_is_port_initialized(ds, i)) continue; @@ -629,7 +492,7 @@ struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) } EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); -static struct net_device *dev_to_net_device(struct device *dev) +struct net_device *dsa_dev_to_net_device(struct device *dev) { struct device *d; @@ -646,6 +509,7 @@ static struct net_device *dev_to_net_device(struct device *dev) return NULL; } +EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); #ifdef CONFIG_OF static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, @@ -898,7 +762,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, dst->pd = pd; dst->master_netdev = dev; - dst->cpu_switch = -1; dst->cpu_port = -1; for (i = 0; i < pd->nr_chips; i++) { @@ -940,9 +803,6 @@ static int dsa_probe(struct platform_device *pdev) struct dsa_switch_tree *dst; int ret; - pr_notice_once("Distributed Switch Architecture driver version %s\n", - dsa_driver_version); - if (pdev->dev.of_node) { ret = dsa_of_probe(&pdev->dev); if (ret) @@ -958,7 +818,7 @@ static int dsa_probe(struct platform_device *pdev) dev = pd->of_netdev; dev_hold(dev); } else { - dev = dev_to_net_device(pd->netdev); + dev = dsa_dev_to_net_device(pd->netdev); } if (dev == NULL) { ret = -EPROBE_DEFER; @@ -1013,7 +873,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dsa_cpu_port_ethtool_restore(dst->ds[0]); + dsa_cpu_port_ethtool_restore(dst->cpu_switch); dev_put(dst->master_netdev); } @@ -1050,10 +910,6 @@ static struct packet_type dsa_pack_type __read_mostly = { .func = dsa_switch_rcv, }; -static struct notifier_block dsa_netdevice_nb __read_mostly = { - .notifier_call = dsa_slave_netdevice_event, -}; - #ifdef CONFIG_PM_SLEEP static int dsa_suspend(struct device *d) { @@ -1111,7 +967,9 @@ static int __init dsa_init_module(void) { int rc; - register_netdevice_notifier(&dsa_netdevice_nb); + rc = dsa_slave_register_notifier(); + if (rc) + return rc; rc = platform_driver_register(&dsa_driver); if (rc) @@ -1125,7 +983,7 @@ module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { - unregister_netdevice_notifier(&dsa_netdevice_nb); + dsa_slave_unregister_notifier(); dev_remove_pack(&dsa_pack_type); 
platform_driver_unregister(&dsa_driver); } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 5fff951a0a49..737be6470c7f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -57,7 +57,6 @@ static struct dsa_switch_tree *dsa_add_dst(u32 tree) if (!dst) return NULL; dst->tree = tree; - dst->cpu_switch = -1; INIT_LIST_HEAD(&dst->list); list_add_tail(&dsa_switch_trees, &dst->list); kref_init(&dst->refcount); @@ -79,47 +78,43 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst, kref_put(&dst->refcount, dsa_free_dst); } -static bool dsa_port_is_dsa(struct device_node *port) +/* For platform data configurations, we need to have a valid name argument to + * differentiate a disabled port from an enabled one + */ +static bool dsa_port_is_valid(struct dsa_port *port) { - const char *name; - - name = of_get_property(port, "label", NULL); - if (!name) - return false; + return !!(port->dn || port->name); +} - if (!strcmp(name, "dsa")) +static bool dsa_port_is_dsa(struct dsa_port *port) +{ + if (port->name && !strcmp(port->name, "dsa")) return true; - - return false; + else + return !!of_parse_phandle(port->dn, "link", 0); } -static bool dsa_port_is_cpu(struct device_node *port) +static bool dsa_port_is_cpu(struct dsa_port *port) { - const char *name; - - name = of_get_property(port, "label", NULL); - if (!name) - return false; - - if (!strcmp(name, "cpu")) + if (port->name && !strcmp(port->name, "cpu")) return true; - - return false; + else + return !!of_parse_phandle(port->dn, "ethernet", 0); } -static bool dsa_ds_find_port(struct dsa_switch *ds, - struct device_node *port) +static bool dsa_ds_find_port_dn(struct dsa_switch *ds, + struct device_node *port) { u32 index; - for (index = 0; index < DSA_MAX_PORTS; index++) + for (index = 0; index < ds->num_ports; index++) if (ds->ports[index].dn == port) return true; return false; } -static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst, - struct device_node *port) +static struct dsa_switch *dsa_dst_find_port_dn(struct dsa_switch_tree *dst, + struct device_node *port) { struct dsa_switch *ds; u32 index; @@ -129,7 +124,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst, if (!ds) continue; - if (dsa_ds_find_port(ds, port)) + if (dsa_ds_find_port_dn(ds, port)) return ds; } @@ -138,7 +133,7 @@ static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst, static int dsa_port_complete(struct dsa_switch_tree *dst, struct dsa_switch *src_ds, - struct device_node *port, + struct dsa_port *port, u32 src_port) { struct device_node *link; @@ -146,11 +141,11 @@ static int dsa_port_complete(struct dsa_switch_tree *dst, struct dsa_switch *dst_ds; for (index = 0;; index++) { - link = of_parse_phandle(port, "link", index); + link = of_parse_phandle(port->dn, "link", index); if (!link) break; - dst_ds = dsa_dst_find_port(dst, link); + dst_ds = dsa_dst_find_port_dn(dst, link); of_node_put(link); if (!dst_ds) @@ -169,13 +164,13 @@ static int dsa_port_complete(struct dsa_switch_tree *dst, */ static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) { - struct device_node *port; + struct dsa_port *port; u32 index; int err; - for (index = 0; index < DSA_MAX_PORTS; index++) { - port = ds->ports[index].dn; - if (!port) + for (index = 0; index < ds->num_ports; index++) { + port = &ds->ports[index]; + if (!dsa_port_is_valid(port)) continue; if (!dsa_port_is_dsa(port)) @@ -215,7 +210,7 @@ static int dsa_dst_complete(struct dsa_switch_tree *dst) return 0; } -static int dsa_dsa_port_apply(struct device_node 
*port, u32 index, +static int dsa_dsa_port_apply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { int err; @@ -230,13 +225,13 @@ static int dsa_dsa_port_apply(struct device_node *port, u32 index, return 0; } -static void dsa_dsa_port_unapply(struct device_node *port, u32 index, +static void dsa_dsa_port_unapply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { dsa_cpu_dsa_destroy(port); } -static int dsa_cpu_port_apply(struct device_node *port, u32 index, +static int dsa_cpu_port_apply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { int err; @@ -253,7 +248,7 @@ static int dsa_cpu_port_apply(struct device_node *port, u32 index, return 0; } -static void dsa_cpu_port_unapply(struct device_node *port, u32 index, +static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { dsa_cpu_dsa_destroy(port); @@ -261,25 +256,29 @@ static void dsa_cpu_port_unapply(struct device_node *port, u32 index, } -static int dsa_user_port_apply(struct device_node *port, u32 index, +static int dsa_user_port_apply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { - const char *name; + const char *name = port->name; int err; - name = of_get_property(port, "label", NULL); + if (port->dn) + name = of_get_property(port->dn, "label", NULL); + if (!name) + name = "eth%d"; err = dsa_slave_create(ds, ds->dev, index, name); if (err) { dev_warn(ds->dev, "Failed to create slave %d: %d\n", index, err); + ds->ports[index].netdev = NULL; return err; } return 0; } -static void dsa_user_port_unapply(struct device_node *port, u32 index, +static void dsa_user_port_unapply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { if (ds->ports[index].netdev) { @@ -291,7 +290,7 @@ static void dsa_user_port_unapply(struct device_node *port, u32 index, static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) { - struct device_node *port; + struct dsa_port *port; u32 index; int err; @@ -306,6 +305,10 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err < 0) return err; + err = dsa_switch_register_notifier(ds); + if (err) + return err; + if (ds->ops->set_addr) { err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); if (err < 0) @@ -324,9 +327,9 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) return err; } - for (index = 0; index < DSA_MAX_PORTS; index++) { - port = ds->ports[index].dn; - if (!port) + for (index = 0; index < ds->num_ports; index++) { + port = &ds->ports[index]; + if (!dsa_port_is_valid(port)) continue; if (dsa_port_is_dsa(port)) { @@ -353,12 +356,12 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) { - struct device_node *port; + struct dsa_port *port; u32 index; - for (index = 0; index < DSA_MAX_PORTS; index++) { - port = ds->ports[index].dn; - if (!port) + for (index = 0; index < ds->num_ports; index++) { + port = &ds->ports[index]; + if (!dsa_port_is_valid(port)) continue; if (dsa_port_is_dsa(port)) { @@ -376,6 +379,8 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); + + dsa_switch_unregister_notifier(ds); } static int dsa_dst_apply(struct dsa_switch_tree *dst) @@ -394,9 +399,11 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - err = dsa_cpu_port_ethtool_setup(dst->ds[0]); - if (err) - return err; + if (dst->cpu_switch) { + 
err = dsa_cpu_port_ethtool_setup(dst->cpu_switch); + if (err) + return err; + } /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get @@ -433,13 +440,14 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - dsa_cpu_port_ethtool_restore(dst->ds[0]); + if (dst->cpu_switch) + dsa_cpu_port_ethtool_restore(dst->cpu_switch); pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; } -static int dsa_cpu_parse(struct device_node *port, u32 index, +static int dsa_cpu_parse(struct dsa_port *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { @@ -447,11 +455,16 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, struct net_device *ethernet_dev; struct device_node *ethernet; - ethernet = of_parse_phandle(port, "ethernet", 0); - if (!ethernet) - return -EINVAL; + if (port->dn) { + ethernet = of_parse_phandle(port->dn, "ethernet", 0); + if (!ethernet) + return -EINVAL; + ethernet_dev = of_find_net_device_by_node(ethernet); + } else { + ethernet_dev = dsa_dev_to_net_device(ds->cd->netdev[index]); + dev_put(ethernet_dev); + } - ethernet_dev = of_find_net_device_by_node(ethernet); if (!ethernet_dev) return -EPROBE_DEFER; @@ -461,8 +474,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, if (!dst->master_netdev) dst->master_netdev = ethernet_dev; - if (dst->cpu_switch == -1) { - dst->cpu_switch = ds->index; + if (!dst->cpu_switch) { + dst->cpu_switch = ds; dst->cpu_port = index; } @@ -480,13 +493,13 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) { - struct device_node *port; + struct dsa_port *port; u32 index; int err; - for (index = 0; index < DSA_MAX_PORTS; index++) { - port = ds->ports[index].dn; - if (!port) + for (index = 0; index < ds->num_ports; index++) { + port = &ds->ports[index]; + if (!dsa_port_is_valid(port)) continue; if (dsa_port_is_cpu(port)) { @@ -538,7 +551,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) if (err) return err; - if (reg >= DSA_MAX_PORTS) + if (reg >= ds->num_ports) return -EINVAL; ds->ports[reg].dn = port; @@ -547,14 +560,41 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) * to have access to a correct value, just like what * net/dsa/dsa.c::dsa_switch_setup_one does. */ - if (!dsa_port_is_cpu(port)) + if (!dsa_port_is_cpu(&ds->ports[reg])) ds->enabled_port_mask |= 1 << reg; } return 0; } -static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index) +static int dsa_parse_ports(struct dsa_chip_data *cd, struct dsa_switch *ds) +{ + bool valid_name_found = false; + unsigned int i; + + for (i = 0; i < DSA_MAX_PORTS; i++) { + if (!cd->port_names[i]) + continue; + + ds->ports[i].name = cd->port_names[i]; + + /* Initialize enabled_port_mask now for drv->setup() + * to have access to a correct value, just like what + * net/dsa/dsa.c::dsa_switch_setup_one does. 
+ */ + if (!dsa_port_is_cpu(&ds->ports[i])) + ds->enabled_port_mask |= 1 << i; + + valid_name_found = true; + } + + if (!valid_name_found && i == DSA_MAX_PORTS) + return -EINVAL; + + return 0; +} + +static int dsa_parse_member_dn(struct device_node *np, u32 *tree, u32 *index) { int err; @@ -578,6 +618,18 @@ static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index) return 0; } +static int dsa_parse_member(struct dsa_chip_data *pd, u32 *tree, u32 *index) +{ + if (!pd) + return -ENODEV; + + /* We do not support complex trees with dsa_chip_data */ + *tree = 0; + *index = 0; + + return 0; +} + static struct device_node *dsa_get_ports(struct dsa_switch *ds, struct device_node *np) { @@ -592,23 +644,36 @@ static struct device_node *dsa_get_ports(struct dsa_switch *ds, return ports; } -static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) +static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev) { - struct device_node *ports = dsa_get_ports(ds, np); + struct dsa_chip_data *pdata = dev->platform_data; + struct device_node *np = dev->of_node; struct dsa_switch_tree *dst; + struct device_node *ports; u32 tree, index; int i, err; - err = dsa_parse_member(np, &tree, &index); - if (err) - return err; + if (np) { + err = dsa_parse_member_dn(np, &tree, &index); + if (err) + return err; - if (IS_ERR(ports)) - return PTR_ERR(ports); + ports = dsa_get_ports(ds, np); + if (IS_ERR(ports)) + return PTR_ERR(ports); - err = dsa_parse_ports_dn(ports, ds); - if (err) - return err; + err = dsa_parse_ports_dn(ports, ds); + if (err) + return err; + } else { + err = dsa_parse_member(pdata, &tree, &index); + if (err) + return err; + + err = dsa_parse_ports(pdata, ds); + if (err) + return err; + } dst = dsa_get_dst(tree); if (!dst) { @@ -624,6 +689,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) ds->dst = dst; ds->index = index; + ds->cd = pdata; /* Initialize the routing table */ for (i = 0; i < DSA_MAX_SWITCHES; ++i) @@ -647,8 +713,14 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) } err = dsa_dst_parse(dst); - if (err) + if (err) { + if (err == -EPROBE_DEFER) { + dsa_dst_del_ds(dst, ds, ds->index); + return err; + } + goto out_del_dst; + } err = dsa_dst_apply(dst); if (err) { @@ -667,12 +739,34 @@ out: return err; } -int dsa_register_switch(struct dsa_switch *ds, struct device_node *np) +struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) +{ + size_t size = sizeof(struct dsa_switch) + n * sizeof(struct dsa_port); + struct dsa_switch *ds; + int i; + + ds = devm_kzalloc(dev, size, GFP_KERNEL); + if (!ds) + return NULL; + + ds->dev = dev; + ds->num_ports = n; + + for (i = 0; i < ds->num_ports; ++i) { + ds->ports[i].index = i; + ds->ports[i].ds = ds; + } + + return ds; +} +EXPORT_SYMBOL_GPL(dsa_switch_alloc); + +int dsa_register_switch(struct dsa_switch *ds, struct device *dev) { int err; mutex_lock(&dsa2_mutex); - err = _dsa_register_switch(ds, np); + err = _dsa_register_switch(ds, dev); mutex_unlock(&dsa2_mutex); return err; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 6cfd7388834e..0706a511244e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -25,12 +25,8 @@ struct dsa_slave_priv { struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); - /* - * Which switch this port is a part of, and the port index - * for this port. - */ - struct dsa_switch *parent; - u8 port; + /* DSA port data, such as switch, port index, etc. 
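dsa_switch_alloc() above embeds the ports in the same allocation as the switch and gives every port its index plus a back-pointer, which is what lets slave code reach everything through a single struct dsa_port pointer (p->dp->index, p->dp->ds). A stripped-down sketch of that allocation pattern with illustrative demo_* names:

        struct demo_port {
                unsigned int index;
                struct demo_switch *ds;
        };

        struct demo_switch {
                struct device *dev;
                size_t num_ports;
                struct demo_port ports[];
        };

        static struct demo_switch *demo_switch_alloc(struct device *dev, size_t n)
        {
                struct demo_switch *ds;
                size_t i;

                ds = devm_kzalloc(dev, sizeof(*ds) + n * sizeof(ds->ports[0]),
                                  GFP_KERNEL);
                if (!ds)
                        return NULL;

                ds->dev = dev;
                ds->num_ports = n;
                for (i = 0; i < n; i++) {
                        ds->ports[i].index = i;
                        ds->ports[i].ds = ds;       /* back-pointer to the switch */
                }
                return ds;
        }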
*/ + struct dsa_port *dp; /* * The phylib phy_device pointer for the PHY connected @@ -42,17 +38,18 @@ struct dsa_slave_priv { int old_pause; int old_duplex; - struct net_device *bridge_dev; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif + + /* TC context */ + struct list_head mall_tc_list; }; /* dsa.c */ -extern char dsa_driver_version[]; int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, - struct device_node *port_dn, int port); -void dsa_cpu_dsa_destroy(struct device_node *port_dn); + struct dsa_port *dport, int port); +void dsa_cpu_dsa_destroy(struct dsa_port *dport); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); @@ -66,8 +63,12 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); -int dsa_slave_netdevice_event(struct notifier_block *unused, - unsigned long event, void *ptr); +int dsa_slave_register_notifier(void); +void dsa_slave_unregister_notifier(void); + +/* switch.c */ +int dsa_switch_register_notifier(struct dsa_switch *ds); +void dsa_switch_unregister_notifier(struct dsa_switch *ds); /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 68c9eea00518..c34872e1febc 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -16,12 +16,28 @@ #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> +#include <linux/list.h> #include <net/rtnetlink.h> #include <net/switchdev.h> +#include <net/pkt_cls.h> +#include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/netpoll.h> #include "dsa_priv.h" +static bool dsa_slave_dev_check(struct net_device *dev); + +static int dsa_slave_notify(struct net_device *dev, unsigned long e, void *v) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct raw_notifier_head *nh = &p->dp->ds->dst->nh; + int err; + + err = raw_notifier_call_chain(nh, e, v); + + return notifier_to_errno(err); +} + /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -61,17 +77,20 @@ static int dsa_slave_get_iflink(const struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - return p->parent->dst->master_netdev->ifindex; + return p->dp->ds->dst->master_netdev->ifindex; } -static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) +static inline bool dsa_port_is_bridged(struct dsa_port *dp) { - return !!p->bridge_dev; + return !!dp->bridge_dev; } -static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) +static void dsa_slave_set_state(struct net_device *dev, u8 state) { - struct dsa_port *dp = &ds->ports[port]; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; + struct dsa_switch *ds = dp->ds; + int port = dp->index; if (ds->ops->port_stp_state_set) ds->ops->port_stp_state_set(ds, port, state); @@ -96,9 +115,9 @@ static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->dst->master_netdev; - struct dsa_switch *ds = p->parent; - u8 stp_state = dsa_port_is_bridged(p) ? 
+ struct net_device *master = p->dp->ds->dst->master_netdev; + struct dsa_switch *ds = p->dp->ds; + u8 stp_state = dsa_port_is_bridged(p->dp) ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; int err; @@ -123,12 +142,12 @@ static int dsa_slave_open(struct net_device *dev) } if (ds->ops->port_enable) { - err = ds->ops->port_enable(ds, p->port, p->phy); + err = ds->ops->port_enable(ds, p->dp->index, p->phy); if (err) goto clear_promisc; } - dsa_port_set_stp_state(ds, p->port, stp_state); + dsa_slave_set_state(dev, stp_state); if (p->phy) phy_start(p->phy); @@ -151,8 +170,8 @@ out: static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->dst->master_netdev; - struct dsa_switch *ds = p->parent; + struct net_device *master = p->dp->ds->dst->master_netdev; + struct dsa_switch *ds = p->dp->ds; if (p->phy) phy_stop(p->phy); @@ -168,9 +187,9 @@ static int dsa_slave_close(struct net_device *dev) dev_uc_del(master, dev->dev_addr); if (ds->ops->port_disable) - ds->ops->port_disable(ds, p->port, p->phy); + ds->ops->port_disable(ds, p->dp->index, p->phy); - dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED); + dsa_slave_set_state(dev, BR_STATE_DISABLED); return 0; } @@ -178,7 +197,7 @@ static int dsa_slave_close(struct net_device *dev) static void dsa_slave_change_rx_flags(struct net_device *dev, int change) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->dst->master_netdev; + struct net_device *master = p->dp->ds->dst->master_netdev; if (change & IFF_ALLMULTI) dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); @@ -189,7 +208,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change) static void dsa_slave_set_rx_mode(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->dst->master_netdev; + struct net_device *master = p->dp->ds->dst->master_netdev; dev_mc_sync(master, dev); dev_uc_sync(master, dev); @@ -198,7 +217,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev) static int dsa_slave_set_mac_address(struct net_device *dev, void *a) { struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = p->parent->dst->master_netdev; + struct net_device *master = p->dp->ds->dst->master_netdev; struct sockaddr *addr = a; int err; @@ -228,16 +247,17 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_port *dp = p->dp; + struct dsa_switch *ds = dp->ds; if (switchdev_trans_ph_prepare(trans)) { if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) return -EOPNOTSUPP; - return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans); + return ds->ops->port_vlan_prepare(ds, dp->index, vlan, trans); } - ds->ops->port_vlan_add(ds, p->port, vlan, trans); + ds->ops->port_vlan_add(ds, dp->index, vlan, trans); return 0; } @@ -246,12 +266,12 @@ static int dsa_slave_port_vlan_del(struct net_device *dev, const struct switchdev_obj_port_vlan *vlan) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (!ds->ops->port_vlan_del) return -EOPNOTSUPP; - return ds->ops->port_vlan_del(ds, p->port, vlan); + return ds->ops->port_vlan_del(ds, p->dp->index, vlan); } static int dsa_slave_port_vlan_dump(struct net_device *dev, @@ -259,10 +279,10 @@ static int dsa_slave_port_vlan_dump(struct net_device 
*dev, switchdev_obj_dump_cb_t *cb) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->port_vlan_dump) - return ds->ops->port_vlan_dump(ds, p->port, vlan, cb); + return ds->ops->port_vlan_dump(ds, p->dp->index, vlan, cb); return -EOPNOTSUPP; } @@ -272,16 +292,16 @@ static int dsa_slave_port_fdb_add(struct net_device *dev, struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (switchdev_trans_ph_prepare(trans)) { if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans); + return ds->ops->port_fdb_prepare(ds, p->dp->index, fdb, trans); } - ds->ops->port_fdb_add(ds, p->port, fdb, trans); + ds->ops->port_fdb_add(ds, p->dp->index, fdb, trans); return 0; } @@ -290,11 +310,11 @@ static int dsa_slave_port_fdb_del(struct net_device *dev, const struct switchdev_obj_port_fdb *fdb) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; int ret = -EOPNOTSUPP; if (ds->ops->port_fdb_del) - ret = ds->ops->port_fdb_del(ds, p->port, fdb); + ret = ds->ops->port_fdb_del(ds, p->dp->index, fdb); return ret; } @@ -304,10 +324,10 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev, switchdev_obj_dump_cb_t *cb) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->port_fdb_dump) - return ds->ops->port_fdb_dump(ds, p->port, fdb, cb); + return ds->ops->port_fdb_dump(ds, p->dp->index, fdb, cb); return -EOPNOTSUPP; } @@ -317,16 +337,16 @@ static int dsa_slave_port_mdb_add(struct net_device *dev, struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (switchdev_trans_ph_prepare(trans)) { if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) return -EOPNOTSUPP; - return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans); + return ds->ops->port_mdb_prepare(ds, p->dp->index, mdb, trans); } - ds->ops->port_mdb_add(ds, p->port, mdb, trans); + ds->ops->port_mdb_add(ds, p->dp->index, mdb, trans); return 0; } @@ -335,10 +355,10 @@ static int dsa_slave_port_mdb_del(struct net_device *dev, const struct switchdev_obj_port_mdb *mdb) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->port_mdb_del) - return ds->ops->port_mdb_del(ds, p->port, mdb); + return ds->ops->port_mdb_del(ds, p->dp->index, mdb); return -EOPNOTSUPP; } @@ -348,10 +368,10 @@ static int dsa_slave_port_mdb_dump(struct net_device *dev, switchdev_obj_dump_cb_t *cb) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->port_mdb_dump) - return ds->ops->port_mdb_dump(ds, p->port, mdb, cb); + return ds->ops->port_mdb_dump(ds, p->dp->index, mdb, cb); return -EOPNOTSUPP; } @@ -371,12 +391,12 @@ static int dsa_slave_stp_state_set(struct net_device *dev, struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (switchdev_trans_ph_prepare(trans)) return ds->ops->port_stp_state_set ? 
0 : -EOPNOTSUPP; - dsa_port_set_stp_state(ds, p->port, attr->u.stp_state); + dsa_slave_set_state(dev, attr->u.stp_state); return 0; } @@ -386,14 +406,14 @@ static int dsa_slave_vlan_filtering(struct net_device *dev, struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ if (switchdev_trans_ph_prepare(trans)) return 0; if (ds->ops->port_vlan_filtering) - return ds->ops->port_vlan_filtering(ds, p->port, + return ds->ops->port_vlan_filtering(ds, p->dp->index, attr->u.vlan_filtering); return 0; @@ -404,7 +424,7 @@ static int dsa_fastest_ageing_time(struct dsa_switch *ds, { int i; - for (i = 0; i < DSA_MAX_PORTS; ++i) { + for (i = 0; i < ds->num_ports; ++i) { struct dsa_port *dp = &ds->ports[i]; if (dp && dp->ageing_time && dp->ageing_time < ageing_time) @@ -419,7 +439,7 @@ static int dsa_slave_ageing_time(struct net_device *dev, struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); @@ -428,7 +448,7 @@ static int dsa_slave_ageing_time(struct net_device *dev, return 0; /* Keep the fastest ageing time in case of multiple bridges */ - ds->ports[p->port].ageing_time = ageing_time; + p->dp->ageing_time = ageing_time; ageing_time = dsa_fastest_ageing_time(ds, ageing_time); if (ds->ops->set_ageing_time) @@ -553,39 +573,58 @@ static int dsa_slave_bridge_port_join(struct net_device *dev, struct net_device *br) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; + struct dsa_notifier_bridge_info info = { + .sw_index = p->dp->ds->index, + .port = p->dp->index, + .br = br, + }; + int err; + + /* Here the port is already bridged. Reflect the current configuration + * so that drivers can program their chips accordingly. + */ + p->dp->bridge_dev = br; - p->bridge_dev = br; + err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_JOIN, &info); - if (ds->ops->port_bridge_join) - ret = ds->ops->port_bridge_join(ds, p->port, br); + /* The bridging is rolled back on error */ + if (err) + p->dp->bridge_dev = NULL; - return ret == -EOPNOTSUPP ? 0 : ret; + return err; } -static void dsa_slave_bridge_port_leave(struct net_device *dev) +static void dsa_slave_bridge_port_leave(struct net_device *dev, + struct net_device *br) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; - + struct dsa_notifier_bridge_info info = { + .sw_index = p->dp->ds->index, + .port = p->dp->index, + .br = br, + }; + int err; - if (ds->ops->port_bridge_leave) - ds->ops->port_bridge_leave(ds, p->port); + /* Here the port is already unbridged. Reflect the current configuration + * so that drivers can program their chips accordingly. 
+ */ + p->dp->bridge_dev = NULL; - p->bridge_dev = NULL; + err = dsa_slave_notify(dev, DSA_NOTIFIER_BRIDGE_LEAVE, &info); + if (err) + netdev_err(dev, "failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING); + dsa_slave_set_state(dev, BR_STATE_FORWARDING); } static int dsa_slave_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: @@ -633,7 +672,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ - nskb->dev = p->parent->dst->master_netdev; + nskb->dev = p->dp->ds->dst->master_netdev; dev_queue_xmit(nskb); return NETDEV_TX_OK; @@ -645,14 +684,10 @@ dsa_slave_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct dsa_slave_priv *p = netdev_priv(dev); - int err; + int err = -EOPNOTSUPP; - err = -EOPNOTSUPP; - if (p->phy != NULL) { - err = phy_read_status(p->phy); - if (err == 0) - err = phy_ethtool_ksettings_get(p->phy, cmd); - } + if (p->phy != NULL) + err = phy_ethtool_ksettings_get(p->phy, cmd); return err; } @@ -673,7 +708,6 @@ static void dsa_slave_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); - strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version)); strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } @@ -681,10 +715,10 @@ static void dsa_slave_get_drvinfo(struct net_device *dev, static int dsa_slave_get_regs_len(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->get_regs_len) - return ds->ops->get_regs_len(ds, p->port); + return ds->ops->get_regs_len(ds, p->dp->index); return -EOPNOTSUPP; } @@ -693,10 +727,10 @@ static void dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->get_regs) - ds->ops->get_regs(ds, p->port, regs, _p); + ds->ops->get_regs(ds, p->dp->index, regs, _p); } static int dsa_slave_nway_reset(struct net_device *dev) @@ -724,7 +758,7 @@ static u32 dsa_slave_get_link(struct net_device *dev) static int dsa_slave_get_eeprom_len(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; @@ -739,7 +773,7 @@ static int dsa_slave_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); @@ -751,7 +785,7 @@ static int dsa_slave_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->set_eeprom) 
return ds->ops->set_eeprom(ds, eeprom, data); @@ -763,7 +797,7 @@ static void dsa_slave_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; @@ -773,7 +807,7 @@ static void dsa_slave_get_strings(struct net_device *dev, strncpy(data + 2 * len, "rx_packets", len); strncpy(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) - ds->ops->get_strings(ds, p->port, data + 4 * len); + ds->ops->get_strings(ds, p->dp->index, data + 4 * len); } } @@ -782,7 +816,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, uint64_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->ds[0]; + struct dsa_switch *ds = dst->cpu_switch; s8 cpu_port = dst->cpu_port; int count = 0; @@ -799,7 +833,7 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->ds[0]; + struct dsa_switch *ds = dst->cpu_switch; int count = 0; if (dst->master_ethtool_ops.get_sset_count) @@ -815,7 +849,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds = dst->ds[0]; + struct dsa_switch *ds = dst->cpu_switch; s8 cpu_port = dst->cpu_port; int len = ETH_GSTRING_LEN; int mcount = 0, count; @@ -854,20 +888,20 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, uint64_t *data) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; data[0] = dev->stats.tx_packets; data[1] = dev->stats.tx_bytes; data[2] = dev->stats.rx_packets; data[3] = dev->stats.rx_bytes; if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, p->port, data + 4); + ds->ops->get_ethtool_stats(ds, p->dp->index, data + 4); } static int dsa_slave_get_sset_count(struct net_device *dev, int sset) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (sset == ETH_SS_STATS) { int count; @@ -885,20 +919,20 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; if (ds->ops->get_wol) - ds->ops->get_wol(ds, p->port, w); + ds->ops->get_wol(ds, p->dp->index, w); } static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; int ret = -EOPNOTSUPP; if (ds->ops->set_wol) - ret = ds->ops->set_wol(ds, p->port, w); + ret = ds->ops->set_wol(ds, p->dp->index, w); return ret; } @@ -906,13 +940,13 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; int ret; if (!ds->ops->set_eee) return -EOPNOTSUPP; - ret = ds->ops->set_eee(ds, p->port, p->phy, e); + ret = ds->ops->set_eee(ds, p->dp->index, p->phy, e); if (ret) return ret; @@ -925,13 +959,13 
@@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; int ret; if (!ds->ops->get_eee) return -EOPNOTSUPP; - ret = ds->ops->get_eee(ds, p->port, e); + ret = ds->ops->get_eee(ds, p->dp->index, e); if (ret) return ret; @@ -946,7 +980,7 @@ static int dsa_slave_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; struct net_device *master = ds->dst->master_netdev; struct netpoll *netpoll; int err = 0; @@ -984,6 +1018,144 @@ static void dsa_slave_poll_controller(struct net_device *dev) } #endif +static int dsa_slave_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + if (snprintf(name, len, "p%d", p->dp->index) >= len) + return -EINVAL; + + return 0; +} + +static struct dsa_mall_tc_entry * +dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p, + unsigned long cookie) +{ + struct dsa_mall_tc_entry *mall_tc_entry; + + list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) + if (mall_tc_entry->cookie == cookie) + return mall_tc_entry; + + return NULL; +} + +static int dsa_slave_add_cls_matchall(struct net_device *dev, + __be16 protocol, + struct tc_cls_matchall_offload *cls, + bool ingress) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_mall_tc_entry *mall_tc_entry; + struct dsa_switch *ds = p->dp->ds; + struct net *net = dev_net(dev); + struct dsa_slave_priv *to_p; + struct net_device *to_dev; + const struct tc_action *a; + int err = -EOPNOTSUPP; + LIST_HEAD(actions); + int ifindex; + + if (!ds->ops->port_mirror_add) + return err; + + if (!tc_single_action(cls->exts)) + return err; + + tcf_exts_to_list(cls->exts, &actions); + a = list_first_entry(&actions, struct tc_action, list); + + if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) { + struct dsa_mall_mirror_tc_entry *mirror; + + ifindex = tcf_mirred_ifindex(a); + to_dev = __dev_get_by_index(net, ifindex); + if (!to_dev) + return -EINVAL; + + if (!dsa_slave_dev_check(to_dev)) + return -EOPNOTSUPP; + + mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); + if (!mall_tc_entry) + return -ENOMEM; + + mall_tc_entry->cookie = cls->cookie; + mall_tc_entry->type = DSA_PORT_MALL_MIRROR; + mirror = &mall_tc_entry->mirror; + + to_p = netdev_priv(to_dev); + + mirror->to_local_port = to_p->dp->index; + mirror->ingress = ingress; + + err = ds->ops->port_mirror_add(ds, p->dp->index, mirror, + ingress); + if (err) { + kfree(mall_tc_entry); + return err; + } + + list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); + } + + return 0; +} + +static void dsa_slave_del_cls_matchall(struct net_device *dev, + struct tc_cls_matchall_offload *cls) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_mall_tc_entry *mall_tc_entry; + struct dsa_switch *ds = p->dp->ds; + + if (!ds->ops->port_mirror_del) + return; + + mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie); + if (!mall_tc_entry) + return; + + list_del(&mall_tc_entry->list); + + switch (mall_tc_entry->type) { + case DSA_PORT_MALL_MIRROR: + ds->ops->port_mirror_del(ds, p->dp->index, + &mall_tc_entry->mirror); + break; + default: + WARN_ON(1); + } + + kfree(mall_tc_entry); +} + +static int dsa_slave_setup_tc(struct net_device *dev, 
u32 handle, + __be16 protocol, struct tc_to_netdev *tc) +{ + bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS); + int ret = -EOPNOTSUPP; + + switch (tc->type) { + case TC_SETUP_MATCHALL: + switch (tc->cls_mall->command) { + case TC_CLSMATCHALL_REPLACE: + return dsa_slave_add_cls_matchall(dev, protocol, + tc->cls_mall, + ingress); + case TC_CLSMATCHALL_DESTROY: + dsa_slave_del_cls_matchall(dev, tc->cls_mall); + return 0; + } + default: + break; + } + + return ret; +} + void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) { ops->get_sset_count = dsa_cpu_port_get_sset_count; @@ -991,6 +1163,30 @@ void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) ops->get_strings = dsa_cpu_port_get_strings; } +static int dsa_slave_get_rxnfc(struct net_device *dev, + struct ethtool_rxnfc *nfc, u32 *rule_locs) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->dp->ds; + + if (!ds->ops->get_rxnfc) + return -EOPNOTSUPP; + + return ds->ops->get_rxnfc(ds, p->dp->index, nfc, rule_locs); +} + +static int dsa_slave_set_rxnfc(struct net_device *dev, + struct ethtool_rxnfc *nfc) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->dp->ds; + + if (!ds->ops->set_rxnfc) + return -EOPNOTSUPP; + + return ds->ops->set_rxnfc(ds, p->dp->index, nfc); +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, @@ -1009,6 +1205,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_rxnfc = dsa_slave_get_rxnfc, + .set_rxnfc = dsa_slave_set_rxnfc, }; static const struct net_device_ops dsa_slave_netdev_ops = { @@ -1031,6 +1229,8 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_bridge_getlink = switchdev_port_bridge_getlink, .ndo_bridge_setlink = switchdev_port_bridge_setlink, .ndo_bridge_dellink = switchdev_port_bridge_dellink, + .ndo_get_phys_port_name = dsa_slave_get_phys_port_name, + .ndo_setup_tc = dsa_slave_setup_tc, }; static const struct switchdev_ops dsa_slave_switchdev_ops = { @@ -1048,7 +1248,7 @@ static struct device_type dsa_type = { static void dsa_slave_adjust_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; unsigned int status_changed = 0; if (p->old_link != p->phy->link) { @@ -1067,7 +1267,7 @@ static void dsa_slave_adjust_link(struct net_device *dev) } if (ds->ops->adjust_link && status_changed) - ds->ops->adjust_link(ds, p->port, p->phy); + ds->ops->adjust_link(ds, p->dp->index, p->phy); if (status_changed) phy_print_status(p->phy); @@ -1081,9 +1281,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, if (dev) { p = netdev_priv(dev); - ds = p->parent; + ds = p->dp->ds; if (ds->ops->fixed_link_update) - ds->ops->fixed_link_update(ds, p->port, status); + ds->ops->fixed_link_update(ds, p->dp->index, status); } return 0; @@ -1094,7 +1294,7 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, struct net_device *slave_dev, int addr) { - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); if (!p->phy) { @@ -1105,22 +1305,20 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) p->phy_interface = p->phy->interface; - 
phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, - p->phy_interface); - - return 0; + return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, + p->phy_interface); } static int dsa_slave_phy_setup(struct dsa_slave_priv *p, struct net_device *slave_dev) { - struct dsa_switch *ds = p->parent; + struct dsa_switch *ds = p->dp->ds; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; u32 phy_flags = 0; int mode, ret; - port_dn = ds->ports[p->port].dn; + port_dn = p->dp->dn; mode = of_get_phy_mode(port_dn); if (mode < 0) mode = PHY_INTERFACE_MODE_NA; @@ -1141,7 +1339,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, } if (ds->ops->get_phy_flags) - phy_flags = ds->ops->get_phy_flags(ds, p->port); + phy_flags = ds->ops->get_phy_flags(ds, p->dp->index); if (phy_dn) { int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); @@ -1176,9 +1374,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, * MDIO bus instead */ if (!p->phy) { - ret = dsa_slave_phy_connect(p, slave_dev, p->port); + ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index); if (ret) { - netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret); + netdev_err(slave_dev, "failed to connect to port %d: %d\n", + p->dp->index, ret); if (phy_is_fixed) of_phy_deregister_fixed_link(port_dn); return ret; @@ -1203,6 +1402,8 @@ int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); + netif_device_detach(slave_dev); + if (p->phy) { phy_stop(p->phy); p->old_pause = -1; @@ -1246,7 +1447,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, if (slave_dev == NULL) return -ENOMEM; - slave_dev->features = master->vlan_features; + slave_dev->features = master->vlan_features | NETIF_F_HW_TC; + slave_dev->hw_features |= NETIF_F_HW_TC; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; @@ -1264,8 +1466,8 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); - p->parent = ds; - p->port = port; + p->dp = &ds->ports[port]; + INIT_LIST_HEAD(&p->mall_tc_list); p->xmit = dst->tag_ops->xmit; p->old_pause = -1; @@ -1298,10 +1500,9 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, void dsa_slave_destroy(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct dsa_switch *ds = p->parent; struct device_node *port_dn; - port_dn = ds->ports[p->port].dn; + port_dn = p->dp->dn; netif_carrier_off(slave_dev); if (p->phy) { @@ -1319,46 +1520,52 @@ static bool dsa_slave_dev_check(struct net_device *dev) return dev->netdev_ops == &dsa_slave_netdev_ops; } -static int dsa_slave_port_upper_event(struct net_device *dev, - unsigned long event, void *ptr) +static int dsa_slave_changeupper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) { - struct netdev_notifier_changeupper_info *info = ptr; - struct net_device *upper = info->upper_dev; - int err = 0; + int err = NOTIFY_DONE; - switch (event) { - case NETDEV_CHANGEUPPER: - if (netif_is_bridge_master(upper)) { - if (info->linking) - err = dsa_slave_bridge_port_join(dev, upper); - else - dsa_slave_bridge_port_leave(dev); + if (netif_is_bridge_master(info->upper_dev)) { + if (info->linking) { + err = dsa_slave_bridge_port_join(dev, info->upper_dev); + err = notifier_from_errno(err); + } else { + dsa_slave_bridge_port_leave(dev, info->upper_dev); + 
err = NOTIFY_OK; } - - break; } - return notifier_from_errno(err); + return err; } -static int dsa_slave_port_event(struct net_device *dev, unsigned long event, - void *ptr) +static int dsa_slave_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) { - switch (event) { - case NETDEV_CHANGEUPPER: - return dsa_slave_port_upper_event(dev, event, ptr); - } + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dev->netdev_ops != &dsa_slave_netdev_ops) + return NOTIFY_DONE; + + if (event == NETDEV_CHANGEUPPER) + return dsa_slave_changeupper(dev, ptr); return NOTIFY_DONE; } -int dsa_slave_netdevice_event(struct notifier_block *unused, - unsigned long event, void *ptr) +static struct notifier_block dsa_slave_nb __read_mostly = { + .notifier_call = dsa_slave_netdevice_event, +}; + +int dsa_slave_register_notifier(void) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); + return register_netdevice_notifier(&dsa_slave_nb); +} - if (dsa_slave_dev_check(dev)) - return dsa_slave_port_event(dev, event, ptr); +void dsa_slave_unregister_notifier(void) +{ + int err; - return NOTIFY_DONE; + err = unregister_netdevice_notifier(&dsa_slave_nb); + if (err) + pr_err("DSA: failed to unregister slave notifier (%d)\n", err); } diff --git a/net/dsa/switch.c b/net/dsa/switch.c new file mode 100644 index 000000000000..6456dacf9ae9 --- /dev/null +++ b/net/dsa/switch.c @@ -0,0 +1,85 @@ +/* + * Handling of a single switch chip, part of a switch fabric + * + * Copyright (c) 2017 Vivien Didelot <vivien.didelot@savoirfairelinux.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <net/dsa.h> + +static int dsa_switch_bridge_join(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_bridge_join) + return ds->ops->port_bridge_join(ds, info->port, info->br); + + if (ds->index != info->sw_index) + dev_dbg(ds->dev, "crosschip DSA port %d.%d bridged to %s\n", + info->sw_index, info->port, netdev_name(info->br)); + + return 0; +} + +static int dsa_switch_bridge_leave(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info) +{ + if (ds->index == info->sw_index && ds->ops->port_bridge_leave) + ds->ops->port_bridge_leave(ds, info->port, info->br); + + if (ds->index != info->sw_index) + dev_dbg(ds->dev, "crosschip DSA port %d.%d unbridged from %s\n", + info->sw_index, info->port, netdev_name(info->br)); + + return 0; +} + +static int dsa_switch_event(struct notifier_block *nb, + unsigned long event, void *info) +{ + struct dsa_switch *ds = container_of(nb, struct dsa_switch, nb); + int err; + + switch (event) { + case DSA_NOTIFIER_BRIDGE_JOIN: + err = dsa_switch_bridge_join(ds, info); + break; + case DSA_NOTIFIER_BRIDGE_LEAVE: + err = dsa_switch_bridge_leave(ds, info); + break; + default: + err = -EOPNOTSUPP; + break; + } + + /* Non-switchdev operations cannot be rolled back. If a DSA driver + * returns an error during the chained call, switch chips may be in an + * inconsistent state. 
+ */ + if (err) + dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n", + event, err); + + return notifier_from_errno(err); +} + +int dsa_switch_register_notifier(struct dsa_switch *ds) +{ + ds->nb.notifier_call = dsa_switch_event; + + return raw_notifier_chain_register(&ds->dst->nh, &ds->nb); +} + +void dsa_switch_unregister_notifier(struct dsa_switch *ds) +{ + int err; + + err = raw_notifier_chain_unregister(&ds->dst->nh, &ds->nb); + if (err) + dev_err(ds->dev, "failed to unregister notifier (%d)\n", err); +} diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 21bffde6e4bf..5d925b6b2bb1 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -80,9 +80,9 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK); brcm_tag[1] = 0; brcm_tag[2] = 0; - if (p->port == 8) + if (p->dp->index == 8) brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; - brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; + brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK; return skb; @@ -102,7 +102,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(dst == NULL)) goto out_drop; - ds = dst->ds[0]; + ds = dst->cpu_switch; skb = skb_unshare(skb, GFP_ATOMIC); if (skb == NULL) @@ -121,13 +121,14 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* We should never see a reserved reason code without knowing how to * handle it */ - WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD); + if (unlikely(brcm_tag[2] & BRCM_EG_RC_RSVD)) + goto out_drop; /* Locate which port this is coming from */ source_port = brcm_tag[3] & BRCM_EG_PID_MASK; /* Validate port against switch setup, either the port is totally */ - if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) + if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) goto out_drop; /* Remove Broadcom tag and update checksum */ diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index bce79ffe342b..72579ceea381 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -33,8 +33,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct tagged FROM_CPU DSA tag from 802.1q tag. */ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x60 | p->parent->index; - dsa_header[1] = p->port << 3; + dsa_header[0] = 0x60 | p->dp->ds->index; + dsa_header[1] = p->dp->index << 3; /* * Move CFI field from byte 2 to byte 1. @@ -54,8 +54,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct untagged FROM_CPU DSA tag. 
*/ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x40 | p->parent->index; - dsa_header[1] = p->port << 3; + dsa_header[0] = 0x40 | p->dp->ds->index; + dsa_header[1] = p->dp->index << 3; dsa_header[2] = 0x00; dsa_header[3] = 0x00; } @@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, if (!ds) goto out_drop; - if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) + if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) goto out_drop; /* diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 6c1720e88537..648c051817a1 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -42,8 +42,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x60 | p->parent->index; - edsa_header[5] = p->port << 3; + edsa_header[4] = 0x60 | p->dp->ds->index; + edsa_header[5] = p->dp->index << 3; /* * Move CFI field from byte 6 to byte 5. @@ -67,8 +67,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x40 | p->parent->index; - edsa_header[5] = p->port << 3; + edsa_header[4] = 0x40 | p->dp->ds->index; + edsa_header[5] = p->dp->index << 3; edsa_header[6] = 0x00; edsa_header[7] = 0x00; } @@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, if (!ds) goto out_drop; - if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) + if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) goto out_drop; /* diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 0c90cacee7aa..30240f343aea 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -54,7 +54,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) /* Set the version field, and set destination port information */ hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | QCA_HDR_XMIT_FROM_CPU | - BIT(p->port); + BIT(p->dp->index); *phdr = htons(hdr); @@ -104,7 +104,7 @@ static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* This protocol doesn't support cascading multiple switches so it's * safe to assume the switch is first in the tree */ - ds = dst->ds[0]; + ds = dst->cpu_switch; if (!ds) goto out_drop; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 5e3903eb1afa..26f977176978 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -50,7 +50,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) trailer = skb_put(nskb, 4); trailer[0] = 0x80; - trailer[1] = 1 << p->port; + trailer[1] = 1 << p->dp->index; trailer[2] = 0x10; trailer[3] = 0x00; @@ -67,7 +67,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(dst == NULL)) goto out_drop; - ds = dst->ds[0]; + ds = dst->cpu_switch; skb = skb_unshare(skb, GFP_ATOMIC); if (skb == NULL) @@ -82,7 +82,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, goto out_drop; source_port = trailer[1] & 7; - if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) + if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) goto out_drop; pskb_trim_rcsum(skb, skb->len - 4); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 8c5a479681ca..1446810047f5 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -356,6 +356,7 @@ void ether_setup(struct net_device *dev) 
dev->header_ops = &eth_header_ops; dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; + dev->min_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_DATA_LEN; @@ -392,6 +393,34 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, } EXPORT_SYMBOL(alloc_etherdev_mqs); +static void devm_free_netdev(struct device *dev, void *res) +{ + free_netdev(*(struct net_device **)res); +} + +struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, + unsigned int txqs, unsigned int rxqs) +{ + struct net_device **dr; + struct net_device *netdev; + + dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); + if (!dr) + return NULL; + + netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); + if (!netdev) { + devres_free(dr); + return NULL; + } + + *dr = netdev; + devres_add(dev, dr); + + return netdev; +} +EXPORT_SYMBOL(devm_alloc_etherdev_mqs); + ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); @@ -446,7 +475,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index fc65b145f6e7..c73160fb11e7 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -395,7 +395,7 @@ static struct device_type hsr_type = { void hsr_dev_setup(struct net_device *dev) { - random_ether_addr(dev->dev_addr); + eth_hw_addr_random(dev); ether_setup(dev); dev->min_mtu = 0; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index f5b60388d02f..56080da4aa77 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -12,6 +12,7 @@ #include "hsr_slave.h" #include <linux/etherdevice.h> #include <linux/if_arp.h> +#include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_forward.h" @@ -81,7 +82,7 @@ static int hsr_check_dev_ok(struct net_device *dev) return -EINVAL; } - if (dev->priv_flags & IFF_802_1Q_VLAN) { + if (is_vlan_dev(dev)) { netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n"); return -EINVAL; } diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index e0bd013a1e5e..eedba7670b51 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -279,7 +279,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { - pr_debug("size = %Zu, mtu = %u\n", size, mtu); + pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } @@ -645,7 +645,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { - pr_debug("size = %Zu, mtu = %u\n", size, mtu); + pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } diff --git a/net/ife/Kconfig b/net/ife/Kconfig new file mode 100644 index 000000000000..31e48b652c7c --- /dev/null +++ b/net/ife/Kconfig @@ -0,0 +1,16 @@ +# +# IFE subsystem configuration +# + +menuconfig NET_IFE + depends on NET + tristate "Inter-FE based on IETF ForCES InterFE LFB" + default n + help + Say Y here to add support of IFE encapsulation protocol + For details refer to netdev01 paper: + "Distributing Linux Traffic Control Classifier-Action Subsystem" + Authors: Jamal Hadi Salim and Damascene M.
Joachimpillai + + To compile this support as a module, choose M here: the module will + be called ife. diff --git a/net/ife/Makefile b/net/ife/Makefile new file mode 100644 index 000000000000..2a90d97746cc --- /dev/null +++ b/net/ife/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the IFE encapsulation protocol +# + +obj-$(CONFIG_NET_IFE) += ife.o diff --git a/net/ife/ife.c b/net/ife/ife.c new file mode 100644 index 000000000000..f360341c72eb --- /dev/null +++ b/net/ife/ife.c @@ -0,0 +1,142 @@ +/* + * net/ife/ife.c - Inter-FE protocol based on ForCES WG InterFE LFB + * Copyright (c) 2015 Jamal Hadi Salim <jhs@mojatatu.com> + * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> + * + * Refer to: draft-ietf-forces-interfelfb-03 and netdev01 paper: + * "Distributing Linux Traffic Control Classifier-Action Subsystem" + * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/net_namespace.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <linux/etherdevice.h> +#include <net/ife.h> + +struct ifeheadr { + __be16 metalen; + u8 tlv_data[]; +}; + +void *ife_encode(struct sk_buff *skb, u16 metalen) +{ + /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA + * where ORIGDATA = original ethernet header ... + */ + int hdrm = metalen + IFE_METAHDRLEN; + int total_push = hdrm + skb->dev->hard_header_len; + struct ifeheadr *ifehdr; + struct ethhdr *iethh; /* inner ether header */ + int skboff = 0; + int err; + + err = skb_cow_head(skb, total_push); + if (unlikely(err)) + return NULL; + + iethh = (struct ethhdr *) skb->data; + + __skb_push(skb, total_push); + memcpy(skb->data, iethh, skb->dev->hard_header_len); + skb_reset_mac_header(skb); + skboff += skb->dev->hard_header_len; + + /* total metadata length */ + ifehdr = (struct ifeheadr *) (skb->data + skboff); + metalen += IFE_METAHDRLEN; + ifehdr->metalen = htons(metalen); + + return ifehdr->tlv_data; +} +EXPORT_SYMBOL_GPL(ife_encode); + +void *ife_decode(struct sk_buff *skb, u16 *metalen) +{ + struct ifeheadr *ifehdr; + int total_pull; + u16 ifehdrln; + + ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len); + ifehdrln = ntohs(ifehdr->metalen); + total_pull = skb->dev->hard_header_len + ifehdrln; + + if (unlikely(ifehdrln < 2)) + return NULL; + + if (unlikely(!pskb_may_pull(skb, total_pull))) + return NULL; + + skb_set_mac_header(skb, total_pull); + __skb_pull(skb, total_pull); + *metalen = ifehdrln - IFE_METAHDRLEN; + + return &ifehdr->tlv_data; +} +EXPORT_SYMBOL_GPL(ife_decode); + +struct meta_tlvhdr { + __be16 type; + __be16 len; +}; + +/* Caller takes care of presenting data in network order + */ +void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen) +{ + struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata; + + *dlen = ntohs(tlv->len) - NLA_HDRLEN; + *attrtype = ntohs(tlv->type); + + if (totlen) + *totlen = nla_total_size(*dlen); + + return skbdata + sizeof(struct meta_tlvhdr); +} +EXPORT_SYMBOL_GPL(ife_tlv_meta_decode); + +void *ife_tlv_meta_next(void *skbdata) +{ + struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata; + u16 tlvlen = ntohs(tlv->len); + + tlvlen = 
NLA_ALIGN(tlvlen); + + return skbdata + tlvlen; +} +EXPORT_SYMBOL_GPL(ife_tlv_meta_next); + +/* Caller takes care of presenting data in network order + */ +int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) +{ + __be32 *tlv = (__be32 *) (skbdata); + u16 totlen = nla_total_size(dlen); /*alignment + hdr */ + char *dptr = (char *) tlv + NLA_HDRLEN; + u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN); + + *tlv = htonl(htlv); + memset(dptr, 0, totlen - NLA_HDRLEN); + memcpy(dptr, dval, dlen); + + return totlen; +} +EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); + +MODULE_AUTHOR("Jamal Hadi Salim <jhs@mojatatu.com>"); +MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>"); +MODULE_DESCRIPTION("Inter-FE LFB action"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6e7baaf814c6..91a2557942fa 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -187,6 +187,7 @@ config NET_IPGRE_DEMUX config NET_IP_TUNNEL tristate select DST_CACHE + select GRO_CELLS default n config NET_IPGRE @@ -360,6 +361,19 @@ config INET_ESP If unsure, say Y. +config INET_ESP_OFFLOAD + tristate "IP: ESP transformation offload" + depends on INET_ESP + select XFRM_OFFLOAD + default n + ---help--- + Support for ESP transformation offload. This makes sense + only if this system really does IPsec and want to do it + with high throughput. A typical desktop system does not + need it, even if it does IPsec. + + If unsure, say N. + config INET_IPCOMP tristate "IP: IPComp transformation" select INET_XFRM_TUNNEL diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 48af58a5686e..c6d4238ff94a 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o obj-$(CONFIG_INET_ESP) += esp4.o +obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f75069883f2b..602d40f43687 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -479,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && snum < PROT_SOCK && + if (snum && snum < inet_prot_sock(net) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; @@ -570,19 +570,30 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) * TCP 'magic' in here. */ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) + int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; int err; long timeo; - if (addr_len < sizeof(uaddr->sa_family)) - return -EINVAL; + /* + * uaddr can be NULL and addr_len can be 0 if: + * sk is a TCP fastopen active socket and + * TCP_FASTOPEN_CONNECT sockopt is set and + * we already have a valid cookie for this socket. + * In this case, user can call write() after connect(). + * write() will invoke tcp_sendmsg_fastopen() which calls + * __inet_stream_connect(). + */ + if (uaddr) { + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; - if (uaddr->sa_family == AF_UNSPEC) { - err = sk->sk_prot->disconnect(sk, flags); - sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; - goto out; + if (uaddr->sa_family == AF_UNSPEC) { + err = sk->sk_prot->disconnect(sk, flags); + sock->state = err ? 
SS_DISCONNECTING : SS_UNCONNECTED; + goto out; + } } switch (sock->state) { @@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -EISCONN; goto out; case SS_CONNECTING: - err = -EALREADY; + if (inet_sk(sk)->defer_connect) + err = is_sendmsg ? -EINPROGRESS : -EISCONN; + else + err = -EALREADY; /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: @@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTING; + if (!err && inet_sk(sk)->defer_connect) + goto out; + /* Just entered SS_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. @@ -662,7 +679,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int err; lock_sock(sock->sk); - err = __inet_stream_connect(sock, uaddr, addr_len, flags); + err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); return err; } @@ -1406,7 +1423,7 @@ out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } @@ -1700,6 +1717,9 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; +#ifdef CONFIG_SYSCTL + net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; +#endif return 0; } @@ -1831,8 +1851,6 @@ static int __init inet_init(void) ip_init(); - tcp_v4_init(); - /* Setup TCP slab cache for open requests. */ tcp_init(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index f2a71025a770..22377c8ff14b 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err) int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; + if (err) + goto out; + work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, ihl); icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 89a8cac4726a..51b27ae09fbd 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1263,7 +1263,7 @@ void __init arp_init(void) /* * ax25 -> ASCII conversion */ -static char *ax2asc2(ax25_address *a, char *buf) +static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; @@ -1285,10 +1285,10 @@ static char *ax2asc2(ax25_address *a, char *buf) *s++ = n + '0'; *s++ = '\0'; - if (*buf == '\0' || *buf == '-') - return "*"; - - return buf; + if (*buf == '\0' || *buf == '-') { + buf[0] = '*'; + buf[1] = '\0'; + } } #endif /* CONFIG_AX25 */ @@ -1322,7 +1322,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, } #endif sprintf(tbuf, "%pI4", n->primary_key); - seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 72d6f056d863..ae206163c273 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1587,6 +1587,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) goto validate_return_locked; } + if (opt_iter + 1 == opt_len) { + err_offset = opt_iter; + goto validate_return_locked; + } tag_len = tag[1]; if (tag_len > (opt_len - opt_iter)) { err_offset = opt_iter + 1; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 4cd2ee8857d2..5d367b7ff542 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -65,8 +65,6 @@ 
#include <net/net_namespace.h> #include <net/addrconf.h> -#include "fib_lookup.h" - static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 20fb25e3027b..b1e24446e297 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -18,6 +18,8 @@ #include <net/protocol.h> #include <net/udp.h> +#include <linux/highmem.h> + struct esp_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; @@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } +static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +{ + struct esp_output_extra *extra = esp_tmp_extra(tmp); + struct crypto_aead *aead = x->data; + int extralen = 0; + u8 *iv; + struct aead_request *req; + struct scatterlist *sg; + + if (x->props.flags & XFRM_STATE_ESN) + extralen += sizeof(*extra); + + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); + req = esp_tmp_req(aead, iv); + + /* Unref skb_frag_pages in the src scatterlist if necessary. + * Skip the first sg which comes from skb->data. + */ + if (req->src != req->dst) + for (sg = sg_next(req->src); sg; sg = sg_next(sg)) + put_page(sg_page(sg)); +} + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + void *tmp; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; - kfree(ESP_SKB_CB(skb)->tmp); + tmp = ESP_SKB_CB(skb)->tmp; + esp_ssg_unref(x, tmp); + kfree(tmp); xfrm_output_resume(skb, err); } @@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb) sizeof(__be32)); } +static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb, + struct ip_esp_hdr *esph, + struct esp_output_extra *extra) +{ + struct xfrm_state *x = skb_dst(skb)->xfrm; + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * encryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + extra->esphoff = (unsigned char *)esph - + skb_transport_header(skb); + esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); + extra->seqhi = esph->spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + } + + esph->spi = x->id.spi; + + return esph; +} + static void esp_output_done_esn(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -128,18 +182,36 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } +static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) +{ + /* Fill padding... 
*/ + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = proto; +} + static int esp_output(struct xfrm_state *x, struct sk_buff *skb) { - int err; struct esp_output_extra *extra; + int err = -ENOMEM; struct ip_esp_hdr *esph; struct crypto_aead *aead; struct aead_request *req; - struct scatterlist *sg; + struct scatterlist *sg, *dsg; struct sk_buff *trailer; + struct page *page; void *tmp; u8 *iv; u8 *tail; + u8 *vaddr; int blksize; int clen; int alen; @@ -149,7 +221,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int nfrags; int assoclen; int extralen; + int tailen; __be64 seqno; + __u8 proto = *skb_mac_header(skb); /* skb is pure payload to encrypt */ @@ -169,12 +243,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) blksize = ALIGN(crypto_aead_blocksize(aead), 4); clen = ALIGN(skb->len + 2 + tfclen, blksize); plen = clen - skb->len - tfclen; - - err = skb_cow_data(skb, tfclen + plen + alen, &trailer); - if (err < 0) - goto error; - nfrags = err; - + tailen = tfclen + plen + alen; assoclen = sizeof(*esph); extralen = 0; @@ -183,35 +252,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) assoclen += sizeof(__be32); } - tmp = esp_alloc_tmp(aead, nfrags, extralen); - if (!tmp) { - err = -ENOMEM; - goto error; - } - - extra = esp_tmp_extra(tmp); - iv = esp_tmp_iv(aead, tmp, extralen); - req = esp_tmp_req(aead, iv); - sg = esp_req_sg(aead, req); - - /* Fill padding... */ - tail = skb_tail_pointer(trailer); - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = *skb_mac_header(skb); - pskb_put(skb, trailer, clen - skb->len + alen); - - skb_push(skb, -skb_network_offset(skb)); - esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; + esph = ip_esp_hdr(skb); /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { @@ -230,7 +272,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) uh = (struct udphdr *)esph; uh->source = sport; uh->dest = dport; - uh->len = htons(skb->len - skb_transport_offset(skb)); + uh->len = htons(skb->len + tailen + - skb_transport_offset(skb)); uh->check = 0; switch (encap_type) { @@ -248,31 +291,148 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) *skb_mac_header(skb) = IPPROTO_UDP; } - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + if (!skb_cloned(skb)) { + if (tailen <= skb_availroom(skb)) { + nfrags = 1; + trailer = skb; + tail = skb_tail_pointer(trailer); - aead_request_set_callback(req, 0, esp_output_done, skb); + goto skip_cow; + } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS) + && !skb_has_frag_list(skb)) { + int allocsize; + struct sock *sk = skb->sk; + struct page_frag *pfrag = &x->xfrag; - /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after - * encryption. 
- */ - if ((x->props.flags & XFRM_STATE_ESN)) { - extra->esphoff = (unsigned char *)esph - - skb_transport_header(skb); - esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); - extra->seqhi = esph->spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); - aead_request_set_callback(req, 0, esp_output_done_esn, skb); + allocsize = ALIGN(tailen, L1_CACHE_BYTES); + + spin_lock_bh(&x->lock); + + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + goto cow; + } + + page = pfrag->page; + get_page(page); + + vaddr = kmap_atomic(page); + + tail = vaddr + pfrag->offset; + + esp_output_fill_trailer(tail, tfclen, plen, proto); + + kunmap_atomic(vaddr); + + nfrags = skb_shinfo(skb)->nr_frags; + + __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, + tailen); + skb_shinfo(skb)->nr_frags = ++nfrags; + + pfrag->offset = pfrag->offset + allocsize; + nfrags++; + + skb->len += tailen; + skb->data_len += tailen; + skb->truesize += tailen; + if (sk) + atomic_add(tailen, &sk->sk_wmem_alloc); + + skb_push(skb, -skb_network_offset(skb)); + + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esph->spi = x->id.spi; + + tmp = esp_alloc_tmp(aead, nfrags + 2, extralen); + if (!tmp) { + spin_unlock_bh(&x->lock); + err = -ENOMEM; + goto error; + } + + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); + dsg = &sg[nfrags]; + + esph = esp_output_set_extra(skb, esph, extra); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); + + allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); + + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + err = -ENOMEM; + goto error; + } + + skb_shinfo(skb)->nr_frags = 1; + + page = pfrag->page; + get_page(page); + /* replace page frags in skb with new page */ + __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); + pfrag->offset = pfrag->offset + allocsize; + + sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); + skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); + + spin_unlock_bh(&x->lock); + + goto skip_cow2; + } } +cow: + err = skb_cow_data(skb, tailen, &trailer); + if (err < 0) + goto error; + nfrags = err; + tail = skb_tail_pointer(trailer); + esph = ip_esp_hdr(skb); + +skip_cow: + esp_output_fill_trailer(tail, tfclen, plen, proto); + + pskb_put(skb, trailer, clen - skb->len + alen); + skb_push(skb, -skb_network_offset(skb)); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); esph->spi = x->id.spi; + tmp = esp_alloc_tmp(aead, nfrags, extralen); + if (!tmp) { + err = -ENOMEM; + goto error; + } + + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); + dsg = sg; + + esph = esp_output_set_extra(skb, esph, extra); + sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, (unsigned char *)esph - skb->data, assoclen + ivlen + clen + alen); - aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); +skip_cow2: + if ((x->props.flags & XFRM_STATE_ESN)) + aead_request_set_callback(req, 0, esp_output_done_esn, skb); + else + aead_request_set_callback(req, 0, esp_output_done, skb); + + aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); aead_request_set_ad(req, assoclen); seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + @@ -298,6 +458,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff 
*skb) esp_output_restore_header(skb); } + if (sg != dsg) + esp_ssg_unref(x, tmp); kfree(tmp); error: @@ -401,6 +563,23 @@ static void esp_input_restore_header(struct sk_buff *skb) __skb_pull(skb, 4); } +static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * decryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)skb_push(skb, 4); + *seqhi = esph->spi; + esph->spi = esph->seq_no; + esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; + } +} + static void esp_input_done_esn(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -437,12 +616,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (elen <= 0) goto out; - err = skb_cow_data(skb, 0, &trailer); - if (err < 0) - goto out; - - nfrags = err; - assoclen = sizeof(*esph); seqhilen = 0; @@ -451,6 +624,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) assoclen += seqhilen; } + if (!skb_cloned(skb)) { + if (!skb_is_nonlinear(skb)) { + nfrags = 1; + + goto skip_cow; + } else if (!skb_has_frag_list(skb)) { + nfrags = skb_shinfo(skb)->nr_frags; + nfrags++; + + goto skip_cow; + } + } + + err = skb_cow_data(skb, 0, &trailer); + if (err < 0) + goto out; + + nfrags = err; + +skip_cow: err = -ENOMEM; tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) @@ -462,26 +655,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); - skb->ip_summed = CHECKSUM_NONE; + esp_input_set_header(skb, seqhi); - esph = (struct ip_esp_hdr *)skb->data; + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); - aead_request_set_callback(req, 0, esp_input_done, skb); + skb->ip_summed = CHECKSUM_NONE; - /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after - * decryption. - */ - if ((x->props.flags & XFRM_STATE_ESN)) { - esph = (void *)skb_push(skb, 4); - *seqhi = esph->spi; - esph->spi = esph->seq_no; - esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; + if ((x->props.flags & XFRM_STATE_ESN)) aead_request_set_callback(req, 0, esp_input_done_esn, skb); - } - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + else + aead_request_set_callback(req, 0, esp_input_done, skb); aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); aead_request_set_ad(req, assoclen); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c new file mode 100644 index 000000000000..1de442632406 --- /dev/null +++ b/net/ipv4/esp4_offload.c @@ -0,0 +1,106 @@ +/* + * IPV4 GSO/GRO offload support + * Linux INET implementation + * + * Copyright (C) 2016 secunet Security Networks AG + * Author: Steffen Klassert <steffen.klassert@secunet.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * ESP GRO support + */ + +#include <linux/skbuff.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <crypto/aead.h> +#include <crypto/authenc.h> +#include <linux/err.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/esp.h> +#include <linux/scatterlist.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <net/udp.h> + +static struct sk_buff **esp4_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + int offset = skb_gro_offset(skb); + struct xfrm_offload *xo; + struct xfrm_state *x; + __be32 seq; + __be32 spi; + int err; + + skb_pull(skb, offset); + + if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) + goto out; + + err = secpath_set(skb); + if (err) + goto out; + + if (skb->sp->len == XFRM_MAX_DEPTH) + goto out; + + x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ip_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET); + if (!x) + goto out; + + skb->sp->xvec[skb->sp->len++] = x; + skb->sp->olen++; + + xo = xfrm_offload(skb); + if (!xo) { + xfrm_state_put(x); + goto out; + } + xo->flags |= XFRM_GRO; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_SPI_SKB_CB(skb)->seq = seq; + + /* We don't need to handle errors from xfrm_input, it does all + * the error handling and frees the resources on error. */ + xfrm_input(skb, IPPROTO_ESP, spi, -2); + + return ERR_PTR(-EINPROGRESS); +out: + skb_push(skb, offset); + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = 1; + + return NULL; +} + +static const struct net_offload esp4_offload = { + .callbacks = { + .gro_receive = esp4_gro_receive, + }, +}; + +static int __init esp4_offload_init(void) +{ + return inet_add_offload(&esp4_offload, IPPROTO_ESP); +} + +static void __exit esp4_offload_exit(void) +{ + inet_del_offload(&esp4_offload, IPPROTO_ESP); +} + +module_init(esp4_offload_init); +module_exit(esp4_offload_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3ff8938893ec..42bfd08109dd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -46,6 +46,7 @@ #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> +#include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -85,7 +86,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) if (tb) return tb; - if (id == RT_TABLE_LOCAL) + if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); @@ -318,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, int ret, no_addr; struct fib_result res; struct flowi4 fl4; - struct net *net; + struct net *net = dev_net(dev); bool dev_match; fl4.flowi4_oif = 0; @@ -331,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); no_addr = idev->ifa_list == NULL; @@ -338,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, trace_fib_validate_source(dev, &fl4); - net = dev_net(dev); if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || 
!IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && + if (!rpf && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) goto last_resort; fib_combine_itag(itag, &res); @@ -621,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, + [RTA_MARK] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, @@ -677,6 +679,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: + err = lwtunnel_valid_encap_type_attr(nla_data(attr), + nla_len(attr)); + if (err < 0) + goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; @@ -691,6 +697,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type); + if (err < 0) + goto errout; break; } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7a5b4c7d9a87..317026a39cfa 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -471,7 +471,6 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg) { - struct net *net = cfg->fc_nlinfo.nl_net; int ret; change_nexthops(fi) { @@ -503,16 +502,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_ENCAP); if (nla) { struct lwtunnel_state *lwtstate; - struct net_device *dev = NULL; struct nlattr *nla_entype; nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (!nla_entype) goto err_inval; - if (cfg->fc_oif) - dev = __dev_get_by_index(net, cfg->fc_oif); - ret = lwtunnel_build_state(dev, nla_get_u16( + + ret = lwtunnel_build_state(nla_get_u16( nla_entype), nla, AF_INET, cfg, &lwtstate); @@ -597,21 +594,18 @@ static inline void fib_add_weight(struct fib_info *fi, #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -static int fib_encap_match(struct net *net, u16 encap_type, +static int fib_encap_match(u16 encap_type, struct nlattr *encap, - int oif, const struct fib_nh *nh, + const struct fib_nh *nh, const struct fib_config *cfg) { struct lwtunnel_state *lwtstate; - struct net_device *dev = NULL; int ret, result = 0; if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; - if (oif) - dev = __dev_get_by_index(net, oif); - ret = lwtunnel_build_state(dev, encap_type, encap, + ret = lwtunnel_build_state(encap_type, encap, AF_INET, cfg, &lwtstate); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); @@ -623,7 +617,6 @@ static int fib_encap_match(struct net *net, u16 encap_type, int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) { - struct net *net = cfg->fc_nlinfo.nl_net; #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; @@ -634,9 +627,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) if (cfg->fc_oif || cfg->fc_gw) { if (cfg->fc_encap) { - if (fib_encap_match(net, cfg->fc_encap_type, - cfg->fc_encap, cfg->fc_oif, - fi->fib_nh, cfg)) + if (fib_encap_match(cfg->fc_encap_type, + cfg->fc_encap, fi->fib_nh, cfg)) return 1; } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && @@ -1093,13 +1085,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_encap) { struct lwtunnel_state 
*lwtstate; - struct net_device *dev = NULL; if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) goto err_inval; - if (cfg->fc_oif) - dev = __dev_get_by_index(net, cfg->fc_oif); - err = lwtunnel_build_state(dev, cfg->fc_encap_type, + err = lwtunnel_build_state(cfg->fc_encap_type, cfg->fc_encap, AF_INET, cfg, &lwtstate); if (err) @@ -1279,8 +1268,9 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif - if (fi->fib_nh->nh_lwtstate) - lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); + if (fi->fib_nh->nh_lwtstate && + lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0) + goto nla_put_failure; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { @@ -1316,8 +1306,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif - if (nh->nh_lwtstate) - lwtunnel_fill_encap(skb, nh->nh_lwtstate); + if (nh->nh_lwtstate && + lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0) + goto nla_put_failure; + /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; } endfor_nexthops(fi); @@ -1363,6 +1355,36 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) return ret; } +static int call_fib_nh_notifiers(struct fib_nh *fib_nh, + enum fib_event_type event_type) +{ + struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev); + struct fib_nh_notifier_info info = { + .fib_nh = fib_nh, + }; + + switch (event_type) { + case FIB_EVENT_NH_ADD: + if (fib_nh->nh_flags & RTNH_F_DEAD) + break; + if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + fib_nh->nh_flags & RTNH_F_LINKDOWN) + break; + return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type, + &info.info); + case FIB_EVENT_NH_DEL: + if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + fib_nh->nh_flags & RTNH_F_LINKDOWN) || + (fib_nh->nh_flags & RTNH_F_DEAD)) + return call_fib_notifiers(dev_net(fib_nh->nh_dev), + event_type, &info.info); + default: + break; + } + + return NOTIFY_DONE; +} + /* Event force Flags Description * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host @@ -1404,6 +1426,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; break; } + call_fib_nh_notifiers(nexthop_nh, + FIB_EVENT_NH_DEL); dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1434,7 +1458,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) } /* Must be invoked inside of an RCU protected region. 
*/ -void fib_select_default(const struct flowi4 *flp, struct fib_result *res) +static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct hlist_head *fa_head = res->fa_head; @@ -1558,6 +1582,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) continue; alive++; nexthop_nh->nh_flags &= ~nh_flags; + call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD); } endfor_nexthops(fi) if (alive > 0) { @@ -1618,8 +1643,13 @@ void fib_select_multipath(struct fib_result *res, int hash) void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, int mp_hash) { + bool oif_check; + + oif_check = (fl4->flowi4_oif == 0 || + fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF); + #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (res->fi->fib_nhs > 1 && oif_check) { if (mp_hash < 0) mp_hash = get_hash_from_flowi4(fl4) >> 1; @@ -1629,7 +1659,7 @@ void fib_select_path(struct net *net, struct fib_result *res, #endif if (!res->prefixlen && res->table->tb_num_default > 1 && - res->type == RTN_UNICAST && !fl4->flowi4_oif) + res->type == RTN_UNICAST && oif_check) fib_select_default(fl4, res); if (!fl4->saddr) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2919d1a10cfd..2f0d8233950f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -124,7 +124,7 @@ static void fib_notify(struct net *net, struct notifier_block *nb, static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id, u32 nlflags) + u8 tos, u8 type, u32 tb_id) { struct fib_entry_notifier_info info = { .dst = dst, @@ -133,7 +133,6 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .tos = tos, .type = type, .tb_id = tb_id, - .nlflags = nlflags, }; return call_fib_notifier(nb, net, event_type, &info.info); } @@ -197,7 +196,7 @@ int call_fib_notifiers(struct net *net, enum fib_event_type event_type, static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id, u32 nlflags) + u8 tos, u8 type, u32 tb_id) { struct fib_entry_notifier_info info = { .dst = dst, @@ -206,7 +205,6 @@ static int call_fib_entry_notifiers(struct net *net, .tos = tos, .type = type, .tb_id = tb_id, - .nlflags = nlflags, }; return call_fib_notifiers(net, event_type, &info.info); } @@ -1198,6 +1196,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg) { + enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; @@ -1295,6 +1294,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, + key, plen, fi, + new_fa->fa_tos, cfg->fc_type, + tb->tb_id); + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, + tb->tb_id, &cfg->fc_nlinfo, nlflags); + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); alias_free_mem_rcu(fa); @@ -1303,13 +1309,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, - key, plen, fi, - new_fa->fa_tos, cfg->fc_type, - tb->tb_id, 
cfg->fc_nlflags); - rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, - tb->tb_id, &cfg->fc_nlinfo, nlflags); - goto succeeded; } /* Error if we find a perfect match which @@ -1319,10 +1318,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if (fa_match) goto out; - if (cfg->fc_nlflags & NLM_F_APPEND) + if (cfg->fc_nlflags & NLM_F_APPEND) { + event = FIB_EVENT_ENTRY_APPEND; nlflags |= NLM_F_APPEND; - else + } else { fa = fa_first; + } } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) @@ -1351,8 +1352,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos, - cfg->fc_type, tb->tb_id, cfg->fc_nlflags); + call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type, + tb->tb_id); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1653,8 +1654,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete->fa_info, tos, cfg->fc_type, - tb->tb_id, 0); + fa_to_delete->fa_info, tos, + fa_to_delete->fa_type, tb->tb_id); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1963,7 +1964,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb) hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; - if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) { + if (!fi || !(fi->fib_flags & RTNH_F_DEAD) || + tb->tb_id != fa->tb_id) { slen = fa->fa_slen; continue; } @@ -1972,7 +1974,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb) n->key, KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, fa->fa_type, - tb->tb_id, 0); + tb->tb_id); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -2012,7 +2014,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, call_fib_entry_notifier(nb, net, event_type, l->key, KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, - fa->fa_type, fa->tb_id, 0); + fa->fa_type, fa->tb_id); } } @@ -2386,7 +2388,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "Basic info: size of leaf:" - " %Zd bytes, size of tnode: %Zd bytes.\n", + " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 0777ea949223..fc310db2708b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net) return *this_cpu_ptr(net->ipv4.icmp_sk); } +/* Called with BH disabled */ static inline struct sock *icmp_xmit_lock(struct net *net) { struct sock *sk; - local_bh_disable(); - sk = icmp_sk(net); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ - local_bh_enable(); return NULL; } return sk; @@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net) static inline void icmp_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock(&sk->sk_lock.slock); } int sysctl_icmp_msgs_per_sec __read_mostly = 1000; @@ -282,6 +280,33 @@ bool icmp_global_allow(void) } EXPORT_SYMBOL(icmp_global_allow); +static bool icmpv4_mask_allow(struct net *net, int type, int code) +{ + if (type > NR_ICMP_TYPES) + return true; + + /* Don't limit PMTU discovery. 
*/ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + return true; + + /* Limit if icmp type is enabled in ratemask. */ + if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) + return true; + + return false; +} + +static bool icmpv4_global_allow(struct net *net, int type, int code) +{ + if (icmpv4_mask_allow(net, type, code)) + return true; + + if (icmp_global_allow()) + return true; + + return false; +} + /* * Send an ICMP frame. */ @@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; + struct inet_peer *peer; bool rc = true; + int vif; - if (type > NR_ICMP_TYPES) - goto out; - - /* Don't limit PMTU discovery. */ - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + if (icmpv4_mask_allow(net, type, code)) goto out; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) goto out; - /* Limit if icmp type is enabled in ratemask. */ - if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) - goto out; - - rc = false; - if (icmp_global_allow()) { - int vif = l3mdev_master_ifindex(dst->dev); - struct inet_peer *peer; - - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); - rc = inet_peer_xrlim_allow(peer, - net->ipv4.sysctl_icmp_ratelimit); - if (peer) - inet_putpeer(peer); - } + vif = l3mdev_master_ifindex(dst->dev); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); + rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); + if (peer) + inet_putpeer(peer); out: return rc; } @@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct inet_sock *inet; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); + int type = icmp_param->data.icmph.type; + int code = icmp_param->data.icmph.code; if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; + /* Needed by both icmp_global_allow and icmp_xmit_lock */ + local_bh_disable(); + + /* global icmp_msgs_per_sec */ + if (!icmpv4_global_allow(net, type, code)) + goto out_bh_enable; + sk = icmp_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; @@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; - if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, - icmp_param->data.icmph.code)) + if (icmpv4_xrlim_allow(net, rt, &fl4, type, code)) icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct iphdr *iph; int room; - struct icmp_bxm *icmp_param; + struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; struct flowi4 fl4; @@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } - icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); - if (!icmp_param) - return; + /* Needed by both icmp_global_allow and icmp_xmit_lock */ + local_bh_disable(); + + /* Check global sysctl_icmp_msgs_per_sec ratelimit */ + if (!icmpv4_global_allow(net, type, code)) + goto out_bh_enable; sk = icmp_xmit_lock(net); if (!sk) - goto out_free; + goto out_bh_enable; /* * Construct source address and options. 
@@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) + if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) goto out_unlock; @@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Prepare data for ICMP header. */ - icmp_param->data.icmph.type = type; - icmp_param->data.icmph.code = code; - icmp_param->data.icmph.un.gateway = info; - icmp_param->data.icmph.checksum = 0; - icmp_param->skb = skb_in; - icmp_param->offset = skb_network_offset(skb_in); + icmp_param.data.icmph.type = type; + icmp_param.data.icmph.code = code; + icmp_param.data.icmph.un.gateway = info; + icmp_param.data.icmph.checksum = 0; + icmp_param.skb = skb_in; + icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; ipc.addr = iph->saddr; - ipc.opt = &icmp_param->replyopts.opt; + ipc.opt = &icmp_param.replyopts.opt; ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, - type, code, icmp_param); + type, code, &icmp_param); if (IS_ERR(rt)) goto out_unlock; + /* peer icmp_ratelimit */ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) goto ende; @@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); - icmp_param->data_len = skb_in->len - icmp_param->offset; - if (icmp_param->data_len > room) - icmp_param->data_len = room; - icmp_param->head_len = sizeof(struct icmphdr); + icmp_param.data_len = skb_in->len - icmp_param.offset; + if (icmp_param.data_len > room) + icmp_param.data_len = room; + icmp_param.head_len = sizeof(struct icmphdr); - icmp_push_reply(icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); -out_free: - kfree(icmp_param); +out_bh_enable: + local_bh_enable(); out:; } EXPORT_SYMBOL(icmp_send); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 68d622133f53..44fd86de2823 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -219,9 +219,14 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { int tv = prandom_u32() % in_dev->mr_maxdelay; + unsigned long exp = jiffies + tv + 2; + + if (in_dev->mr_gq_running && + time_after_eq(exp, (in_dev->mr_gq_timer).expires)) + return; in_dev->mr_gq_running = 1; - if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) + if (!mod_timer(&in_dev->mr_gq_timer, exp)) in_dev_hold(in_dev); } @@ -1167,6 +1172,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) psf->sf_crcount = im->crcount; } in_dev_put(pmc->interface); + kfree(pmc); } spin_unlock_bh(&im->lock); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 19ea045c50ed..b4d5980ade3b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -31,6 +31,86 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif +#if IS_ENABLED(CONFIG_IPV6) +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * 
match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, + const struct in6_addr *sk2_rcv_saddr6, + __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk1_ipv6only, bool sk2_ipv6only, + bool match_wildcard) +{ + int addr_type = ipv6_addr_type(sk1_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk1_rcv_saddr == sk2_rcv_saddr) + return 1; + if (!sk1_rcv_saddr || !sk2_rcv_saddr) + return match_wildcard; + } + return 0; + } + + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && match_wildcard && + !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) + return 1; + + return 0; +} +#endif + +/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses + * match_wildcard == false: addresses must be exactly the same, i.e. + * 0.0.0.0 only equals to 0.0.0.0 + */ +static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk2_ipv6only, bool match_wildcard) +{ + if (!sk2_ipv6only) { + if (sk1_rcv_saddr == sk2_rcv_saddr) + return 1; + if (!sk1_rcv_saddr || !sk2_rcv_saddr) + return match_wildcard; + } + return 0; +} + +int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr, + inet6_rcv_saddr(sk2), + sk->sk_rcv_saddr, + sk2->sk_rcv_saddr, + ipv6_only_sock(sk), + ipv6_only_sock(sk2), + match_wildcard); +#endif + return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, + ipv6_only_sock(sk2), match_wildcard); +} +EXPORT_SYMBOL(inet_rcv_saddr_equal); + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; @@ -44,9 +124,9 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) } EXPORT_SYMBOL(inet_get_local_port_range); -int inet_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb, bool relax, - bool reuseport_ok) +static int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb, + bool relax, bool reuseport_ok) { struct sock *sk2; bool reuse = sk->sk_reuse; @@ -62,7 +142,6 @@ int inet_csk_bind_conflict(const struct sock *sk, sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && - !inet_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { @@ -72,54 +151,34 @@ int inet_csk_bind_conflict(const struct sock *sk, rcu_access_pointer(sk->sk_reuseport_cb) || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { - - if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || - sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + if (inet_rcv_saddr_equal(sk, sk2, true)) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { - - if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || - sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + if (inet_rcv_saddr_equal(sk, sk2, true)) break; } } } return sk2 != NULL; } 
-EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); -/* Obtain a reference to a local port for the given sock, - * if snum is zero it means select any available local port. - * We try to allocate an odd port (and leave even ports for connect()) +/* + * Find an open port number for the socket. Returns with the + * inet_bind_hashbucket lock held. */ -int inet_csk_get_port(struct sock *sk, unsigned short snum) +static struct inet_bind_hashbucket * +inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret) { - bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - int ret = 1, attempts = 5, port = snum; - int smallest_size = -1, smallest_port; + int port = 0; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); int i, low, high, attempt_half; struct inet_bind_bucket *tb; - kuid_t uid = sock_i_uid(sk); u32 remaining, offset; - bool reuseport_ok = !!snum; - if (port) { -have_port: - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock_bh(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) - goto tb_found; - - goto tb_not_found; - } -again: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_get_local_port_range(net, &low, &high); @@ -143,8 +202,6 @@ other_half_scan: * We do the opposite to not pollute connect() users. */ offset |= 1U; - smallest_size = -1; - smallest_port = low; /* avoid compiler warning */ other_parity_scan: port = low + offset; @@ -158,30 +215,17 @@ other_parity_scan: spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == port) { - if (((tb->fastreuse > 0 && reuse) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(tb->fastuid, uid))) && - (tb->num_owners < smallest_size || smallest_size == -1)) { - smallest_size = tb->num_owners; - smallest_port = port; - } - if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false, - reuseport_ok)) - goto tb_found; + if (!inet_csk_bind_conflict(sk, tb, false, false)) + goto success; goto next_port; } - goto tb_not_found; + tb = NULL; + goto success; next_port: spin_unlock_bh(&head->lock); cond_resched(); } - if (smallest_size != -1) { - port = smallest_port; - goto have_port; - } offset--; if (!(offset & 1)) goto other_parity_scan; @@ -191,8 +235,74 @@ next_port: attempt_half = 2; goto other_half_scan; } - return ret; + return NULL; +success: + *port_ret = port; + *tb_ret = tb; + return head; +} +static inline int sk_reuseport_match(struct inet_bind_bucket *tb, + struct sock *sk) +{ + kuid_t uid = sock_i_uid(sk); + + if (tb->fastreuseport <= 0) + return 0; + if (!sk->sk_reuseport) + return 0; + if (rcu_access_pointer(sk->sk_reuseport_cb)) + return 0; + if (!uid_eq(tb->fastuid, uid)) + return 0; + /* We only need to check the rcv_saddr if this tb was once marked + * without fastreuseport and then was reset, as we can only know that + * the fast_*rcv_saddr doesn't have any conflicts with the socks on the + * owners list. 
+ */ + if (tb->fastreuseport == FASTREUSEPORT_ANY) + return 1; +#if IS_ENABLED(CONFIG_IPV6) + if (tb->fast_sk_family == AF_INET6) + return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, + &sk->sk_v6_rcv_saddr, + tb->fast_rcv_saddr, + sk->sk_rcv_saddr, + tb->fast_ipv6_only, + ipv6_only_sock(sk), true); +#endif + return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, + ipv6_only_sock(sk), true); +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + * We try to allocate an odd port (and leave even ports for connect()) + */ +int inet_csk_get_port(struct sock *sk, unsigned short snum) +{ + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + int ret = 1, port = snum; + struct inet_bind_hashbucket *head; + struct net *net = sock_net(sk); + struct inet_bind_bucket *tb = NULL; + kuid_t uid = sock_i_uid(sk); + + if (!port) { + head = inet_csk_find_open_port(sk, &tb, &port); + if (!head) + return ret; + if (!tb) + goto tb_not_found; + goto success; + } + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) + goto tb_found; tb_not_found: tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port); @@ -203,39 +313,54 @@ tb_found: if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (((tb->fastreuse > 0 && reuse) || - (tb->fastreuseport > 0 && - !rcu_access_pointer(sk->sk_reuseport_cb) && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) + if ((tb->fastreuse > 0 && reuse) || + sk_reuseport_match(tb, sk)) goto success; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true, - reuseport_ok)) { - if ((reuse || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(tb->fastuid, uid))) && - !snum && smallest_size != -1 && --attempts >= 0) { - spin_unlock_bh(&head->lock); - goto again; - } + if (inet_csk_bind_conflict(sk, tb, true, true)) goto fail_unlock; + } +success: + if (!hlist_empty(&tb->owners)) { + tb->fastreuse = reuse; + if (sk->sk_reuseport) { + tb->fastreuseport = FASTREUSEPORT_ANY; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } else { + tb->fastreuseport = 0; } + } else { if (!reuse) tb->fastreuse = 0; - if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) - tb->fastreuseport = 0; - } else { - tb->fastreuse = reuse; if (sk->sk_reuseport) { - tb->fastreuseport = 1; - tb->fastuid = uid; + /* We didn't match or we don't have fastreuseport set on + * the tb, but we have sk_reuseport set on this socket + * and we know that there are no bind conflicts with + * this socket in this tb, so reset our tb's reuseport + * settings so that any subsequent sockets that match + * our current socket will be put on the fast path. + * + * If we reset we need to set FASTREUSEPORT_STRICT so we + * do extra checking for all subsequent sk_reuseport + * socks. 
+ */ + if (!sk_reuseport_match(tb, sk)) { + tb->fastreuseport = FASTREUSEPORT_STRICT; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } } else { tb->fastreuseport = 0; } } -success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); @@ -711,9 +836,8 @@ void inet_csk_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); - local_bh_disable(); percpu_counter_dec(sk->sk_prot->orphan_count); - local_bh_enable(); + sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4dea33e5f295..3828b3a805cd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ca97835bfec4..8bea74298173 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; - tb->num_owners = 0; INIT_HLIST_HEAD(&tb->owners); hlist_add_head(&tb->node, &head->chain); } @@ -96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); - tb->num_owners++; inet_csk(sk)->icsk_bind_hash = tb; } @@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); - tb->num_owners--; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); @@ -435,10 +432,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) EXPORT_SYMBOL_GPL(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, - struct inet_listen_hashbucket *ilb, - int (*saddr_same)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)) + struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; struct sock *sk2; @@ -451,7 +445,7 @@ static int inet_reuseport_add_sock(struct sock *sk, sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && - saddr_same(sk, sk2, false)) + inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2); } @@ -461,10 +455,7 @@ static int inet_reuseport_add_sock(struct sock *sk, return 0; } -int __inet_hash(struct sock *sk, struct sock *osk, - int (*saddr_same)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)) +int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; @@ -479,7 +470,7 @@ int __inet_hash(struct sock *sk, struct sock *osk, spin_lock(&ilb->lock); if (sk->sk_reuseport) { - err = inet_reuseport_add_sock(sk, ilb, saddr_same); + err = inet_reuseport_add_sock(sk, ilb); if (err) goto unlock; } @@ -503,7 +494,7 @@ int inet_hash(struct sock *sk) if (sk->sk_state != TCP_CLOSE) { 
local_bh_disable(); - err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); + err = __inet_hash(sk, NULL); local_bh_enable(); } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ddcd56c08d14..f8aff2c71cde 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, - struct inet_timewait_death_row *twdr, int family) +void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) { struct inet_timewait_sock *tw; struct sock *sk; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index fac275c48108..737ce826d7ec 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (!IS_ERR(neigh)) { - int res = dst_neigh_output(dst, neigh, skb); + int res; + + sock_confirm_neigh(skb, neigh); + res = neigh_output(neigh, skb); rcu_read_unlock_bh(); return res; @@ -886,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->csum = 0; + if (flags & MSG_CONFIRM) + skb_set_dst_pending_confirm(skb, 1); + __skb_queue_tail(queue, skb); } else if (skb_is_gso(skb)) { goto append; @@ -1086,6 +1092,9 @@ alloc_new_skb: exthdrlen = 0; csummode = CHECKSUM_NONE; + if ((flags & MSG_CONFIRM) && !skb_prev) + skb_set_dst_pending_confirm(skb, 1); + /* * Put the packet on the pending queue. */ @@ -1629,6 +1638,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if (unlikely(err)) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 57e1405e8282..ebd953bc5607 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -116,10 +116,10 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_COMPLETE) return; - if (offset != 0) - csum = csum_sub(csum, - csum_partial(skb_transport_header(skb) + tlen, - offset, 0)); + if (offset != 0) { + int tend_off = skb_transport_offset(skb) + tlen; + csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0)); + } put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); } @@ -272,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, continue; switch (cmsg->cmsg_type) { case IP_RETOPTS: - err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + err = cmsg->cmsg_len - sizeof(struct cmsghdr); /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), @@ -843,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, { struct ip_mreqn mreq; struct net_device *dev = NULL; + int midx; if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -887,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EADDRNOTAVAIL; if (!dev) break; + + midx = l3mdev_master_ifindex(dev); + dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && - mreq.imr_ifindex != sk->sk_bound_dev_if) + mreq.imr_ifindex != sk->sk_bound_dev_if && + (!midx || midx != sk->sk_bound_dev_if)) break; inet->mc_index = mreq.imr_ifindex; @@ 
-1225,14 +1230,27 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) * which has interface index (iif) as the first member of the * underlying inet{6}_skb_parm struct. This code then overlays * PKTINFO_SKB_CB and in_pktinfo also has iif as the first - * element so the iif is picked up from the prior IPCB + * element so the iif is picked up from the prior IPCB. If iif + * is the loopback interface, then return the sending interface + * (e.g., process binds socket to eth0 for Tx which is + * redirected to loopback in the rtable/dst). */ + if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX) + pktinfo->ipi_ifindex = inet_iif(skb); + pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; pktinfo->ipi_spec_dst.s_addr = 0; } - skb_dst_drop(skb); + /* We need to keep the dst for __ip_options_echo() + * We could restrict the test to opt.ts_needtime || opt.srr, + * but the following is good enough as IP options are not often used. + */ + if (unlikely(IPCB(skb)->opt.optlen)) + skb_dst_force(skb); + else + skb_dst_drop(skb); } int ip_setsockopt(struct sock *sk, int level, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index fed3d29f9eb3..a31f47ccaad9 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb, EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); /* Often modified stats are per cpu, other are shared (netdev->stats) */ -struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) +void ip_tunnel_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) { int i; @@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, tot->rx_bytes += rx_bytes; tot->tx_bytes += tx_bytes; } - - return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); @@ -228,7 +226,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, }; -static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, +static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts) { @@ -313,6 +311,7 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .fill_encap = ip_tun_fill_encap_info, .get_encap_size = ip_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, + .owner = THIS_MODULE, }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { @@ -324,7 +323,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, }; -static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, +static int ip6_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts) { @@ -403,6 +402,7 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { .fill_encap = ip6_tun_fill_encap_info, .get_encap_size = ip6_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, + .owner = THIS_MODULE, }; void __init ip_tunnel_core_init(void) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index efc1e76d4977..c0317c940bcd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -299,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net) } #endif +static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct mfc_cache_cmp_arg *cmparg = arg->key; + struct mfc_cache *c = (struct mfc_cache *)ptr; + + return cmparg->mfc_mcastgrp != 
c->mfc_mcastgrp || + cmparg->mfc_origin != c->mfc_origin; +} + +static const struct rhashtable_params ipmr_rht_params = { + .head_offset = offsetof(struct mfc_cache, mnode), + .key_offset = offsetof(struct mfc_cache, cmparg), + .key_len = sizeof(struct mfc_cache_cmp_arg), + .nelem_hint = 3, + .locks_mul = 1, + .obj_cmpfn = ipmr_hash_cmp, + .automatic_shrinking = true, +}; + static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; - unsigned int i; /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (id != RT_TABLE_DEFAULT && id >= 1000000000) @@ -318,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) write_pnet(&mrt->net, net); mrt->id = id; - /* Forwarding cache */ - for (i = 0; i < MFC_LINES; i++) - INIT_LIST_HEAD(&mrt->mfc_cache_array[i]); - + rhltable_init(&mrt->mfc_hash, &ipmr_rht_params); + INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, @@ -338,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, true); + rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } @@ -839,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp) { - int line = MFC_HASH(mcastgrp, origin); + struct mfc_cache_cmp_arg arg = { + .mfc_mcastgrp = mcastgrp, + .mfc_origin = origin + }; + struct rhlist_head *tmp, *list; struct mfc_cache *c; - list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { - if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) - return c; - } + list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + return c; + return NULL; } @@ -853,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, int vifi) { - int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY)); + struct mfc_cache_cmp_arg arg = { + .mfc_mcastgrp = htonl(INADDR_ANY), + .mfc_origin = htonl(INADDR_ANY) + }; + struct rhlist_head *tmp, *list; struct mfc_cache *c; - list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) - if (c->mfc_origin == htonl(INADDR_ANY) && - c->mfc_mcastgrp == htonl(INADDR_ANY) && - c->mfc_un.res.ttls[vifi] < 255) + list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (c->mfc_un.res.ttls[vifi] < 255) return c; return NULL; @@ -869,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, __be32 mcastgrp, int vifi) { - int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY)); + struct mfc_cache_cmp_arg arg = { + .mfc_mcastgrp = mcastgrp, + .mfc_origin = htonl(INADDR_ANY) + }; + struct rhlist_head *tmp, *list; struct mfc_cache *c, *proxy; if (mcastgrp == htonl(INADDR_ANY)) goto skip; - list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) - if (c->mfc_origin == htonl(INADDR_ANY) && - c->mfc_mcastgrp == mcastgrp) { - if (c->mfc_un.res.ttls[vifi] < 255) - return c; - - /* It's ok if the vifi is part of the static tree */ - proxy = ipmr_cache_find_any_parent(mrt, - c->mfc_parent); - if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) - return c; - } + list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) { + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + /* It's ok 
if the vifi is part of the static tree */ + proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent); + if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) + return c; + } skip: return ipmr_cache_find_any_parent(mrt, vifi); } +/* Look for a (S,G,iif) entry if parent != -1 */ +static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt, + __be32 origin, __be32 mcastgrp, + int parent) +{ + struct mfc_cache_cmp_arg arg = { + .mfc_mcastgrp = mcastgrp, + .mfc_origin = origin, + }; + struct rhlist_head *tmp, *list; + struct mfc_cache *c; + + list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (parent == -1 || parent == c->mfc_parent) + return c; + + return NULL; +} + /* Allocate a multicast cache entry */ static struct mfc_cache *ipmr_cache_alloc(void) { @@ -1028,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt, static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) { + const struct iphdr *iph = ip_hdr(skb); + struct mfc_cache *c; bool found = false; int err; - struct mfc_cache *c; - const struct iphdr *iph = ip_hdr(skb); spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, list) { @@ -1095,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { - int line; - struct mfc_cache *c, *next; + struct mfc_cache *c; - line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); + /* The entries are added/deleted only under RTNL */ + rcu_read_lock(); + c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, + mfc->mfcc_mcastgrp.s_addr, parent); + rcu_read_unlock(); + if (!c) + return -ENOENT; + rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); + list_del_rcu(&c->list); + mroute_netlink_event(mrt, c, RTM_DELROUTE); + ipmr_cache_free(c); - list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { - if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && - (parent == -1 || parent == c->mfc_parent)) { - list_del_rcu(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); - return 0; - } - } - return -ENOENT; + return 0; } static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent) { - bool found = false; - int line; struct mfc_cache *uc, *c; + bool found; + int ret; if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; - line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - - list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { - if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && - (parent == -1 || parent == c->mfc_parent)) { - found = true; - break; - } - } - - if (found) { + /* The entries are added/deleted only under RTNL */ + rcu_read_lock(); + c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, + mfc->mfcc_mcastgrp.s_addr, parent); + rcu_read_unlock(); + if (c) { write_lock_bh(&mrt_lock); c->mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); @@ -1160,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; - list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); - + ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode, + ipmr_rht_params); + if (ret) { + pr_err("ipmr: rhtable insert error %d\n", ret); + ipmr_cache_free(c); + return ret; + } + 
list_add_tail_rcu(&c->list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ @@ -1191,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { - int i; + struct mfc_cache *c, *tmp; LIST_HEAD(list); - struct mfc_cache *c, *next; + int i; /* Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { @@ -1204,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) unregister_netdevice_many(&list); /* Wipe the cache */ - for (i = 0; i < MFC_LINES; i++) { - list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (!all && (c->mfc_flags & MFC_STATIC)) - continue; - list_del_rcu(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); - } + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { + if (!all && (c->mfc_flags & MFC_STATIC)) + continue; + rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); + list_del_rcu(&c->list); + mroute_netlink_event(mrt, c, RTM_DELROUTE); + ipmr_cache_free(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); - list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { + list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_destroy_unres(mrt, c); @@ -1791,9 +1836,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local) { + int true_vifi = ipmr_find_vif(mrt, skb->dev); int psend = -1; int vif, ct; - int true_vifi = ipmr_find_vif(mrt, skb->dev); vif = cache->mfc_parent; cache->mfc_un.res.pkt++; @@ -2091,8 +2136,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mfc_parent >= MAXVIFS) + if (c->mfc_parent >= MAXVIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; + } if (VIF_EXISTS(mrt, c->mfc_parent) && nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) @@ -2134,7 +2181,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, - struct rtmsg *rtm, int nowait, u32 portid) + struct rtmsg *rtm, u32 portid) { struct mfc_cache *cache; struct mr_table *mrt; @@ -2158,11 +2205,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, struct net_device *dev; int vif = -1; - if (nowait) { - rcu_read_unlock(); - return -EAGAIN; - } - dev = skb->dev; read_lock(&mrt_lock); if (dev) @@ -2296,34 +2338,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) struct mr_table *mrt; struct mfc_cache *mfc; unsigned int t = 0, s_t; - unsigned int h = 0, s_h; unsigned int e = 0, s_e; s_t = cb->args[0]; - s_h = cb->args[1]; - s_e = cb->args[2]; + s_e = cb->args[1]; rcu_read_lock(); ipmr_for_each_table(mrt, net) { if (t < s_t) goto next_table; - if (t > s_t) - s_h = 0; - for (h = s_h; h < MFC_LINES; h++) { - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { - if (e < s_e) - goto next_entry; - if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) - goto done; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + if (e < s_e) + goto 
next_entry; + if (ipmr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) + goto done; next_entry: - e++; - } - e = s_e = 0; + e++; } + e = 0; + s_e = 0; + spin_lock_bh(&mfc_unres_lock); list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) @@ -2340,16 +2378,15 @@ next_entry2: e++; } spin_unlock_bh(&mfc_unres_lock); - e = s_e = 0; - s_h = 0; + e = 0; + s_e = 0; next_table: t++; } done: rcu_read_unlock(); - cb->args[2] = e; - cb->args[1] = h; + cb->args[1] = e; cb->args[0] = t; return skb->len; @@ -2559,7 +2596,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) const char *name = vif->dev ? vif->dev->name : "none"; seq_printf(seq, - "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + "%2zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, @@ -2593,10 +2630,8 @@ struct ipmr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; struct list_head *cache; - int ct; }; - static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct ipmr_mfc_iter *it, loff_t pos) { @@ -2604,12 +2639,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct mfc_cache *mfc; rcu_read_lock(); - for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { - it->cache = &mrt->mfc_cache_array[it->ct]; - list_for_each_entry_rcu(mfc, it->cache, list) - if (pos-- == 0) - return mfc; - } + it->cache = &mrt->mfc_cache_list; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + if (pos-- == 0) + return mfc; rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); @@ -2636,17 +2669,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) it->mrt = mrt; it->cache = NULL; - it->ct = 0; return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) : SEQ_START_TOKEN; } static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct mfc_cache *mfc = v; struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt = it->mrt; + struct mfc_cache *mfc = v; ++*pos; @@ -2659,19 +2691,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (it->cache == &mrt->mfc_unres_queue) goto end_of_list; - BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]); - - while (++it->ct < MFC_LINES) { - it->cache = &mrt->mfc_cache_array[it->ct]; - if (list_empty(it->cache)) - continue; - return list_first_entry(it->cache, struct mfc_cache, list); - } - /* exhausted cache_array, show unresolved */ rcu_read_unlock(); it->cache = &mrt->mfc_unres_queue; - it->ct = 0; spin_lock_bh(&mfc_unres_lock); if (!list_empty(it->cache)) @@ -2691,7 +2713,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc_cache_array[it->ct]) + else if (it->cache == &mrt->mfc_cache_list) rcu_read_unlock(); } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a467e1236c43..6241a81fd7f5 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -677,11 +677,6 @@ static int copy_entries_to_user(unsigned int total_size, return PTR_ERR(counters); loc_cpu_entry = private->entries; - /* ... then copy entire thing ... */ - if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { - ret = -EFAULT; - goto free_counters; - } /* FIXME: use iterator macros --RR */ /* ... 
then go back and fix counters and names */ @@ -689,6 +684,10 @@ static int copy_entries_to_user(unsigned int total_size, const struct xt_entry_target *t; e = (struct arpt_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off, e, sizeof(*e))) { + ret = -EFAULT; + goto free_counters; + } if (copy_to_user(userptr + off + offsetof(struct arpt_entry, counters), &counters[num], @@ -698,11 +697,7 @@ static int copy_entries_to_user(unsigned int total_size, } t = arpt_get_target_c(e); - if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct xt_entry_target, - u.user.name), - t->u.kernel.target->name, - strlen(t->u.kernel.target->name)+1) != 0) { + if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 91656a1d8fbd..384b85713e06 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -826,10 +826,6 @@ copy_entries_to_user(unsigned int total_size, return PTR_ERR(counters); loc_cpu_entry = private->entries; - if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { - ret = -EFAULT; - goto free_counters; - } /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ @@ -839,6 +835,10 @@ copy_entries_to_user(unsigned int total_size, const struct xt_entry_target *t; e = (struct ipt_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off, e, sizeof(*e))) { + ret = -EFAULT; + goto free_counters; + } if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], @@ -852,23 +852,14 @@ copy_entries_to_user(unsigned int total_size, i += m->u.match_size) { m = (void *)e + i; - if (copy_to_user(userptr + off + i - + offsetof(struct xt_entry_match, - u.user.name), - m->u.kernel.match->name, - strlen(m->u.kernel.match->name)+1) - != 0) { + if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ipt_get_target_c(e); - if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct xt_entry_target, - u.user.name), - t->u.kernel.target->name, - strlen(t->u.kernel.target->name)+1) != 0) { + if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 21db00d0362b..52f26459efc3 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -144,6 +144,11 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) rcu_read_lock_bh(); c = __clusterip_config_find(net, clusterip); if (c) { +#ifdef CONFIG_PROC_FS + if (!c->pde) + c = NULL; + else +#endif if (unlikely(!atomic_inc_not_zero(&c->refcount))) c = NULL; else if (entry) @@ -166,14 +171,15 @@ clusterip_config_init_nodelist(struct clusterip_config *c, static struct clusterip_config * clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, - struct net_device *dev) + struct net_device *dev) { + struct net *net = dev_net(dev); struct clusterip_config *c; - struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id); + struct clusterip_net *cn = net_generic(net, clusterip_net_id); c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) - return NULL; + return ERR_PTR(-ENOMEM); c->dev = dev; c->clusterip = ip; @@ -185,6 +191,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, atomic_set(&c->refcount, 1); atomic_set(&c->entries, 1); + spin_lock_bh(&cn->lock); + if 
(__clusterip_config_find(net, ip)) { + spin_unlock_bh(&cn->lock); + kfree(c); + + return ERR_PTR(-EBUSY); + } + + list_add_rcu(&c->list, &cn->configs); + spin_unlock_bh(&cn->lock); + #ifdef CONFIG_PROC_FS { char buffer[16]; @@ -195,16 +212,16 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, cn->procdir, &clusterip_proc_fops, c); if (!c->pde) { + spin_lock_bh(&cn->lock); + list_del_rcu(&c->list); + spin_unlock_bh(&cn->lock); kfree(c); - return NULL; + + return ERR_PTR(-ENOMEM); } } #endif - spin_lock_bh(&cn->lock); - list_add_rcu(&c->list, &cn->configs); - spin_unlock_bh(&cn->lock); - return c; } @@ -410,9 +427,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) config = clusterip_config_init(cipinfo, e->ip.dst.s_addr, dev); - if (!config) { + if (IS_ERR(config)) { dev_put(dev); - return -ENOMEM; + return PTR_ERR(config); } dev_mc_add(config->dev, config->clustermac); } @@ -468,6 +485,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = { .checkentry = clusterip_tg_check, .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), + .usersize = offsetof(struct ipt_clusterip_tgt_info, config), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), #endif /* CONFIG_COMPAT */ diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 30c0de53e254..3240a2614e82 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -57,8 +57,7 @@ synproxy_send_tcp(struct net *net, goto free_nskb; if (nfct) { - nskb->nfct = nfct; - nskb->nfctinfo = ctinfo; + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); nf_conntrack_get(nfct); } @@ -107,8 +106,8 @@ synproxy_send_client_synack(struct net *net, synproxy_build_options(nth, opts); - synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, - niph, nth, tcp_hdr_size); + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static void @@ -230,8 +229,8 @@ synproxy_send_client_ack(struct net *net, synproxy_build_options(nth, opts); - synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, - niph, nth, tcp_hdr_size); + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static bool diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index f273098e48fd..37fb9552e858 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -63,10 +63,10 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, return dev_match || flags & XT_RPFILTER_LOOSE; } -static bool rpfilter_is_local(const struct sk_buff *skb) +static bool +rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in) { - const struct rtable *rt = skb_rtable(skb); - return rt && (rt->rt_flags & RTCF_LOCAL); + return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) @@ -79,7 +79,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) info = par->matchinfo; invert = info->flags & XT_RPFILTER_INVERT; - if (rpfilter_is_local(skb)) + if (rpfilter_is_loopback(skb, xt_in(par))) return true ^ invert; iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fcfd071f4705..bc1486f2c064 100644 --- 
a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -235,7 +235,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) } if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n", + pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", *len, sizeof(struct sockaddr_in)); return -EINVAL; } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index d075b3cf2400..73c591d8a9a8 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -128,16 +128,16 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - enum ip_conntrack_info *ctinfo, unsigned int hooknum) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; + enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; - NF_CT_ASSERT(skb->nfct == NULL); + NF_CT_ASSERT(!skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ @@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return -NF_ACCEPT; } - *ctinfo = IP_CT_RELATED; + ctinfo = IP_CT_RELATED; h = nf_conntrack_find_get(net, zone, &innertuple); if (!h) { @@ -169,11 +169,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, } if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; + ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ - skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; - skb->nfctinfo = *ctinfo; + nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); return NF_ACCEPT; } @@ -181,7 +180,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, static int icmp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) + u8 pf, unsigned int hooknum) { const struct icmphdr *icmph; struct icmphdr _ih; @@ -225,7 +224,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); + return icmp_error_message(net, tmpl, skb, hooknum); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 49bd6a54404f..346bf7ccac08 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) { + if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); @@ -75,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv, #if !IS_ENABLED(CONFIG_NF_NAT) /* Previously seen (loopback)? Ignore. Do this before fragment check. 
*/ - if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) + if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif #endif diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index cf986e1c7bbd..f0dbff05fc28 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -68,10 +68,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ - nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); + nf_reset(skb); + nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); + nf_conntrack_get(skb_nfct(skb)); #endif /* * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index b24795e2ee6d..2f3895ddc275 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -69,7 +69,7 @@ static void dump_arp_packet(struct nf_log_buf *m, ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); if (ap == NULL) { - nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]", + nf_log_buf_add(m, " INCOMPLETE [%zu bytes]", skb->len - sizeof(_arph)); return; } @@ -87,7 +87,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf, struct nf_log_buf *m; /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net)) + if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) return; m = nf_log_buf_open(); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 856648966f4c..c83a9963269b 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -319,7 +319,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, struct nf_log_buf *m; /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net)) + if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) return; m = nf_log_buf_open(); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index fd8220213afc..146d86105183 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -126,6 +126,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); + nskb->mark = IP4_REPLY_MARK(net, oldskb->mark); + skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, ip4_dst_hoplimit(skb_dst(nskb))); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 965b1a161369..2981291910dd 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -26,13 +26,6 @@ static __be32 get_saddr(__be32 addr) return addr; } -static bool fib4_is_local(const struct sk_buff *skb) -{ - const struct rtable *rt = skb_rtable(skb); - - return rt && (rt->rt_flags & RTCF_LOCAL); -} - #define DSCP_BITS 0xfc void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, @@ -95,8 +88,10 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && fib4_is_local(pkt->skb)) { - nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX); + if (nft_hook(pkt) == NF_INET_PRE_ROUTING && + 
nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { + nft_fib_store_result(dest, priv->result, pkt, + nft_in(pkt)->ifindex); return; } @@ -131,7 +126,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, switch (res.type) { case RTN_UNICAST: break; - case RTN_LOCAL: /* should not appear here, see fib4_is_local() above */ + case RTN_LOCAL: /* Should not see RTN_LOCAL here */ return; default: break; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 86cca610f4c2..2af6244b83e2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -433,9 +433,9 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto out; } - pr_debug("after bind(): num = %d, dif = %d\n", - (int)isk->inet_num, - (int)sk->sk_bound_dev_if); + pr_debug("after bind(): num = %hu, dif = %d\n", + isk->inet_num, + sk->sk_bound_dev_if); err = 0; if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) @@ -642,6 +642,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, { struct sk_buff *skb = skb_peek(&sk->sk_write_queue); + if (!skb) + return 0; pfh->wcheck = csum_partial((char *)&pfh->icmph, sizeof(struct icmphdr), pfh->wcheck); pfh->icmph.checksum = csum_fold(pfh->wcheck); @@ -848,7 +850,8 @@ out: return err; do_confirm: - dst_confirm(&rt->dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 7143ca1a6af9..69cf49e8356d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -57,15 +57,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) unsigned int frag_mem; int orphans, sockets; - local_bh_disable(); orphans = percpu_counter_sum_positive(&tcp_orphan_count); sockets = proto_sockets_allocated_sum_positive(&tcp_prot); - local_bh_enable(); socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, - atomic_read(&tcp_death_row.tw_count), sockets, + atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), @@ -264,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), + SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4e49e5cb001c..8119e1f66e03 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); + if (flags & MSG_CONFIRM) + skb_set_dst_pending_confirm(skb, 1); + skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_from_msg(iph, msg, length)) @@ -666,7 +669,8 @@ out: return len; do_confirm: - dst_confirm(&rt->dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a82a11747b3f..8471dd116771 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -154,6 +154,7 @@ static u32 
*ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); +static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, @@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = { .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, + .confirm_neigh = ipv4_confirm_neigh, }; #define ECN_OR_COST(class) TC_PRIO_##class @@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } +static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) +{ + struct net_device *dev = dst->dev; + const __be32 *pkey = daddr; + const struct rtable *rt; + + rt = (const struct rtable *)dst; + if (rt->rt_gateway) + pkey = (const __be32 *)&rt->rt_gateway; + else if (!daddr || + (rt->rt_flags & + (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) + return; + + __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); +} + #define IP_IDENTS_SZ 2048u static atomic_t *ip_idents __read_mostly; @@ -1758,7 +1777,6 @@ standard_hash: static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - const struct flowi4 *fl4, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { @@ -1858,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; + fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) @@ -1883,7 +1902,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos); out: return err; brd_input: @@ -1914,7 +1933,8 @@ local_input: } } - rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type, + rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, + flags | RTCF_LOCAL, res.type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; @@ -1989,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, { int res; + tos &= IPTOS_RT_MASK; rcu_read_lock(); /* Multicast recognition logic is moved from route cache to here. @@ -2453,7 +2474,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow); static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, - u32 seq, int event, int nowait, unsigned int flags) + u32 seq, int event) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; @@ -2462,7 +2483,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, u32 error; u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0); if (!nlh) return -EMSGSIZE; @@ -2471,7 +2492,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = fl4->flowi4_tos; - r->rtm_table = table_id; + r->rtm_table = table_id < 256 ? 
table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; @@ -2540,18 +2561,12 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, - r, nowait, portid); + r, portid); if (err <= 0) { - if (!nowait) { - if (err == 0) - return 0; - goto nla_put_failure; - } else { - if (err == -EMSGSIZE) - goto nla_put_failure; - error = err; - } + if (err == 0) + return 0; + goto nla_put_failure; } } else #endif @@ -2637,9 +2652,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) skb->protocol = htons(ETH_P_IP); skb->dev = dev; skb->mark = mark; - local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); - local_bh_enable(); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) @@ -2664,7 +2677,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWROUTE, 0, 0); + RTM_NEWROUTE); if (err < 0) goto errout_free; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 3e88467d70ee..496b97e17aaf 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -13,13 +13,13 @@ #include <linux/tcp.h> #include <linux/slab.h> #include <linux/random.h> -#include <linux/cryptohash.h> +#include <linux/siphash.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/tcp.h> #include <net/route.h> -static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; +static siphash_key_t syncookie_secret[2] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define TSBITS 6 #define TSMASK (((__u32)1 << TSBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); - static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp; - net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); - - tmp = this_cpu_ptr(ipv4_cookie_scratch); - memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); - tmp[0] = (__force u32)saddr; - tmp[1] = (__force u32)daddr; - tmp[2] = ((__force u32)sport << 16) + (__force u32)dport; - tmp[3] = count; - sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); - - return tmp[17]; + return siphash_4u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + count, &syncookie_secret[c]); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 80bc36b25de2..d6880a6149ee 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int ip_privileged_port_min; +static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; static int ip_ttl_max = 255; static int tcp_syn_retries_min = 1; @@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { - if (range[1] < range[0]) + /* Ensure that the upper limit is not smaller than the lower, + * and that the lower does not encroach upon the 
privileged + * port limit. + */ + if ((range[1] < range[0]) || + (range[0] < net->ipv4.sysctl_ip_prot_sock)) ret = -EINVAL; else set_local_port_range(net, range); @@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, return ret; } +/* Validate changes from /proc interface. */ +static int ipv4_privileged_ports(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_ip_prot_sock); + int ret; + int pports; + int range[2]; + struct ctl_table tmp = { + .data = &pports, + .maxlen = sizeof(pports), + .mode = table->mode, + .extra1 = &ip_privileged_port_min, + .extra2 = &ip_privileged_port_max, + }; + + pports = net->ipv4.sysctl_ip_prot_sock; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) { + inet_get_local_port_range(net, &range[0], &range[1]); + /* Ensure that the local port range doesn't overlap with the + * privileged port range. + */ + if (range[0] < pports) + ret = -EINVAL; + else + net->ipv4.sysctl_ip_prot_sock = pports; + } + + return ret; +} static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) { @@ -290,13 +331,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_max_tw_buckets", - .data = &tcp_death_row.sysctl_max_tw_buckets, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_fastopen", .data = &sysctl_tcp_fastopen, .maxlen = sizeof(int), @@ -310,13 +344,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_fastopen_key, }, { - .procname = "tcp_tw_recycle", - .data = &tcp_death_row.sysctl_tw_recycle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, .maxlen = sizeof(int), @@ -338,13 +365,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_max_syn_backlog", - .data = &sysctl_max_syn_backlog, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -433,13 +453,6 @@ static struct ctl_table ipv4_table[] = { .extra2 = &tcp_adv_win_scale_max, }, { - .procname = "tcp_tw_reuse", - .data = &sysctl_tcp_tw_reuse, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_frto", .data = &sysctl_tcp_frto, .maxlen = sizeof(int), @@ -565,13 +578,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_thin_dupack", - .data = &sysctl_tcp_thin_dupack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_early_retrans", .data = &sysctl_tcp_early_retrans, .maxlen = sizeof(int), @@ -958,7 +964,35 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec, + }, + { + .procname = "tcp_tw_reuse", + .data = &init_net.ipv4.sysctl_tcp_tw_reuse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_tw_buckets", + .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_tw_recycle", + 
.data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_syn_backlog", + .data = &init_net.ipv4.sysctl_max_syn_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { @@ -971,6 +1005,24 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = &one, }, #endif + { + .procname = "ip_unprivileged_port_start", + .maxlen = sizeof(int), + .data = &init_net.ipv4.sysctl_ip_prot_sock, + .mode = 0644, + .proc_handler = ipv4_privileged_ports, + }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "udp_l3mdev_accept", + .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4a044964da66..da385ae997a3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; - tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -421,15 +420,13 @@ void tcp_init_sock(struct sock *sk) sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; - local_bh_disable(); sk_sockets_allocated_inc(sk); - local_bh_enable(); } EXPORT_SYMBOL(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { - if (tsflags) { + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -536,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; + } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + /* Active TCP fastopen socket with defer_connect + * Return POLLOUT so application can call write() + * in order for kernel to generate SYN+data + */ + mask |= POLLOUT | POLLWRNORM; } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); @@ -770,6 +773,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, ret = -EAGAIN; break; } + /* if __tcp_splice_read() got nothing while we have + * an skb in receive queue, we do not want to loop. + * This might happen with URG data. 
+ */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; sk_wait_data(sk, &timeo, NULL); if (signal_pending(current)) { ret = sock_intr_errno(timeo); @@ -958,10 +967,8 @@ new_segment: copied += copy; offset += copy; size -= copy; - if (!size) { - tcp_tx_timestamp(sk, sk->sk_tsflags, skb); + if (!size) goto out; - } if (skb->len < size_goal || (flags & MSG_OOB)) continue; @@ -987,8 +994,11 @@ wait_for_memory: } out: - if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) - tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); + if (copied) { + tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk)); + if (!(flags & MSG_SENDPAGE_NOTLAST)) + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); + } return copied; do_error: @@ -1073,6 +1083,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_sock *inet = inet_sk(sk); int err, flags; if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) @@ -1087,9 +1098,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, tp->fastopen_req->data = msg; tp->fastopen_req->size = size; + if (inet->defer_connect) { + err = tcp_connect(sk); + /* Same failure procedure as in tcp_v4/6_connect */ + if (err) { + tcp_set_state(sk, TCP_CLOSE); + inet->inet_dport = 0; + sk->sk_route_caps = 0; + } + } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, msg->msg_name, - msg->msg_namelen, flags); + msg->msg_namelen, flags, 1); + inet->defer_connect = 0; *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); return err; @@ -1109,7 +1130,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) lock_sock(sk); flags = msg->msg_flags; - if (flags & MSG_FASTOPEN) { + if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); if (err == -EINPROGRESS && copied_syn > 0) goto out; @@ -1267,7 +1288,7 @@ new_segment: } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); - get_page(pfrag->page); + page_ref_inc(pfrag->page); } pfrag->offset += copy; } @@ -1281,7 +1302,6 @@ new_segment: copied += copy; if (!msg_data_left(msg)) { - tcp_tx_timestamp(sk, sockc.tsflags, skb); if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; @@ -1312,8 +1332,10 @@ wait_for_memory: } out: - if (copied) + if (copied) { + tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk)); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); + } out_nopush: release_sock(sk); return copied + copied_syn; @@ -2473,11 +2495,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; - else { - tp->thin_dupack = val; - if (tp->thin_dupack) - tcp_disable_early_retrans(tp); - } break; case TCP_REPAIR: @@ -2662,6 +2679,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EINVAL; } break; + case TCP_FASTOPEN_CONNECT: + if (val > 1 || val < 0) { + err = -EINVAL; + } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + if (sk->sk_state == TCP_CLOSE) + tp->fastopen_connect = val; + else + err = -EINVAL; + } else { + err = -EOPNOTSUPP; + } + break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; @@ -2764,6 +2793,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_sacked = sk->sk_max_ack_backlog; return; } + + slow = lock_sock_fast(sk); + info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes 
= icsk->icsk_probes_out; @@ -2814,15 +2846,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; - slow = lock_sock_fast(sk); - info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); tcp_get_info_chrono_stats(tp, info); - unlock_sock_fast(sk, slow); - info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; @@ -2838,6 +2866,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) do_div(rate64, intv); info->tcpi_delivery_rate = rate64; } + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2847,7 +2876,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) struct sk_buff *stats; struct tcp_info info; - stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); if (!stats) return NULL; @@ -2858,6 +2887,10 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) info.tcpi_rwnd_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, info.tcpi_sndbuf_limited, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, + tp->data_segs_out, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, + tp->total_retrans, TCP_NLA_PAD); return stats; } @@ -2967,8 +3000,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; + case TCP_THIN_DUPACK: - val = tp->thin_dupack; + val = 0; break; case TCP_REPAIR: @@ -3021,6 +3055,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = icsk->icsk_accept_queue.fastopenq.max_qlen; break; + case TCP_FASTOPEN_CONNECT: + val = tp->fastopen_connect; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; @@ -3334,6 +3372,7 @@ void __init tcp_init(void) percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); + inet_hashinfo_init(&tcp_hashinfo); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, @@ -3377,10 +3416,7 @@ void __init tcp_init(void) cnt = tcp_hashinfo.ehash_mask + 1; - - tcp_death_row.sysctl_max_tw_buckets = cnt / 2; sysctl_tcp_max_orphans = cnt / 2; - sysctl_max_syn_backlog = max(128, cnt / 256); tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ @@ -3399,6 +3435,7 @@ void __init tcp_init(void) pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); + tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 4e777a3243f9..8ea4e9787f82 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -113,7 +113,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, struct tcp_fastopen_cookie tmp; if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { - struct in6_addr *buf = (struct in6_addr *) tmp.val; + struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) @@ -205,6 +205,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, * scaled. So correct it appropriately. */ tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + tp->max_window = tp->snd_wnd; /* Activate the retrans timer so that SYNACK can be retransmitted. 
* The request socket is not added to the ehash @@ -325,3 +326,57 @@ fastopen: *foc = valid_foc; return NULL; } + +bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, + struct tcp_fastopen_cookie *cookie) +{ + unsigned long last_syn_loss = 0; + int syn_loss = 0; + + tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); + + /* Recurring FO SYN losses: no cookie or data in SYN */ + if (syn_loss > 1 && + time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { + cookie->len = -1; + return false; + } + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + cookie->len = -1; + return true; + } + return cookie->len > 0; +} + +/* This function checks if we want to defer sending SYN until the first + * write(). We defer under the following conditions: + * 1. fastopen_connect sockopt is set + * 2. we have a valid cookie + * Return value: return true if we want to defer until application writes data + * return false if we want to send out SYN immediately + */ +bool tcp_fastopen_defer_connect(struct sock *sk, int *err) +{ + struct tcp_fastopen_cookie cookie = { .len = 0 }; + struct tcp_sock *tp = tcp_sk(sk); + u16 mss; + + if (tp->fastopen_connect && !tp->fastopen_req) { + if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { + inet_sk(sk)->defer_connect = 1; + return true; + } + + /* Alloc fastopen_req in order for FO option to be included + * in SYN + */ + tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req), + sk->sk_allocation); + if (tp->fastopen_req) + tp->fastopen_req->cookie = cookie; + else + *err = -ENOBUFS; + } + return false; +} +EXPORT_SYMBOL(tcp_fastopen_defer_connect); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6c790754ae3e..39c393cc0fd3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,7 @@ int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; -int sysctl_tcp_fack __read_mostly = 1; +int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; - -int sysctl_tcp_thin_dupack __read_mostly; - int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -904,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tcp_disable_fack(tp); } - if (metric > 0) - tcp_disable_early_retrans(tp); tp->rack.reord = 1; } @@ -916,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = skb; - - if (!tp->lost_out || - after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } /* Sum the number of packets on the wire we have marked as lost. 
@@ -1135,6 +1126,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */ struct rate_sample *rate; int flag; }; @@ -1217,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, xmit_time, sacked); + tcp_rack_advance(tp, sacked, end_seq, + xmit_time, &state->ack_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -1937,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sk_buff *skb; - bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ bool mark_lost; @@ -1982,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); @@ -1998,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk) tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); - /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous - * loss recovery is underway except recurring timeout(s) on - * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing + /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO + * if a previous recovery is underway, otherwise it may incorrectly + * call a timeout spurious if some previously retransmitted packets + * are s/acked (sec 3.2). We do not apply that restriction since + * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS + * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO + * on PMTU discovery to avoid sending new data. */ - tp->frto = sysctl_tcp_frto && - (new_recovery || icsk->icsk_retransmits) && - !inet_csk(sk)->icsk_mtup.probe_size; + tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2056,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } -static bool tcp_pause_early_retransmit(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned long delay; - - /* Delay early retransmit and entering fast recovery for - * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples - * available, or RTO is scheduled to fire first. - */ - if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt_us) - return false; - - delay = max(usecs_to_jiffies(tp->srtt_us >> 5), - msecs_to_jiffies(2)); - - if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) - return false; - - inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, - TCP_RTO_MAX); - return true; -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2127,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * F.e. after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * - * Essentially, we have now two algorithms counting + * Essentially, we have now a few algorithms detecting * lost packets. * - * FACK: It is the simplest heuristics. As soon as we decided + * If the receiver supports SACK: + * + * RFC6675/3517: It is the conventional algorithm.
A packet is + considered lost if the number of higher sequence packets + SACKed is greater than or equal to the DUPACK threshold + (reordering). This is implemented in tcp_mark_head_lost and + tcp_update_scoreboard. + * + * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm + * (2017-) that checks timing instead of counting DUPACKs. + * Essentially a packet is considered lost if it's not S/ACKed + * after RTT + reordering_window, where both metrics are + * dynamically measured and adjusted. This is implemented in + * tcp_rack_mark_lost. + * + * FACK (Disabled by default. Subsumed by RACK): + * It is the simplest heuristics. As soon as we decided * that something is lost, we decide that _all_ not SACKed * packets until the most forward SACK are lost. I.e. * lost_out = fackets_out - sacked_out and left_out = fackets_out. @@ -2139,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * takes place. We use FACK by default until reordering * is suspected on the path to this destination. * - * NewReno: when Recovery is entered, we assume that one segment + * If the receiver does not support SACK: + * + * NewReno (RFC6582): in Recovery we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). These heuristics are the same in NewReno * and SACK. * - * Imagine, that's all! Forget about all this shamanism about CWND inflation - * deflation etc. CWND is real congestion window, never inflated, changes - * only according to classic VJ rules. - * * Really tricky (and requiring careful tuning) part of algorithm * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, @@ -2176,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - __u32 packets_out; - int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. */ if (tp->lost_out) @@ -2187,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#4: It is still not OK... But will it be useful to delay - * recovery more? - */ - packets_out = tp->packets_out; - if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && - !tcp_may_send_now(sk)) { - /* We have nothing to send. This connection is limited - * either by receiver window or by application. - */ - return true; - } - - /* If a thin stream is detected, retransmit after first - * received dupack. Employ only if SACK is supported in order - * to avoid possible corner-case series of spurious retransmissions - * Use only if there are no unsent data. - */ - if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && - tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && - tcp_is_sack(tp) && !tcp_send_head(sk)) - return true; - - /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious - * retransmissions due to small network reorderings, we implement - * Mitigation A.3 in the RFC and delay the retransmission for a short - * interval if appropriate.
- */ - if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && - !tcp_may_send_now(sk)) - return !tcp_pause_early_retransmit(sk, flag); - return false; } @@ -2521,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, - int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2690,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk) } EXPORT_SYMBOL(tcp_simple_retransmit); -static void tcp_enter_recovery(struct sock *sk, bool ece_ack) +void tcp_enter_recovery(struct sock *sk, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); int mib_idx; @@ -2726,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, tcp_try_undo_loss(sk, false)) return; - if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - if ((flag & FLAG_ORIG_SACK_ACKED) && - tcp_try_undo_loss(sk, true)) - return; + /* The ACK (s)acks some never-retransmitted data meaning not all + * the data packets before the timeout were lost. Therefore we + * undo the congestion window and state. This is essentially + * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since + * a retransmitted skb is permanently marked, we can apply such an + * operation even if F-RTO was not used. + */ + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, tp->undo_marker)) + return; + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ if (after(tp->snd_nxt, tp->high_seq)) { if (flag & FLAG_DATA_SACKED || is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ @@ -2800,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) return false; } +static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, + const struct skb_mstamp *ack_time) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + u32 prior_retrans = tp->retrans_out; + + tcp_rack_mark_lost(sk, ack_time); + if (prior_retrans > tp->retrans_out) + *ack_flag |= FLAG_LOST_RETRANS; + } +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2813,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - bool is_dupack, int *ack_flag, int *rexmit) + bool is_dupack, int *ack_flag, int *rexmit, + const struct skb_mstamp *ack_time) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2864,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } - /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) { - flag |= FLAG_LOST_RETRANS; - *ack_flag |= FLAG_LOST_RETRANS; - } - /* E. Process state.
*/ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: @@ -2888,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_keep_open(sk); return; } + tcp_rack_identify_loss(sk, ack_flag, ack_time); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - if (icsk->icsk_ca_state != TCP_CA_Open && - !(flag & FLAG_LOST_RETRANS)) + tcp_rack_identify_loss(sk, ack_flag, ack_time); + if (!(icsk->icsk_ca_state == TCP_CA_Open || + (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ default: @@ -2906,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); + tcp_rack_identify_loss(sk, ack_flag, ack_time); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; @@ -3024,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk) } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = @@ -3041,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk) } } -/* This function is called when the delayed ER timer fires. TCP enters - * fast recovery and performs fast-retransmit. - */ -void tcp_resume_early_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_rearm_rto(sk); - - /* Stop if ER is disabled after the delayed ER timer is scheduled */ - if (!tp->do_early_retrans) - return; - - tcp_enter_recovery(sk, false); - tcp_update_scoreboard(sk, 1); - tcp_xmit_retransmit_queue(sk); -} - /* If we get here, the whole TSO packet has not been acked. 
*/ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3101,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack, - struct skb_mstamp *now) + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt; + struct skb_mstamp *now = &sack->ack_time; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3165,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } else if (tcp_is_sack(tp)) { tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + tcp_rack_advance(tp, sacked, scb->end_seq, + &skb->skb_mstamp, + &sack->ack_time); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3595,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - struct skb_mstamp now; sack_state.first_sackt.v64 = 0; sack_state.rate = &rs; @@ -3621,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - skb_mstamp_get(&now); + skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) + if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) { @@ -3689,34 +3634,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state, &now); + &sack_state); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { - struct dst_entry *dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); - } + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) + sk_dst_confirm(sk); if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag, &rs); + tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time, + sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3737,9 +3682,11 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. 
*/ if (TCP_SKB_CB(skb)->sacked) { + skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); tcp_xmit_recovery(sk, rexmit); } @@ -4557,6 +4504,7 @@ add_sack: end: if (skb) { tcp_grow_window(sk, skb); + skb_condense(skb); skb_set_owner_r(skb, sk); } } @@ -5078,7 +5026,7 @@ static void tcp_check_space(struct sock *sk) if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); /* pairs with tcp_poll() */ - smp_mb__after_atomic(); + smp_mb(); if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { tcp_new_space(sk); @@ -5249,6 +5197,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) return err; } +/* Accept RST for rcv_nxt - 1 after a FIN. + * When tcp connections are abruptly terminated from Mac OSX (via ^C), a + * FIN is sent followed by a RST packet. The RST is sent with the same + * sequence number as the FIN, and thus according to RFC 5961 a challenge + * ACK should be sent. However, Mac OSX rate limits replies to challenge + * ACKs on the closed socket. In addition middleboxes can drop either the + * challenge ACK or a subsequent RST. + */ +static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && + (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | + TCPF_CLOSING)); +} + /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5287,20 +5252,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); + } else if (tcp_reset_check(sk, skb)) { + tcp_reset(sk); } goto discard; } /* Step 2: check RST bit */ if (th->rst) { - /* RFC 5961 3.2 (extend to match against SACK too if available): - * If seq num matches RCV.NXT or the right-most SACK block, + /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a + * FIN and SACK too if available): + * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or + * the right-most SACK block, * then * RESET the connection * else * Send a challenge ACK */ - if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || + tcp_reset_check(sk, skb)) { rst_seq_match = true; } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { struct tcp_sack_block *sp = &tp->selective_acks[0]; @@ -5916,9 +5886,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->syn) { if (th->fin) goto discard; - if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) - return 1; + /* It is possible that we process SYN packets from backlog, + * so we need to make sure to disable BH right there. 
+ */ + local_bh_disable(); + acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + if (!acceptable) + return 1; consume_skb(skb); return 0; } @@ -6022,7 +5998,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; case TCP_FIN_WAIT1: { - struct dst_entry *dst; int tmo; /* If we enter the TCP_FIN_WAIT1 state and we are a @@ -6049,9 +6024,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); + sk_dst_confirm(sk); if (!sock_flag(sk, SOCK_DEAD)) { /* Wake up lingering close() */ @@ -6363,7 +6336,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ - if (tcp_death_row.sysctl_tw_recycle) { + if (net->ipv4.tcp_death_row.sysctl_tw_recycle) { bool strict; dst = af_ops->route_req(sk, &fl, req, &strict); @@ -6377,8 +6350,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } /* Kill the following clause, if you dislike this way. */ else if (!net->ipv4.sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && + (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (net->ipv4.sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, tmp_opt.saw_tstamp)) { /* Without syncookies last quarter of diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 30d81f533ada..9a89b8deafae 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -84,7 +84,6 @@ #include <crypto/hash.h> #include <linux/scatterlist.h> -int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; #ifdef CONFIG_TCP_MD5SIG @@ -120,7 +119,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && + (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) @@ -146,7 +145,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct flowi4 *fl4; struct rtable *rt; int err; + u32 seq; struct ip_options_rcu *inet_opt; + struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; @@ -197,7 +198,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = 0; } - if (tcp_death_row.sysctl_tw_recycle && + if (tcp_death_row->sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) tcp_fetch_timewait_stamp(sk, &rt->dst); @@ -216,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); - err = inet_hash_connect(&tcp_death_row, sk); + err = inet_hash_connect(tcp_death_row, sk); if (err) goto failure; @@ -232,19 +233,27 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* OK, now commit destination to socket. 
*/ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); + rt = NULL; - if (!tp->write_seq && likely(!tp->repair)) - tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port, - &tp->tsoffset); + if (likely(!tp->repair)) { + seq = secure_tcp_sequence_number(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port, + &tp->tsoffset); + if (!tp->write_seq) + tp->write_seq = seq; + } inet->inet_id = tp->write_seq ^ jiffies; + if (tcp_fastopen_defer_connect(sk, &err)) + return err; + if (err) + goto failure; + err = tcp_connect(sk); - rt = NULL; if (err) goto failure; @@ -1319,10 +1328,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); - newtp->advmss = dst_metric_advmss(dst); - if (tcp_sk(sk)->rx_opt.user_mss && - tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) - newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); @@ -1556,8 +1562,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) * It has been noticed pure SACK packets were sometimes dropped * (if cooked by drivers without copybreak feature). */ - if (!skb->data_len) - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + skb_condense(skb); if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); @@ -1817,7 +1822,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), - .bind_conflict = inet_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, @@ -1888,9 +1892,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_free_fastopen_req(tp); tcp_saved_syn_free(tp); - local_bh_disable(); sk_sockets_allocated_dec(sk); - local_bh_enable(); } EXPORT_SYMBOL(tcp_v4_destroy_sock); @@ -2229,7 +2231,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; @@ -2376,6 +2378,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, @@ -2419,7 +2422,7 @@ static void __net_exit tcp_sk_exit(struct net *net) static int __net_init tcp_sk_init(struct net *net) { - int res, cpu; + int res, cpu, cnt; net->ipv4.tcp_sk = alloc_percpu(struct sock *); if (!net->ipv4.tcp_sk) @@ -2456,6 +2459,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_orphan_retries = 0; net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; + net->ipv4.sysctl_tcp_tw_reuse = 0; + + cnt = tcp_hashinfo.ehash_mask + 1; + net->ipv4.tcp_death_row.sysctl_tw_recycle = 0; + net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; + net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; + + net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); return 0; fail: @@ -2466,7 +2477,7 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { - inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); 
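With tcp_death_row now per network namespace, the tcp_sk_init() hunk above sizes the defaults from the established-connections hash table instead of NR_FILE: cnt = ehash_mask + 1, max_tw_buckets = (cnt + 1) / 2 and max_syn_backlog = max(128, cnt / 256). Worked through for an assumed ehash of 2^19 buckets (the real size is picked from available memory at boot, so the number below is only an example):

/* Worked example of the new per-netns defaults; 524288 is an assumed ehash size. */
#include <stdio.h>

int main(void)
{
	unsigned int cnt = 524288;                     /* ehash_mask + 1 (example) */
	unsigned int max_tw_buckets = (cnt + 1) / 2;   /* 262144 */
	unsigned int max_syn_backlog = cnt / 256;      /* 2048 */

	if (max_syn_backlog < 128)
		max_syn_backlog = 128;                 /* lower bound from max(128, ...) */

	printf("max_tw_buckets=%u max_syn_backlog=%u\n",
	       max_tw_buckets, max_syn_backlog);
	return 0;
}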
+ inet_twsk_purge(&tcp_hashinfo, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { @@ -2477,7 +2488,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { void __init tcp_v4_init(void) { - inet_hashinfo_init(&tcp_hashinfo); if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d46f4d5b1c62..0f46e5fe31ad 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk) u32 val; int m; + sk_dst_confirm(sk); if (sysctl_tcp_nometrics_save || !dst) return; - if (dst->flags & DST_HOST) - dst_confirm(dst); - rcu_read_lock(); if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? @@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk) struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ + sk_dst_confirm(sk); if (!dst) goto reset; - dst_confirm(dst); - rcu_read_lock(); tm = tcp_get_metrics(sk, dst, true); if (!tm) { @@ -522,7 +519,6 @@ void tcp_init_metrics(struct sock *sk) val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) { tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); tp->reordering = val; } @@ -606,7 +602,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, return ret; } -EXPORT_SYMBOL_GPL(tcp_peer_is_proven); void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 28ce5ee831f5..7e16243cdb58 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -29,12 +29,6 @@ int sysctl_tcp_abort_on_overflow __read_mostly; -struct inet_timewait_death_row tcp_death_row = { - .sysctl_max_tw_buckets = NR_FILE * 2, - .hashinfo = &tcp_hashinfo, -}; -EXPORT_SYMBOL_GPL(tcp_death_row); - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -100,13 +94,15 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; + struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tcp_parse_options(skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { - tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; + if (tmp_opt.rcv_tsecr) + tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = tcptw->tw_ts_recent; tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); @@ -153,7 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - if (tcp_death_row.sysctl_tw_recycle && + if (tcp_death_row->sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) inet_twsk_reschedule(tw, tw->tw_timeout); @@ -264,11 +260,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct tcp_sock *tp = tcp_sk(sk); struct inet_timewait_sock *tw; bool recycle_ok = false; + struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; - if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) + if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tcp_remember_stamp(sk); - tw = inet_twsk_alloc(sk, 
&tcp_death_row, state); + tw = inet_twsk_alloc(sk, tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -364,15 +361,12 @@ void tcp_openreq_init_rwin(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); - u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); int full_space = tcp_full_space(sk_listener); - int mss = dst_metric_advmss(dst); u32 window_clamp; __u8 rcv_wscale; + int mss; - if (user_mss && user_mss < mss) - mss = user_mss; - + mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); @@ -472,7 +466,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->sacked_out = 0; newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; newtp->lsndtime = treq->snt_synack.stamp_jiffies; newsk->sk_txhash = treq->txhash; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d5331a1b1dc..22548b5f05cb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -76,16 +76,15 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - } NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); } -/* SND.NXT, if window was not shrunk. +/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one + * window scaling factor due to loss of precision. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already @@ -95,7 +94,9 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - if (!before(tcp_wnd_end(tp), tp->snd_nxt)) + if (!before(tcp_wnd_end(tp), tp->snd_nxt) || + (tp->rx_opt.wscale_ok && + ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) return tp->snd_nxt; else return tcp_wnd_end(tp); @@ -966,6 +967,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); + /* If we had to use memory reserve to allocate this skb, + * this might cause drops if packet is looped back : + * Other socket might not have SOCK_MEMALLOC. + * Packets not looped back do not care about pfmemalloc. + */ + skb->pfmemalloc = 0; + skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -975,6 +983,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); + skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); + /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; @@ -2289,8 +2299,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) u32 timeout, tlp_time_stamp, rto_time_stamp; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) - return false; /* No consecutive loss probes. 
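The tcp_acceptable_seq() change above accepts SND.NXT even when it sits slightly past the computed window edge, as long as the overshoot is smaller than one window-scale unit: with scaling in use, window values can only be expressed in multiples of 2^wscale bytes, so a shortfall below that granularity is rounding rather than a real window shrink. A small worked example with assumed numbers, mirroring the added condition outside the kernel:

/* Worked example of the relaxed check added to tcp_acceptable_seq() above.
 * With a window-scale shift of 7 the window is only expressible in 128-byte
 * units, so an apparent overshoot below 128 is treated as rounding.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t acceptable_seq(uint32_t snd_nxt, uint32_t wnd_end, int wscale)
{
	if ((int32_t)(wnd_end - snd_nxt) >= 0)     /* snd_nxt still inside the window */
		return snd_nxt;
	if (snd_nxt - wnd_end < (1u << wscale))    /* shrunk by less than one unit */
		return snd_nxt;
	return wnd_end;
}

int main(void)
{
	printf("%u\n", acceptable_seq(100100, 100000, 7)); /* 100100: within precision */
	printf("%u\n", acceptable_seq(100300, 100000, 7)); /* 100000: genuinely shrunk */
	return 0;
}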
*/ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { tcp_rearm_rto(sk); @@ -2309,8 +2317,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || - !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + !tp->packets_out || !tcp_is_sack(tp) || + icsk->icsk_ca_state != TCP_CA_Open) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && @@ -2518,9 +2527,11 @@ u32 __tcp_select_window(struct sock *sk) int full_space = min_t(int, tp->window_clamp, allowed_space); int window; - if (mss > full_space) + if (unlikely(mss > full_space)) { mss = full_space; - + if (mss <= 0) + return 0; + } if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; @@ -2774,6 +2785,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); + /* Update global and local TCP statistics. */ + segs = tcp_skb_pcount(skb); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + tp->total_retrans += segs; + /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. @@ -2791,14 +2809,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) } if (likely(!err)) { - segs = tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; - /* Update global TCP statistics. */ - TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - tp->total_retrans += segs; + } else if (err != -EBUSY) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } return err; } @@ -2821,8 +2834,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp(skb); - } else if (err != -EBUSY) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } if (tp->undo_retrans < 0) @@ -2831,36 +2842,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return err; } -/* Check if we forward retransmits are possible in the current - * window/congestion state. - */ -static bool tcp_can_forward_retransmit(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_sock *tp = tcp_sk(sk); - - /* Forward retransmissions are possible only during Recovery. */ - if (icsk->icsk_ca_state != TCP_CA_Recovery) - return false; - - /* No forward retransmissions in Reno are possible. */ - if (tcp_is_reno(tp)) - return false; - - /* Yeah, we have to make difficult choice between forward transmission - * and retransmission... Both ways have their merits... - * - * For now we do not retransmit anything, while we have some new - * segments to send. In the other cases, follow rule 3 for - * NextSeg() specified in RFC3517. - */ - - if (tcp_may_send_now(sk)) - return false; - - return true; -} - /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. 
It tries to continue * resending the rest of the retransmit queue, until either @@ -2875,24 +2856,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; - u32 max_segs, last_lost; + u32 max_segs; int mib_idx; - int fwd_rexmitting = 0; if (!tp->packets_out) return; - if (!tp->lost_out) - tp->retransmit_high = tp->snd_una; - if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; - last_lost = TCP_SKB_CB(skb)->end_seq; - if (after(last_lost, tp->retransmit_high)) - last_lost = tp->retransmit_high; } else { skb = tcp_write_queue_head(sk); - last_lost = tp->snd_una; } max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); @@ -2915,31 +2888,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ segs = min_t(int, segs, max_segs); - if (fwd_rexmitting) { -begin_fwd: - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - break; - mib_idx = LINUX_MIB_TCPFORWARDRETRANS; - - } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { - tp->retransmit_high = last_lost; - if (!tcp_can_forward_retransmit(sk)) - break; - /* Backtrack if necessary to non-L'ed skb */ - if (hole) { - skb = hole; - hole = NULL; - } - fwd_rexmitting = 1; - goto begin_fwd; - + if (tp->retrans_out >= tp->lost_out) { + break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { - last_lost = TCP_SKB_CB(skb)->end_seq; if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else @@ -2960,7 +2916,8 @@ begin_fwd: if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk)) + if (skb == tcp_write_queue_head(sk) && + icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); @@ -3117,7 +3074,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct sk_buff *skb; int tcp_header_size; struct tcphdr *th; - u16 user_mss; int mss; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); @@ -3147,10 +3103,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, } skb_dst_set(skb, dst); - mss = dst_metric_advmss(dst); - user_mss = READ_ONCE(tp->rx_opt.user_mss); - if (user_mss && user_mss < mss) - mss = user_mss; + mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES @@ -3256,9 +3209,7 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); - tp->advmss = dst_metric_advmss(dst); - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) - tp->advmss = tp->rx_opt.user_mss; + tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); @@ -3324,31 +3275,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, err = 0; - unsigned long last_syn_loss = 0; + int space, err = 0; struct sk_buff *syn_data; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ - tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, - &syn_loss, &last_syn_loss); - /* Recurring FO SYN losses: revert to regular handshake temporarily */ - if (syn_loss > 1 && - time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { - fo->cookie.len = -1; - goto fallback; - } - - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) - 
fo->cookie.len = -1; - else if (fo->cookie.len <= 0) + if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and * user-MSS. Reserve maximum option space for middleboxes that add * private TCP options. The cost is reduced data space in SYN :( */ - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) - tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); + space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index e36df4fcfeba..4ecb38ae8504 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,9 +1,32 @@ #include <linux/tcp.h> #include <net/tcp.h> -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; -/* Marks a packet lost, if some packet sent later has been (s)acked. +static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_skb_mark_lost_uncond_verify(tp, skb); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + /* Account for retransmits that are lost again */ + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); + } +} + +static bool tcp_rack_sent_after(const struct skb_mstamp *t1, + const struct skb_mstamp *t2, + u32 seq1, u32 seq2) +{ + return skb_mstamp_after(t1, t2) || + (t1->v64 == t2->v64 && after(seq1, seq2)); +} + +/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): + * + * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * @@ -16,31 +39,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * - * The current version is only used after recovery starts but can be - * easily extended to detect the first loss. + * When tcp_rack_detect_loss() detects some packets are lost and we + * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() + * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will + * make us enter the CA_Recovery state. */ -int tcp_rack_mark_lost(struct sock *sk) +static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, + u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - u32 reo_wnd, prior_retrans = tp->retrans_out; - - if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) - return 0; - - /* Reset the advanced flag to avoid unnecessary queue scanning */ - tp->rack.advanced = 0; + u32 reo_wnd; + *reo_timeout = 0; /* To be more reordering resilient, allow min_rtt/4 settling delay * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed * RTT because reordering is often a path property and less related * to queuing or delayed ACKs. - * - * TODO: measure and adapt to the observed reordering delay, and - * use a timer to retransmit like the delayed early retransmit. 
*/ reo_wnd = 1000; - if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); tcp_for_write_queue(skb, sk) { @@ -54,20 +72,29 @@ int tcp_rack_mark_lost(struct sock *sk) scb->sacked & TCPCB_SACKED_ACKED) continue; - if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) { + /* Step 3 in draft-cheng-tcpm-rack-00.txt: + * A packet is lost if its elapsed time is beyond + * the recent RTT plus the reordering window. + */ + u32 elapsed = skb_mstamp_us_delta(now, + &skb->skb_mstamp); + s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - if (skb_mstamp_us_delta(&tp->rack.mstamp, - &skb->skb_mstamp) <= reo_wnd) + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); continue; - - /* skb is lost if packet sent later is sacked */ - tcp_skb_mark_lost_uncond_verify(tp, skb); - if (scb->sacked & TCPCB_SACKED_RETRANS) { - scb->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPLOSTRETRANSMIT); } + + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) + continue; + + /* Record maximum wait time (+1 to avoid 0) */ + *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + } else if (!(scb->sacked & TCPCB_RETRANS)) { /* Original data are sent sequentially so stop early * b/c the rest are all sent after rack_sent @@ -75,20 +102,43 @@ int tcp_rack_mark_lost(struct sock *sk) break; } } - return prior_retrans - tp->retrans_out; } -/* Record the most recently (re)sent time among the (s)acked packets */ -void tcp_rack_advance(struct tcp_sock *tp, - const struct skb_mstamp *xmit_time, u8 sacked) +void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout; + + if (!tp->rack.advanced) + return; + + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + tcp_rack_detect_loss(sk, now, &timeout); + if (timeout) { + timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, + timeout, inet_csk(sk)->icsk_rto); + } +} + +/* Record the most recently (re)sent time among the (s)acked packets + * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from + * draft-cheng-tcpm-rack-00.txt + */ +void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, + const struct skb_mstamp *xmit_time, + const struct skb_mstamp *ack_time) { + u32 rtt_us; + if (tp->rack.mstamp.v64 && - !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, + end_seq, tp->rack.end_seq)) return; + rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); if (sacked & TCPCB_RETRANS) { - struct skb_mstamp now; - /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -99,11 +149,35 @@ void tcp_rack_advance(struct tcp_sock *tp, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ - skb_mstamp_get(&now); - if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + if (rtt_us < tcp_min_rtt(tp)) return; } - + tp->rack.rtt_us = rtt_us; tp->rack.mstamp = *xmit_time; + tp->rack.end_seq = end_seq; tp->rack.advanced = 1; } + +/* We have waited long enough to accommodate reordering. 
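The tcp_rack_detect_loss() logic above reduces to one comparison per outstanding packet: a packet is declared lost once the time since it was (re)sent exceeds the RTT of the most recently delivered packet plus a reordering window (1 ms, widened to min_rtt/4 when reordering has been observed); if it has not waited that long yet, the shortfall becomes the REO timeout to arm. A condensed standalone restatement of that rule, with example timings in microseconds:

/* Condensed restatement of the RACK rule from tcp_rack_detect_loss() above.
 * rack_rtt_us is the RTT of the most recently (s)acked packet, reo_wnd_us the
 * reordering window.  All values are examples, not kernel state.
 */
#include <stdint.h>
#include <stdio.h>

/* Returns 1 if a packet sent elapsed_us ago should be marked lost now;
 * otherwise stores how much longer to wait in *wait_us (REO timeout). */
static int rack_lost(uint32_t elapsed_us, uint32_t rack_rtt_us,
		     uint32_t reo_wnd_us, uint32_t *wait_us)
{
	int32_t remaining = (int32_t)(rack_rtt_us + reo_wnd_us - elapsed_us);

	if (remaining < 0)
		return 1;               /* waited long enough: mark lost */
	*wait_us = remaining + 1;       /* +1 as in the patch, to avoid 0 */
	return 0;
}

int main(void)
{
	uint32_t wait = 0;

	/* sent 45 ms ago, RACK RTT 40 ms, reo window 1 ms -> already lost */
	printf("%d\n", rack_lost(45000, 40000, 1000, &wait));
	/* sent 39 ms ago under the same RTT -> arm REO timer for ~2 ms */
	printf("%d wait=%u\n", rack_lost(39000, 40000, 1000, &wait), wait);
	return 0;
}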
Mark the expired + * packets lost and retransmit them. + */ +void tcp_rack_reo_timeout(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp now; + u32 timeout, prior_inflight; + + skb_mstamp_get(&now); + prior_inflight = tcp_packets_in_flight(tp); + tcp_rack_detect_loss(sk, &now, &timeout); + if (prior_inflight != tcp_packets_in_flight(tp)) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { + tcp_enter_recovery(sk, false); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_cwnd_reduction(sk, 1, 0); + } + tcp_xmit_retransmit_queue(sk); + } + if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) + tcp_rearm_rto(sk); +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3705075f42c3..40d893556e67 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -563,8 +563,8 @@ void tcp_write_timer_handler(struct sock *sk) event = icsk->icsk_pending; switch (event) { - case ICSK_TIME_EARLY_RETRANS: - tcp_resume_early_retransmit(sk); + case ICSK_TIME_REO_TIMEOUT: + tcp_rack_reo_timeout(sk); break; case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); @@ -617,6 +617,7 @@ void tcp_set_keepalive(struct sock *sk, int val) else if (!val) inet_csk_delete_keepalive_timer(sk); } +EXPORT_SYMBOL_GPL(tcp_set_keepalive); static void tcp_keepalive_timer (unsigned long data) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1307a7c2e544..ea6e4cff9faf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -134,14 +134,21 @@ EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) +/* IPCB reference means this can not be used from early demux */ +static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_udp_l3mdev_accept && + skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) + return true; +#endif + return false; +} + static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, - struct sock *sk, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard), - unsigned int log) + struct sock *sk, unsigned int log) { struct sock *sk2; kuid_t uid = sock_i_uid(sk); @@ -153,13 +160,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (!sk2->sk_reuseport || !sk->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || - !uid_eq(uid, sock_i_uid(sk2))) && - saddr_comp(sk, sk2, true)) { - if (!bitmap) - return 1; - __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); + inet_rcv_saddr_equal(sk, sk2, true)) { + if (sk2->sk_reuseport && sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(uid, sock_i_uid(sk2))) { + if (!bitmap) + return 0; + } else { + if (!bitmap) + return 1; + __set_bit(udp_sk(sk2)->udp_port_hash >> log, + bitmap); + } } } return 0; @@ -171,10 +183,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, - struct sock *sk, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)) + struct sock *sk) { struct sock *sk2; kuid_t uid = sock_i_uid(sk); @@ -188,11 +197,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == 
sk->sk_bound_dev_if) && - (!sk2->sk_reuseport || !sk->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || - !uid_eq(uid, sock_i_uid(sk2))) && - saddr_comp(sk, sk2, true)) { - res = 1; + inet_rcv_saddr_equal(sk, sk2, true)) { + if (sk2->sk_reuseport && sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(uid, sock_i_uid(sk2))) { + res = 0; + } else { + res = 1; + } break; } } @@ -200,10 +212,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, return res; } -static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, - int (*saddr_same)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)) +static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) { struct net *net = sock_net(sk); kuid_t uid = sock_i_uid(sk); @@ -217,7 +226,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && - (*saddr_same)(sk, sk2, false)) { + inet_rcv_saddr_equal(sk, sk2, false)) { return reuseport_add_sock(sk, sk2); } } @@ -233,14 +242,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, * * @sk: socket struct in question * @snum: port number to look up - * @saddr_comp: AF-dependent comparison of bound local IP addresses * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -269,7 +274,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, - saddr_comp, udptable->log); + udptable->log); snum = first; /* @@ -285,6 +290,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, snum += rand; } while (snum != first); spin_unlock_bh(&hslot->lock); + cond_resched(); } while (++first != last); goto fail; } else { @@ -301,12 +307,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, if (hslot->count < hslot2->count) goto scan_primary_hash; - exist = udp_lib_lport_inuse2(net, snum, hslot2, - sk, saddr_comp); + exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); if (!exist && (hash2_nulladdr != slot2)) { hslot2 = udp_hashslot2(udptable, hash2_nulladdr); exist = udp_lib_lport_inuse2(net, snum, hslot2, - sk, saddr_comp); + sk); } if (exist) goto fail_unlock; @@ -314,8 +319,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, goto found; } scan_primary_hash: - if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, - saddr_comp, 0)) + if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0)) goto fail_unlock; } found: @@ -324,7 +328,7 @@ found: udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { if (sk->sk_reuseport && - udp_reuseport_add_sock(sk, hslot, saddr_comp)) { + udp_reuseport_add_sock(sk, hslot)) { inet_sk(sk)->inet_num = 0; udp_sk(sk)->udp_port_hash = 0; udp_sk(sk)->udp_portaddr_hash ^= snum; @@ -356,24 +360,6 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses - * match_wildcard == false: addresses must be exactly the same, i.e. 
- * 0.0.0.0 only equals to 0.0.0.0 - */ -int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, - bool match_wildcard) -{ - struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - - if (!ipv6_only_sock(sk2)) { - if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr) - return 1; - if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr) - return match_wildcard; - } - return 0; -} - static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) { @@ -389,12 +375,13 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; - return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); + return udp_lib_get_port(sk, snum, hash2_nulladdr); } static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum, int dif) + __be32 daddr, unsigned short hnum, int dif, + bool exact_dif) { int score; struct inet_sock *inet; @@ -425,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score += 4; @@ -450,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, - __be32 daddr, unsigned int hnum, int dif, + __be32 daddr, unsigned int hnum, int dif, bool exact_dif, struct udp_hslot *hslot2, struct sk_buff *skb) { @@ -462,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -497,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + bool exact_dif = udp_lib_exact_dif_match(net, skb); int score, badness, matches = 0, reuseport = 0; u32 hash = 0; @@ -509,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, skb); + exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); @@ -524,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, skb); + exact_dif, hslot2, skb); } return result; } @@ -533,7 +521,7 @@ begin: badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -1113,7 +1101,8 @@ out: return err; do_confirm: - dst_confirm(&rt->dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(&rt->dst, &fl4->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; @@ -1501,7 +1490,7 @@ try_again: return err; csum_copy_err: - if (!__sk_queue_drop_skb(sk, skb, flags)) { + if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } diff --git 
a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 62e1e72db461..1fc684111ce6 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -40,6 +40,7 @@ drop: int xfrm4_transport_finish(struct sk_buff *skb, int async) { + struct xfrm_offload *xo = xfrm_offload(skb); struct iphdr *iph = ip_hdr(skb); iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; @@ -53,6 +54,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) iph->tot_len = htons(skb->len); ip_send_check(iph); + if (xo && (xo->flags & XFRM_GRO)) { + skb_mac_header_rebuild(skb); + return 0; + } + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index fd840c7d75ea..4acc0508c5eb 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -43,6 +43,7 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); + struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -50,7 +51,8 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); - skb_reset_transport_header(skb); + if (!xo || !(xo->flags & XFRM_GRO)) + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6a7ff6957535..71b4ecc195c7 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -17,8 +17,6 @@ #include <net/ip.h> #include <net/l3mdev.h> -static struct xfrm_policy_afinfo xfrm4_policy_afinfo; - static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, const xfrm_address_t *saddr, @@ -219,7 +217,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); - xfrm4_policy_afinfo.garbage_collect(net); + xfrm_garbage_collect_deferred(net); return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); } @@ -271,8 +269,7 @@ static struct dst_ops xfrm4_dst_ops_template = { .gc_thresh = INT_MAX, }; -static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { - .family = AF_INET, +static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_ops = &xfrm4_dst_ops_template, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, @@ -376,7 +373,7 @@ static struct pernet_operations __net_initdata xfrm4_net_ops = { static void __init xfrm4_policy_init(void) { - xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); + xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET); } void __init xfrm4_init(void) diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index dccefa9d84cf..8dd0e6ab8606 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -188,9 +188,8 @@ static const struct net_protocol ipcomp4_protocol = { .netns_ok = 1, }; -static struct xfrm_input_afinfo xfrm4_input_afinfo = { +static const struct xfrm_input_afinfo xfrm4_input_afinfo = { .family = AF_INET, - .owner = THIS_MODULE, .callback = xfrm4_rcv_cb, }; diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 542074c00c78..d6660a8c0ea5 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -90,11 +90,3 @@ void __init xfrm4_state_init(void) { 
xfrm_state_register_afinfo(&xfrm4_state_afinfo); } - -#if 0 -void __exit xfrm4_state_fini(void) -{ - xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); -} -#endif /* 0 */ - diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index ec1267e2bd1f..e2afe677a9d9 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -75,6 +75,19 @@ config INET6_ESP If unsure, say Y. +config INET6_ESP_OFFLOAD + tristate "IPv6: ESP transformation offload" + depends on INET6_ESP + select XFRM_OFFLOAD + default n + ---help--- + Support for ESP transformation offload. This makes sense + only if this system really does IPsec and want to do it + with high throughput. A typical desktop system does not + need it, even if it does IPsec. + + If unsure, say N. + config INET6_IPCOMP tristate "IPv6: IPComp transformation" select INET6_XFRM_TUNNEL @@ -208,6 +221,7 @@ config IPV6_TUNNEL tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" select INET6_TUNNEL select DST_CACHE + select GRO_CELLS ---help--- Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in RFC 2473. diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index a9e9fec387ce..217e9ff0e24b 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -30,6 +30,7 @@ ipv6-objs += $(ipv6-y) obj-$(CONFIG_INET6_AH) += ah6.o obj-$(CONFIG_INET6_ESP) += esp6.o +obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c1e124bc8e1e..cfc485a8e1c0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -243,6 +243,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .seg6_require_hmac = 0, #endif .enhanced_dad = 1, + .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -294,6 +295,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .seg6_require_hmac = 0, #endif .enhanced_dad = 1, + .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, }; /* Check if a valid qdisc is available */ @@ -386,9 +388,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); if (ndev->cnf.stable_secret.initialized) - ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; else - ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64; + ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode; ndev->cnf.mtu6 = dev->mtu; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); @@ -2144,12 +2146,14 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) case ARPHRD_SIT: return addrconf_ifid_sit(eui, dev); case ARPHRD_IPGRE: + case ARPHRD_TUNNEL: return addrconf_ifid_gre(eui, dev); case ARPHRD_6LOWPAN: return addrconf_ifid_eui64(eui, dev); case ARPHRD_IEEE1394: return addrconf_ifid_ieee1394(eui, dev); case ARPHRD_TUNNEL6: + case ARPHRD_IP6GRE: return addrconf_ifid_ip6tnl(eui, dev); } return -1; @@ -2387,8 +2391,8 @@ static void manage_tempaddrs(struct inet6_dev *idev, static bool is_addr_mode_generate_stable(struct inet6_dev *idev) { - return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || - idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; + return idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; } int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, @@ -3152,7 +3156,7 @@ static void addrconf_addr_gen(struct inet6_dev 
*idev, bool prefix_route) ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - switch (idev->addr_gen_mode) { + switch (idev->cnf.addr_gen_mode) { case IN6_ADDR_GEN_MODE_RANDOM: ipv6_gen_mode_random_init(idev); /* fallthrough */ @@ -3193,6 +3197,9 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && (dev->type != ARPHRD_6LOWPAN) && + (dev->type != ARPHRD_IP6GRE) && + (dev->type != ARPHRD_IPGRE) && + (dev->type != ARPHRD_TUNNEL) && (dev->type != ARPHRD_NONE)) { /* Alas, we support only Ethernet autoconfiguration. */ return; @@ -3204,8 +3211,8 @@ static void addrconf_dev_config(struct net_device *dev) /* this device type has no EUI support */ if (dev->type == ARPHRD_NONE && - idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) - idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) + idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; addrconf_addr_gen(idev, false); } @@ -3386,9 +3393,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } if (idev) { - if (idev->if_flags & IF_READY) - /* device is already configured. */ + if (idev->if_flags & IF_READY) { + /* device is already configured - + * but resend MLD reports, we might + * have roamed and need to update + * multicast snooping switches + */ + ipv6_mc_up(idev); break; + } idev->if_flags |= IF_READY; } @@ -4009,6 +4022,12 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id) if (bump_id) rt_genid_bump_ipv6(dev_net(dev)); + + /* Make sure that a new temporary address will be created + * before this temporary address becomes deprecated. + */ + if (ifp->flags & IFA_F_TEMPORARY) + addrconf_verify_rtnl(); } static void addrconf_dad_run(struct inet6_dev *idev) @@ -4888,6 +4907,13 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) struct net *net = dev_net(ifa->idev->dev); int err = -ENOBUFS; + /* Don't send DELADDR notification for TENTATIVE address, + * since NEWADDR notification is sent only after removing + * TENTATIVE flag. 
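The addr_gen_mode conversions above (idev->addr_gen_mode moving into idev->cnf) pair with a new per-interface sysctl added in the hunks further below, so the link-local address generation mode no longer has to be set over netlink. Assuming the usual uapi values (IN6_ADDR_GEN_MODE_EUI64=0, NONE=1, STABLE_PRIVACY=2, RANDOM=3) and the standard per-device conf layout, a hypothetical userspace snippet would simply write the mode number to the new proc file; the interface name here is made up:

/* Hypothetical example: select "random" link-local address generation on eth0
 * through the addr_gen_mode sysctl added below.  The path and the value 3
 * (IN6_ADDR_GEN_MODE_RANDOM) are assumptions based on the standard layout.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv6/conf/eth0/addr_gen_mode", "w");

	if (!f) {
		perror("addr_gen_mode");
		return 1;
	}
	fprintf(f, "3\n");   /* IN6_ADDR_GEN_MODE_RANDOM */
	fclose(f);
	return 0;
}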
+ */ + if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + return; + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); if (!skb) goto errout; @@ -4975,6 +5001,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac; #endif array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; + array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; } static inline size_t inet6_ifla6_size(void) @@ -5086,7 +5113,7 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, if (!nla) goto nla_put_failure; - if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) goto nla_put_failure; read_lock_bh(&idev->lock); @@ -5204,6 +5231,26 @@ static int inet6_validate_link_af(const struct net_device *dev, return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy); } +static int check_addr_gen_mode(int mode) +{ + if (mode != IN6_ADDR_GEN_MODE_EUI64 && + mode != IN6_ADDR_GEN_MODE_NONE && + mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + mode != IN6_ADDR_GEN_MODE_RANDOM) + return -EINVAL; + return 1; +} + +static int check_stable_privacy(struct inet6_dev *idev, struct net *net, + int mode) +{ + if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + !idev->cnf.stable_secret.initialized && + !net->ipv6.devconf_dflt->stable_secret.initialized) + return -EINVAL; + return 1; +} + static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) { int err = -EINVAL; @@ -5225,18 +5272,11 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (tb[IFLA_INET6_ADDR_GEN_MODE]) { u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); - if (mode != IN6_ADDR_GEN_MODE_EUI64 && - mode != IN6_ADDR_GEN_MODE_NONE && - mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY && - mode != IN6_ADDR_GEN_MODE_RANDOM) + if (check_addr_gen_mode(mode) < 0 || + check_stable_privacy(idev, dev_net(dev), mode) < 0) return -EINVAL; - if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && - !idev->cnf.stable_secret.initialized && - !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized) - return -EINVAL; - - idev->addr_gen_mode = mode; + idev->cnf.addr_gen_mode = mode; err = 0; } @@ -5540,8 +5580,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf) struct net_device *dev; struct inet6_dev *idev; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.disable_ipv6) ^ (!newf); @@ -5550,7 +5589,6 @@ static void addrconf_disable_change(struct net *net, __s32 newf) dev_disable_change(idev); } } - rcu_read_unlock(); } static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) @@ -5645,6 +5683,55 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, return ret; } +static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = 0; + int new_val; + struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; + struct net *net = (struct net *)ctl->extra2; + + if (!rtnl_trylock()) + return restart_syscall(); + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write) { + new_val = *((int *)ctl->data); + + if (check_addr_gen_mode(new_val) < 0) { + ret = -EINVAL; + goto out; + } + + /* request for default */ + if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) { + ipv6_devconf_dflt.addr_gen_mode = new_val; + + /* request for individual net 
device */ + } else { + if (!idev) + goto out; + + if (check_stable_privacy(idev, net, new_val) < 0) { + ret = -EINVAL; + goto out; + } + + if (idev->cnf.addr_gen_mode != new_val) { + idev->cnf.addr_gen_mode = new_val; + addrconf_dev_config(idev->dev); + } + } + } + +out: + rtnl_unlock(); + + return ret; +} + static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -5695,14 +5782,14 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, struct inet6_dev *idev = __in6_dev_get(dev); if (idev) { - idev->addr_gen_mode = + idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; } } } else { struct inet6_dev *idev = ctl->extra1; - idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; } out: @@ -6090,6 +6177,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "addr_gen_mode", + .data = &ipv6_devconf.addr_gen_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_addr_gen_mode, + }, + { /* sentinel */ } }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index aa42123bc301..04db40620ea6 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -302,7 +302,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) + if (snum && snum < inet_prot_sock(net) && + !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; lock_sock(sk); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 189eb10b742d..dda6035e3b84 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -474,6 +474,9 @@ static void ah6_input_done(struct crypto_async_request *base, int err) int hdr_len = skb_network_header_len(skb); int ah_hlen = (ah->hdrlen + 2) << 2; + if (err) + goto out; + work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, hdr_len); icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index a3eaafd87100..eec27f87efac 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -167,18 +167,22 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, if (np->sndflow) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; - addr_type = ipv6_addr_type(&usin->sin6_addr); - - if (addr_type == IPV6_ADDR_ANY) { + if (ipv6_addr_any(&usin->sin6_addr)) { /* * connect to self */ - usin->sin6_addr.s6_addr[15] = 0x01; + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + &usin->sin6_addr); + else + usin->sin6_addr = in6addr_loopback; } + addr_type = ipv6_addr_type(&usin->sin6_addr); + daddr = &usin->sin6_addr; - if (addr_type == IPV6_ADDR_MAPPED) { + if (addr_type & IPV6_ADDR_MAPPED) { struct sockaddr_in sin; if (__ipv6_only_sock(sk)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index cbcdd5db31f4..ff54faa75631 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -44,6 +44,8 @@ #include <net/protocol.h> #include <linux/icmpv6.h> +#include <linux/highmem.h> + struct esp_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; @@ -114,11 +116,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } +static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +{ + __be32 *seqhi; + struct crypto_aead *aead = x->data; + int seqhilen = 0; + u8 *iv; + struct 
aead_request *req; + struct scatterlist *sg; + + if (x->props.flags & XFRM_STATE_ESN) + seqhilen += sizeof(__be32); + + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_req(aead, iv); + + /* Unref skb_frag_pages in the src scatterlist if necessary. + * Skip the first sg which comes from skb->data. + */ + if (req->src != req->dst) + for (sg = sg_next(req->src); sg; sg = sg_next(sg)) + put_page(sg_page(sg)); +} + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + void *tmp; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; - kfree(ESP_SKB_CB(skb)->tmp); + tmp = ESP_SKB_CB(skb)->tmp; + esp_ssg_unref(x, tmp); + kfree(tmp); xfrm_output_resume(skb, err); } @@ -138,6 +169,27 @@ static void esp_output_restore_header(struct sk_buff *skb) esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); } +static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, + struct ip_esp_hdr *esph, + __be32 *seqhi) +{ + struct xfrm_state *x = skb_dst(skb)->xfrm; + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * encryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); + *seqhi = esph->spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + } + + esph->spi = x->id.spi; + + return esph; +} + static void esp_output_done_esn(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -146,14 +198,31 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } +static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) +{ + /* Fill padding... 
*/ + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = proto; +} + static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) { int err; struct ip_esp_hdr *esph; struct crypto_aead *aead; struct aead_request *req; - struct scatterlist *sg; + struct scatterlist *sg, *dsg; struct sk_buff *trailer; + struct page *page; void *tmp; int blksize; int clen; @@ -164,10 +233,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) int nfrags; int assoclen; int seqhilen; + int tailen; u8 *iv; u8 *tail; + u8 *vaddr; __be32 *seqhi; __be64 seqno; + __u8 proto = *skb_mac_header(skb); /* skb is pure payload to encrypt */ aead = x->data; @@ -186,11 +258,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) blksize = ALIGN(crypto_aead_blocksize(aead), 4); clen = ALIGN(skb->len + 2 + tfclen, blksize); plen = clen - skb->len - tfclen; - - err = skb_cow_data(skb, tfclen + plen + alen, &trailer); - if (err < 0) - goto error; - nfrags = err; + tailen = tfclen + plen + alen; assoclen = sizeof(*esph); seqhilen = 0; @@ -200,59 +268,152 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) assoclen += seqhilen; } - tmp = esp_alloc_tmp(aead, nfrags, seqhilen); - if (!tmp) { - err = -ENOMEM; - goto error; + *skb_mac_header(skb) = IPPROTO_ESP; + esph = ip_esp_hdr(skb); + + if (!skb_cloned(skb)) { + if (tailen <= skb_availroom(skb)) { + nfrags = 1; + trailer = skb; + tail = skb_tail_pointer(trailer); + + goto skip_cow; + } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS) + && !skb_has_frag_list(skb)) { + int allocsize; + struct sock *sk = skb->sk; + struct page_frag *pfrag = &x->xfrag; + + allocsize = ALIGN(tailen, L1_CACHE_BYTES); + + spin_lock_bh(&x->lock); + + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + goto cow; + } + + page = pfrag->page; + get_page(page); + + vaddr = kmap_atomic(page); + + tail = vaddr + pfrag->offset; + + esp_output_fill_trailer(tail, tfclen, plen, proto); + + kunmap_atomic(vaddr); + + nfrags = skb_shinfo(skb)->nr_frags; + + __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, + tailen); + skb_shinfo(skb)->nr_frags = ++nfrags; + + pfrag->offset = pfrag->offset + allocsize; + nfrags++; + + skb->len += tailen; + skb->data_len += tailen; + skb->truesize += tailen; + if (sk) + atomic_add(tailen, &sk->sk_wmem_alloc); + + skb_push(skb, -skb_network_offset(skb)); + + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esph->spi = x->id.spi; + + tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen); + if (!tmp) { + spin_unlock_bh(&x->lock); + err = -ENOMEM; + goto error; + } + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); + dsg = &sg[nfrags]; + + esph = esp_output_set_esn(skb, esph, seqhi); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); + + allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); + + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + err = -ENOMEM; + goto error; + } + + skb_shinfo(skb)->nr_frags = 1; + + page = pfrag->page; + get_page(page); + /* replace page frags in skb with new page */ + __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); + pfrag->offset = pfrag->offset + allocsize; + + sg_init_table(dsg, 
skb_shinfo(skb)->nr_frags + 1); + skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); + + spin_unlock_bh(&x->lock); + + goto skip_cow2; + } } - seqhi = esp_tmp_seqhi(tmp); - iv = esp_tmp_iv(aead, tmp, seqhilen); - req = esp_tmp_req(aead, iv); - sg = esp_req_sg(aead, req); +cow: + err = skb_cow_data(skb, tailen, &trailer); + if (err < 0) + goto error; + nfrags = err; - /* Fill padding... */ tail = skb_tail_pointer(trailer); - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = *skb_mac_header(skb); - pskb_put(skb, trailer, clen - skb->len + alen); + esph = ip_esp_hdr(skb); +skip_cow: + esp_output_fill_trailer(tail, tfclen, plen, proto); + + pskb_put(skb, trailer, clen - skb->len + alen); skb_push(skb, -skb_network_offset(skb)); - esph = ip_esp_hdr(skb); - *skb_mac_header(skb) = IPPROTO_ESP; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esph->spi = x->id.spi; - aead_request_set_callback(req, 0, esp_output_done, skb); - - /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after - * encryption. - */ - if ((x->props.flags & XFRM_STATE_ESN)) { - esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); - *seqhi = esph->spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); - aead_request_set_callback(req, 0, esp_output_done_esn, skb); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); + if (!tmp) { + err = -ENOMEM; + goto error; } - esph->spi = x->id.spi; + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); + dsg = sg; + + esph = esp_output_set_esn(skb, esph, seqhi); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, (unsigned char *)esph - skb->data, assoclen + ivlen + clen + alen); - aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); +skip_cow2: + if ((x->props.flags & XFRM_STATE_ESN)) + aead_request_set_callback(req, 0, esp_output_done_esn, skb); + else + aead_request_set_callback(req, 0, esp_output_done, skb); + + aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); aead_request_set_ad(req, assoclen); seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + @@ -278,6 +439,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) esp_output_restore_header(skb); } + if (sg != dsg) + esp_ssg_unref(x, tmp); kfree(tmp); error: @@ -343,6 +506,23 @@ static void esp_input_restore_header(struct sk_buff *skb) __skb_pull(skb, 4); } +static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * decryption. 
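Both esp_output_set_esn() and the new esp_input_set_header() above use the same in-place trick for 64-bit sequence numbers: the header is shifted 4 bytes so that the SPI, the high 32 sequence bits and the low 32 bits become contiguous for the AEAD's associated data, with the borrowed bytes stashed in the seqhi slot and restored once the crypto operation completes. A rough layout sketch follows; only struct ip_esp_hdr exists in the kernel, the struct names below are illustrative:

/* Illustrative layout of the ESN shuffling described above (RFC 4303 fields).
 *
 *   on the wire:          during crypto with ESN (shifted 4 bytes earlier):
 *   +---------+           +---------+
 *   |   SPI   |           |   SPI   |  <- written over 4 borrowed bytes
 *   +---------+           +---------+
 *   | SEQ low |           | SEQ hi  |  <- authenticated, never transmitted
 *   +---------+           +---------+
 *   |   IV    |           | SEQ low |
 *   +---------+           +---------+
 *   | payload |           | IV+data |
 */
#include <stdint.h>

struct esp_hdr_wire {            /* what actually goes on the wire */
	uint32_t spi;
	uint32_t seq_lo;
};

struct esp_aad_esn {             /* what the AEAD authenticates when ESN is on */
	uint32_t spi;
	uint32_t seq_hi;
	uint32_t seq_lo;
};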
+ */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)skb_push(skb, 4); + *seqhi = esph->spi; + esph->spi = esph->seq_no; + esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; + } +} + static void esp_input_done_esn(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -378,14 +558,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) goto out; } - nfrags = skb_cow_data(skb, 0, &trailer); - if (nfrags < 0) { - ret = -EINVAL; - goto out; - } - - ret = -ENOMEM; - assoclen = sizeof(*esph); seqhilen = 0; @@ -394,6 +566,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) assoclen += seqhilen; } + if (!skb_cloned(skb)) { + if (!skb_is_nonlinear(skb)) { + nfrags = 1; + + goto skip_cow; + } else if (!skb_has_frag_list(skb)) { + nfrags = skb_shinfo(skb)->nr_frags; + nfrags++; + + goto skip_cow; + } + } + + nfrags = skb_cow_data(skb, 0, &trailer); + if (nfrags < 0) { + ret = -EINVAL; + goto out; + } + +skip_cow: + ret = -ENOMEM; tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) goto out; @@ -404,26 +597,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); - skb->ip_summed = CHECKSUM_NONE; + esp_input_set_header(skb, seqhi); - esph = (struct ip_esp_hdr *)skb->data; + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); - aead_request_set_callback(req, 0, esp_input_done, skb); + skb->ip_summed = CHECKSUM_NONE; - /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after - * decryption. - */ - if ((x->props.flags & XFRM_STATE_ESN)) { - esph = (void *)skb_push(skb, 4); - *seqhi = esph->spi; - esph->spi = esph->seq_no; - esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; + if ((x->props.flags & XFRM_STATE_ESN)) aead_request_set_callback(req, 0, esp_input_done_esn, skb); - } - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + else + aead_request_set_callback(req, 0, esp_input_done, skb); aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); aead_request_set_ad(req, assoclen); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c new file mode 100644 index 000000000000..d914eb93204a --- /dev/null +++ b/net/ipv6/esp6_offload.c @@ -0,0 +1,108 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET implementation + * + * Copyright (C) 2016 secunet Security Networks AG + * Author: Steffen Klassert <steffen.klassert@secunet.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * ESP GRO support + */ + +#include <linux/skbuff.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <crypto/aead.h> +#include <crypto/authenc.h> +#include <linux/err.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/esp.h> +#include <linux/scatterlist.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <net/ip6_route.h> +#include <net/ipv6.h> +#include <linux/icmpv6.h> + +static struct sk_buff **esp6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + int offset = skb_gro_offset(skb); + struct xfrm_offload *xo; + struct xfrm_state *x; + __be32 seq; + __be32 spi; + int err; + + skb_pull(skb, offset); + + if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) + goto out; + + err = secpath_set(skb); + if (err) + goto out; + + if (skb->sp->len == XFRM_MAX_DEPTH) + goto out; + + x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ipv6_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET6); + if (!x) + goto out; + + skb->sp->xvec[skb->sp->len++] = x; + skb->sp->olen++; + + xo = xfrm_offload(skb); + if (!xo) { + xfrm_state_put(x); + goto out; + } + xo->flags |= XFRM_GRO; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + XFRM_SPI_SKB_CB(skb)->seq = seq; + + /* We don't need to handle errors from xfrm_input, it does all + * the error handling and frees the resources on error. */ + xfrm_input(skb, IPPROTO_ESP, spi, -2); + + return ERR_PTR(-EINPROGRESS); +out: + skb_push(skb, offset); + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = 1; + + return NULL; +} + +static const struct net_offload esp6_offload = { + .callbacks = { + .gro_receive = esp6_gro_receive, + }, +}; + +static int __init esp6_offload_init(void) +{ + return inet6_add_offload(&esp6_offload, IPPROTO_ESP); +} + +static void __exit esp6_offload_exit(void) +{ + inet6_del_offload(&esp6_offload, IPPROTO_ESP); +} + +module_init(esp6_offload_init); +module_exit(esp6_offload_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index e4198502fd98..275cac628a95 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -327,7 +327,6 @@ static int ipv6_srh_rcv(struct sk_buff *skb) struct ipv6_sr_hdr *hdr; struct inet6_dev *idev; struct in6_addr *addr; - bool cleanup = false; int accept_seg6; hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); @@ -351,11 +350,7 @@ static int ipv6_srh_rcv(struct sk_buff *skb) #endif looped_back: - if (hdr->segments_left > 0) { - if (hdr->nexthdr != NEXTHDR_IPV6 && hdr->segments_left == 1 && - sr_has_cleanup(hdr)) - cleanup = true; - } else { + if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6) { int offset = (hdr->hdrlen + 1) << 3; @@ -418,21 +413,6 @@ looped_back: ipv6_hdr(skb)->daddr = *addr; - if (cleanup) { - int srhlen = (hdr->hdrlen + 1) << 3; - int nh = hdr->nexthdr; - - skb_pull_rcsum(skb, sizeof(struct ipv6hdr) + srhlen); - memmove(skb_network_header(skb) + srhlen, - skb_network_header(skb), - (unsigned char *)hdr - skb_network_header(skb)); - skb->network_header += srhlen; - ipv6_hdr(skb)->nexthdr = nh; - ipv6_hdr(skb)->payload_len = htons(skb->len - - sizeof(struct ipv6hdr)); - skb_push_rcsum(skb, sizeof(struct ipv6hdr)); - } - skb_dst_drop(skb); ip6_route_input(skb); @@ -453,13 +433,8 @@ looped_back: } ipv6_hdr(skb)->hop_limit--; - /* be 
sure that srh is still present before reinjecting */ - if (!cleanup) { - skb_pull(skb, sizeof(struct ipv6hdr)); - goto looped_back; - } - skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + skb_pull(skb, sizeof(struct ipv6hdr)); + goto looped_back; } dst_input(skb); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3036f665e6c8..230b5aac9f03 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; +/* Called with BH disabled */ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) { struct sock *sk; - local_bh_disable(); - sk = icmpv6_sk(net); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an * outgoing ICMP6 packet. */ - local_bh_enable(); return NULL; } return sk; @@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) static __inline__ void icmpv6_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock(&sk->sk_lock.slock); } /* @@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb) return false; } +static bool icmpv6_mask_allow(int type) +{ + /* Informational messages are not limited. */ + if (type & ICMPV6_INFOMSG_MASK) + return true; + + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return true; + + return false; +} + +static bool icmpv6_global_allow(int type) +{ + if (icmpv6_mask_allow(type)) + return true; + + if (icmp_global_allow()) + return true; + + return false; +} + /* * Check the ICMP output rate limit */ @@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct dst_entry *dst; bool res = false; - /* Informational messages are not limited. */ - if (type & ICMPV6_INFOMSG_MASK) - return true; - - /* Do not limit pmtu discovery, it would break it. */ - if (type == ICMPV6_PKT_TOOBIG) + if (icmpv6_mask_allow(type)) return true; /* @@ -200,20 +217,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; + struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - if (icmp_global_allow()) { - struct inet_peer *peer; - - peer = inet_getpeer_v6(net->ipv6.peers, - &fl6->daddr, 1); - res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); - } + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); } dst_release(dst); return res; @@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, return; } + /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ + local_bh_disable(); + + /* Check global sysctl_icmp_msgs_per_sec ratelimit */ + if (!icmpv6_global_allow(type)) + goto out_bh_enable; + mip6_addr_swap(skb); memset(&fl6, 0, sizeof(fl6)); @@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, sk = icmpv6_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; + sk->sk_mark = mark; np = inet6_sk(sk); @@ -552,6 +573,8 @@ out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } /* Slightly more convenient version of icmp6_send. 
@@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + local_bh_disable(); sk = icmpv6_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; sk->sk_mark = mark; np = inet6_sk(sk); @@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb) dst_release(dst); out: icmpv6_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index a7bc54ab46e2..ce1aae4a7fc8 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -115,7 +115,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, }; -static int ila_build_state(struct net_device *dev, struct nlattr *nla, +static int ila_build_state(struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts) { @@ -238,6 +238,7 @@ static const struct lwtunnel_encap_ops ila_encap_ops = { .fill_encap = ila_fill_encap_info, .get_encap_size = ila_encap_nlsize, .cmp_encap = ila_encap_cmp, + .owner = THIS_MODULE, }; int ila_lwt_init(void) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 7396e75e161b..9a31d13bf180 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -28,46 +28,6 @@ #include <net/inet6_connection_sock.h> #include <net/sock_reuseport.h> -int inet6_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb, bool relax, - bool reuseport_ok) -{ - const struct sock *sk2; - bool reuse = !!sk->sk_reuse; - bool reuseport = !!sk->sk_reuseport && reuseport_ok; - kuid_t uid = sock_i_uid((struct sock *)sk); - - /* We must walk the whole port owner list in this case. -DaveM */ - /* - * See comment in inet_csk_bind_conflict about sock lookup - * vs net namespaces issues. 
- */ - sk_for_each_bound(sk2, &tb->owners) { - if (sk != sk2 && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if ((!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - (!reuseport || !sk2->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || - (sk2->sk_state != TCP_TIME_WAIT && - !uid_eq(uid, - sock_i_uid((struct sock *)sk2))))) { - if (ipv6_rcv_saddr_equal(sk, sk2, true)) - break; - } - if (!relax && reuse && sk2->sk_reuse && - sk2->sk_state != TCP_LISTEN && - ipv6_rcv_saddr_equal(sk, sk2, true)) - break; - } - } - - return sk2 != NULL; -} -EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); - struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, @@ -176,7 +136,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; - res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), np->tclass); rcu_read_unlock(); return res; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 02761c9fe43e..d0900918a19e 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -268,54 +268,10 @@ int inet6_hash(struct sock *sk) if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); + err = __inet_hash(sk, NULL); local_bh_enable(); } return err; } EXPORT_SYMBOL_GPL(inet6_hash); - -/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 - * only, and any IPv4 addresses if not IPv6 only - * match_wildcard == false: addresses must be exactly the same, i.e. - * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, - * and 0.0.0.0 equals to 0.0.0.0 only - */ -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) -{ - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { - if (!sk2_ipv6only) { - if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) - return 1; - if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) - return match_wildcard; - } - return 0; - } - - if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; - - if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && match_wildcard && - !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) - return 1; - - return 0; -} -EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ef5485204522..e4266746e4a2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -318,6 +318,16 @@ static int fib6_dump_node(struct fib6_walker *w) w->leaf = rt; return 1; } + + /* Multipath routes are dumped in one route with the + * RTA_MULTIPATH attribute. 
Jump 'rt' to point to the + * last sibling of this route (no need to dump the + * sibling routes again) + */ + if (rt->rt6i_nsiblings) + rt = list_last_entry(&rt->rt6i_siblings, + struct rt6_info, + rt6i_siblings); } w->leaf = NULL; return 0; @@ -746,6 +756,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, u16 nlflags = NLM_F_EXCL; int err; + if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) + nlflags |= NLM_F_APPEND; + ins = &fn->leaf; for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { @@ -868,7 +881,8 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); + if (!info->skip_notify) + inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -894,7 +908,8 @@ add: rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); + if (!info->skip_notify) + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; @@ -1439,7 +1454,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); - inet6_rt_notify(RTM_DELROUTE, rt, info, 0); + if (!info->skip_notify) + inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 75b6108234dd..6fcb7cb49bb2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -367,35 +367,37 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { - const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; - __be16 *p = (__be16 *)(skb->data + offset); - int grehlen = offset + 4; + const struct gre_base_hdr *greh; + const struct ipv6hdr *ipv6h; + int grehlen = sizeof(*greh); struct ip6_tnl *t; + int key_off = 0; __be16 flags; + __be32 key; - flags = p[0]; - if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { - if (flags&(GRE_VERSION|GRE_ROUTING)) - return; - if (flags&GRE_KEY) { - grehlen += 4; - if (flags&GRE_CSUM) - grehlen += 4; - } + if (!pskb_may_pull(skb, offset + grehlen)) + return; + greh = (const struct gre_base_hdr *)(skb->data + offset); + flags = greh->flags; + if (flags & (GRE_VERSION | GRE_ROUTING)) + return; + if (flags & GRE_CSUM) + grehlen += 4; + if (flags & GRE_KEY) { + key_off = grehlen + offset; + grehlen += 4; } - /* If only 8 bytes returned, keyed message will be dropped here */ - if (!pskb_may_pull(skb, grehlen)) + if (!pskb_may_pull(skb, offset + grehlen)) return; ipv6h = (const struct ipv6hdr *)skb->data; - p = (__be16 *)(skb->data + offset); + greh = (const struct gre_base_hdr *)(skb->data + offset); + key = key_off ? *(__be32 *)(skb->data + key_off) : 0; t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, - flags & GRE_KEY ? 
- *(((__be32 *)p) + (grehlen / 4) - 1) : 0, - p[1]); + key, greh->protocol); if (!t) return; @@ -484,11 +486,6 @@ drop: return 0; } -struct ipv6_tel_txoption { - struct ipv6_txoptions ops; - __u8 dst_opt[8]; -}; - static int gre_handle_offloads(struct sk_buff *skb, bool csum) { return iptunnel_handle_offloads(skb, @@ -582,6 +579,9 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) return -1; offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + ipv6h = ipv6_hdr(skb); + if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; @@ -998,6 +998,9 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); + /* This perm addr will be used as interface identifier by IPv6 */ + dev->addr_assign_type = NET_ADDR_RANDOM; + eth_random_addr(dev->perm_addr); } static int ip6gre_tunnel_init_common(struct net_device *dev) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 89c59e656f44..0838e6d01d2e 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -191,6 +191,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { __pskb_pull(skb, skb_gro_offset(skb)); + skb_gro_frag0_invalidate(skb); proto = ipv6_gso_pull_exthdrs(skb, proto); skb_gro_pull(skb, -skb_transport_offset(skb)); skb_reset_transport_header(skb); @@ -252,7 +253,7 @@ out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 70d0de404197..528b3c1f3fde 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -119,7 +119,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { - ret = dst_neigh_output(dst, neigh, skb); + sock_confirm_neigh(skb, neigh); + ret = neigh_output(neigh, skb); rcu_read_unlock_bh(); return ret; } @@ -172,7 +173,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) * which are using proper atomic operations or spinlocks. 
*/ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - struct ipv6_txoptions *opt, int tclass) + __u32 mark, struct ipv6_txoptions *opt, int tclass) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); @@ -240,7 +241,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = mark; mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { @@ -1021,6 +1022,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, } } #endif + if (ipv6_addr_v4mapped(&fl6->saddr) && + !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { + err = -EAFNOSUPPORT; + goto out_err_release; + } return 0; @@ -1144,6 +1150,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb->protocol = htons(ETH_P_IPV6); skb->csum = 0; + if (flags & MSG_CONFIRM) + skb_set_dst_pending_confirm(skb, 1); + __skb_queue_tail(queue, skb); } else if (skb_is_gso(skb)) { goto append; @@ -1344,7 +1353,7 @@ emsgsize: */ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && headersize == sizeof(struct ipv6hdr) && - length < mtu - headersize && + length <= mtu - headersize && !(flags & MSG_MORE) && rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; @@ -1373,7 +1382,7 @@ emsgsize: */ cork->length += length; - if (((length > mtu) || + if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && @@ -1516,6 +1525,9 @@ alloc_new_skb: exthdrlen = 0; dst_exthdrlen = 0; + if ((flags & MSG_CONFIRM) && !skb_prev) + skb_set_dst_pending_confirm(skb, 1); + /* * Put the packet on the pending queue */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 36d292180942..75fac933c209 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -400,18 +400,19 @@ ip6_tnl_dev_uninit(struct net_device *dev) __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { - const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; - __u8 nexthdr = ipv6h->nexthdr; - __u16 off = sizeof(*ipv6h); + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; + unsigned int nhoff = raw - skb->data; + unsigned int off = nhoff + sizeof(*ipv6h); + u8 next, nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { - __u16 optlen = 0; struct ipv6_opt_hdr *hdr; - if (raw + off + sizeof(*hdr) > skb->data && - !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) + u16 optlen; + + if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; - hdr = (struct ipv6_opt_hdr *) (raw + off); + hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; if (frag_hdr->frag_off) @@ -422,20 +423,29 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) } else { optlen = ipv6_optlen(hdr); } + /* cache hdr->nexthdr, since pskb_may_pull() might + * invalidate hdr + */ + next = hdr->nexthdr; if (nexthdr == NEXTHDR_DEST) { - __u16 i = off + 2; + u16 i = 2; + + /* Remember : hdr is no longer valid at this point. 
*/ + if (!pskb_may_pull(skb, off + optlen)) + break; + while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ - if (i + sizeof (*tel) > off + optlen) + if (i + sizeof(*tel) > optlen) break; - tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i]; + tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) - return i; + return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; @@ -443,7 +453,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) i++; } } - nexthdr = hdr->nexthdr; + nexthdr = next; off += optlen; } return 0; @@ -1108,7 +1118,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen; + mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1117,7 +1127,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; @@ -1303,6 +1313,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowlabel = key->label; } else { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + ipv6h = ipv6_hdr(skb); if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index f4b4a4a5f4ba..644ba59fbd9d 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -49,6 +49,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/etherdevice.h> #define IP6_VTI_HASH_SIZE_SHIFT 5 #define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT) @@ -189,12 +190,12 @@ static int vti6_tnl_create2(struct net_device *dev) struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; + dev->rtnl_link_ops = &vti6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); - dev->rtnl_link_ops = &vti6_link_ops; dev_hold(dev); vti6_tnl_link(ip6n, t); @@ -692,6 +693,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; + if (u->i_key) + u->i_flags |= GRE_KEY; + if (u->o_key) + u->o_flags |= GRE_KEY; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); @@ -842,6 +847,9 @@ static void vti6_dev_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); + /* This perm addr will be used as interface identifier by IPv6 */ + dev->addr_assign_type = NET_ADDR_RANDOM; + eth_random_addr(dev->perm_addr); } /** diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 604d8953c775..6ba6c900ebcf 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1666,6 +1666,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns struct net *net = sock_net(sk); struct mr6_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); if (!mrt) return -ENOENT; @@ -1677,9 +1681,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns switch (optname) { case MRT6_INIT: - if (sk->sk_type != SOCK_RAW || - inet_sk(sk)->inet_num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; if (optlen < sizeof(int)) return -EINVAL; @@ -1815,6 +1816,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, struct net *net = sock_net(sk); struct mr6_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; @@ -2243,8 +2248,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mf6c_parent >= MAXMIFS) + if (c->mf6c_parent >= MAXMIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; + } if (MIF_EXISTS(mrt, c->mf6c_parent) && nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) @@ -2286,7 +2293,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, - int nowait, u32 portid) + u32 portid) { int err; struct mr6_table *mrt; @@ -2313,11 +2320,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, struct net_device *dev; int vif; - if (nowait) { - read_unlock(&mrt_lock); - return -EAGAIN; - } - dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { read_unlock(&mrt_lock); @@ -2355,7 +2357,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, return err; } - if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + if (rtm->rtm_flags & RTM_F_NOTIFY) cache->mfc_flags |= MFC_NOTIFY; err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index ee97c44e2aa0..a531ba032b85 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -595,16 +595,24 @@ done: if (val) { struct net_device *dev; + int midx; - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) - goto e_inval; + rcu_read_lock(); - dev = dev_get_by_index(net, val); + dev = dev_get_by_index_rcu(net, val); if (!dev) { + rcu_read_unlock(); retv = -ENODEV; break; } - dev_put(dev); + midx = l3mdev_master_ifindex_rcu(dev); + + rcu_read_unlock(); + + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != val && + (!midx || midx != sk->sk_bound_dev_if)) + goto e_inval; } np->mcast_oif = val; retv = 0; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 14a3903f1c82..1bdc703cb966 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -81,7 +81,7 @@ static void mld_gq_timer_expire(unsigned long data); static void mld_ifc_timer_expire(unsigned long data); static void mld_ifc_event(struct inet6_dev *idev); static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); -static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr); +static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); static void mld_clear_delrec(struct inet6_dev *idev); static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); @@ -692,9 +692,9 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) dev_mc_del(dev, buf); } - if (mc->mca_flags & MAF_NOREPORT) - goto done; spin_unlock_bh(&mc->mca_lock); + if (mc->mca_flags & MAF_NOREPORT) + return; if 
(!mc->idev->dead) igmp6_leave_group(mc); @@ -702,8 +702,6 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) spin_lock_bh(&mc->mca_lock); if (del_timer(&mc->mca_timer)) atomic_dec(&mc->mca_refcnt); -done: - ip6_mc_clear_src(mc); spin_unlock_bh(&mc->mca_lock); } @@ -748,10 +746,11 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) spin_unlock_bh(&idev->mc_lock); } -static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) +static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ifmcaddr6 *pmc, *pmc_prev; - struct ip6_sf_list *psf, *psf_next; + struct ip6_sf_list *psf; + struct in6_addr *pmca = &im->mca_addr; spin_lock_bh(&idev->mc_lock); pmc_prev = NULL; @@ -768,14 +767,21 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) } spin_unlock_bh(&idev->mc_lock); + spin_lock_bh(&im->mca_lock); if (pmc) { - for (psf = pmc->mca_tomb; psf; psf = psf_next) { - psf_next = psf->sf_next; - kfree(psf); + im->idev = pmc->idev; + im->mca_crcount = idev->mc_qrv; + im->mca_sfmode = pmc->mca_sfmode; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + im->mca_tomb = pmc->mca_tomb; + im->mca_sources = pmc->mca_sources; + for (psf = im->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = im->mca_crcount; } in6_dev_put(pmc->idev); kfree(pmc); } + spin_unlock_bh(&im->mca_lock); } static void mld_clear_delrec(struct inet6_dev *idev) @@ -904,7 +910,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) mca_get(mc); write_unlock_bh(&idev->lock); - mld_del_delrec(idev, &mc->mca_addr); + mld_del_delrec(idev, mc); igmp6_group_added(mc); ma_put(mc); return 0; @@ -927,6 +933,7 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) write_unlock_bh(&idev->lock); igmp6_group_dropped(ma); + ip6_mc_clear_src(ma); ma_put(ma); return 0; @@ -2501,15 +2508,17 @@ void ipv6_mc_down(struct inet6_dev *idev) /* Withdraw multicast list */ read_lock_bh(&idev->lock); - mld_ifc_stop_timer(idev); - mld_gq_stop_timer(idev); - mld_dad_stop_timer(idev); for (i = idev->mc_list; i; i = i->next) igmp6_group_dropped(i); - read_unlock_bh(&idev->lock); - mld_clear_delrec(idev); + /* Should stop timer after group drop. or we will + * start timer again in mld_ifc_event() + */ + mld_ifc_stop_timer(idev); + mld_gq_stop_timer(idev); + mld_dad_stop_timer(idev); + read_unlock_bh(&idev->lock); } static void ipv6_mc_reset(struct inet6_dev *idev) @@ -2531,8 +2540,10 @@ void ipv6_mc_up(struct inet6_dev *idev) read_lock_bh(&idev->lock); ipv6_mc_reset(idev); - for (i = idev->mc_list; i; i = i->next) + for (i = idev->mc_list; i; i = i->next) { + mld_del_delrec(idev, i); igmp6_group_added(i); + } read_unlock_bh(&idev->lock); } @@ -2565,6 +2576,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) /* Deactivate timers */ ipv6_mc_down(idev); + mld_clear_delrec(idev); /* Delete all-nodes address. 
*/ /* We cannot call ipv6_dev_mc_dec() directly, our caller in @@ -2579,11 +2591,9 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) write_lock_bh(&idev->lock); while ((i = idev->mc_list) != NULL) { idev->mc_list = i->next; - write_unlock_bh(&idev->lock); - igmp6_group_dropped(i); + write_unlock_bh(&idev->lock); ma_put(i); - write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 25a022d41a70..1e15c54fd5e2 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -855,10 +855,6 @@ copy_entries_to_user(unsigned int total_size, return PTR_ERR(counters); loc_cpu_entry = private->entries; - if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { - ret = -EFAULT; - goto free_counters; - } /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ @@ -868,6 +864,10 @@ copy_entries_to_user(unsigned int total_size, const struct xt_entry_target *t; e = (struct ip6t_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off, e, sizeof(*e))) { + ret = -EFAULT; + goto free_counters; + } if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], @@ -881,23 +881,14 @@ copy_entries_to_user(unsigned int total_size, i += m->u.match_size) { m = (void *)e + i; - if (copy_to_user(userptr + off + i - + offsetof(struct xt_entry_match, - u.user.name), - m->u.kernel.match->name, - strlen(m->u.kernel.match->name)+1) - != 0) { + if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ip6t_get_target_c(e); - if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct xt_entry_target, - u.user.name), - t->u.kernel.target->name, - strlen(t->u.kernel.target->name)+1) != 0) { + if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c index 590f767db5d4..a379d2f79b19 100644 --- a/net/ipv6/netfilter/ip6t_NPT.c +++ b/net/ipv6/netfilter/ip6t_NPT.c @@ -112,6 +112,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = { .table = "mangle", .target = ip6t_snpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), + .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | @@ -123,6 +124,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = { .table = "mangle", .target = ip6t_dnpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), + .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_PRE_ROUTING) | diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 98c8dd38575a..4ef1ddd4bbbd 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -71,8 +71,7 @@ synproxy_send_tcp(struct net *net, skb_dst_set(nskb, dst); if (nfct) { - nskb->nfct = nfct; - nskb->nfctinfo = ctinfo; + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); nf_conntrack_get(nfct); } @@ -121,8 +120,8 @@ synproxy_send_client_synack(struct net *net, synproxy_build_options(nth, opts); - synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, - niph, nth, tcp_hdr_size); + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static void @@ -244,8 +243,8 @@ synproxy_send_client_ack(struct net *net, 
synproxy_build_options(nth, opts); - synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, - niph, nth, tcp_hdr_size); + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static bool diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index d5263dc364a9..b12e61b7b16c 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -72,10 +72,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, return ret; } -static bool rpfilter_is_local(const struct sk_buff *skb) +static bool +rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in) { - const struct rt6_info *rt = (const void *) skb_dst(skb); - return rt && (rt->rt6i_flags & RTF_LOCAL); + return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) @@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) struct ipv6hdr *iph; bool invert = info->flags & XT_RPFILTER_INVERT; - if (rpfilter_is_local(skb)) + if (rpfilter_is_loopback(skb, xt_in(par))) return true ^ invert; iph = ipv6_hdr(skb); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index f5a61bc3ec2b..d2c2ccbfbe72 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -145,15 +145,15 @@ static int icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int icmp6off, - enum ip_conntrack_info *ctinfo, unsigned int hooknum) { struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_l4proto *inproto; + enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; - NF_CT_ASSERT(skb->nfct == NULL); + NF_CT_ASSERT(!skb_nfct(skb)); /* Are they talking about one of our connections? 
*/ if (!nf_ct_get_tuplepr(skb, @@ -176,7 +176,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return -NF_ACCEPT; } - *ctinfo = IP_CT_RELATED; + ctinfo = IP_CT_RELATED; h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), &intuple); @@ -185,19 +185,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return -NF_ACCEPT; } else { if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - *ctinfo += IP_CT_IS_REPLY; + ctinfo += IP_CT_IS_REPLY; } /* Update skb to refer to this connection */ - skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; - skb->nfctinfo = *ctinfo; + nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); return NF_ACCEPT; } static int icmpv6_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) + u8 pf, unsigned int hooknum) { const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; @@ -222,9 +221,8 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); + nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); + nf_conntrack_get(skb_nfct(skb)); return NF_ACCEPT; } @@ -232,7 +230,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum); + return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 9948b5ce52da..986d4ca38832 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -589,6 +589,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); + skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq == NULL) { diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 8e0bdd058787..ada60d1a991b 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -37,7 +37,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) { + if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); @@ -61,7 +61,7 @@ static unsigned int ipv6_defrag(void *priv, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Previously seen (loopback)? 
*/ - if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) + if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 4a84b5ad9ecb..888ecd106e5f 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -57,10 +57,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, return; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); + nf_reset(skb); + nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); + nf_conntrack_get(skb_nfct(skb)); #endif if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_IN) { diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 57d86066a13b..97c724224da7 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -64,7 +64,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", ntohs(ih->payload_len) + sizeof(struct ipv6hdr), (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, ih->hop_limit, @@ -351,7 +351,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf, struct nf_log_buf *m; /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net)) + if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) return; m = nf_log_buf_open(); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 10090400c72f..eedee5d108d9 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -157,6 +157,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); + fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { @@ -180,6 +181,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) skb_dst_set(nskb, dst); + nskb->mark = fl6.flowi6_mark; + skb_reserve(nskb, hh_len + dst->header_len); ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index c947aad8bcc6..765facf03d45 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -18,13 +18,6 @@ #include <net/ip6_fib.h> #include <net/ip6_route.h> -static bool fib6_is_local(const struct sk_buff *skb) -{ - const struct rt6_info *rt = (const void *)skb_dst(skb); - - return rt && (rt->rt6i_flags & RTF_LOCAL); -} - static int get_ifindex(const struct net_device *dev) { return dev ? 
dev->ifindex : 0; @@ -164,8 +157,10 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif); - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && fib6_is_local(pkt->skb)) { - nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX); + if (nft_hook(pkt) == NF_INET_PRE_ROUTING && + nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { + nft_fib_store_result(dest, priv->result, pkt, + nft_in(pkt)->ifindex); return; } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index e1f8b34d7a2e..9b522fa90e6d 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -126,12 +126,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return PTR_ERR(dst); rt = (struct rt6_info *) dst; - np = inet6_sk(sk); - if (!np) { - err = -EBADF; - goto dst_err_out; - } - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = np->mcast_oif; else if (!fl6.flowi6_oif) @@ -166,7 +160,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } release_sock(sk); -dst_err_out: dst_release(dst); if (err) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ea89073c8247..f174e76e6505 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -654,6 +654,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; + if (flags & MSG_CONFIRM) + skb_set_dst_pending_confirm(skb, 1); + skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); if (err) @@ -934,7 +937,8 @@ out: txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: - dst_confirm(dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8417c41d8ec8..43ca90d50ae9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -98,6 +98,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void rt6_dst_from_metrics_check(struct rt6_info *rt); static int rt6_score_route(struct rt6_info *rt, int oif, int strict); +static size_t rt6_nlmsg_size(struct rt6_info *rt); +static int rt6_fill_node(struct net *net, + struct sk_buff *skb, struct rt6_info *rt, + struct in6_addr *dst, struct in6_addr *src, + int iif, int type, u32 portid, u32 seq, + unsigned int flags); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -217,6 +223,21 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, return neigh_create(&nd_tbl, daddr, dst->dev); } +static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) +{ + struct net_device *dev = dst->dev; + struct rt6_info *rt = (struct rt6_info *)dst; + + daddr = choose_neigh_daddr(rt, NULL, daddr); + if (!daddr) + return; + if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) + return; + if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) + return; + __ipv6_confirm_neigh(dev, daddr); +} + static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, .gc = ip6_dst_gc, @@ -233,6 +254,7 @@ static struct dst_ops ip6_dst_ops_template = { .redirect = rt6_do_redirect, .local_out = __ip6_local_out, .neigh_lookup = ip6_neigh_lookup, + .confirm_neigh = ip6_confirm_neigh, }; static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) @@ -1359,6 +1381,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) static void __ip6_rt_update_pmtu(struct dst_entry *dst, 
const struct sock *sk, const struct ipv6hdr *iph, u32 mtu) { + const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = (struct rt6_info *)dst; if (rt6->rt6i_flags & RTF_LOCAL) @@ -1367,26 +1390,26 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (dst_metric_locked(dst, RTAX_MTU)) return; - dst_confirm(dst); + if (iph) { + daddr = &iph->daddr; + saddr = &iph->saddr; + } else if (sk) { + daddr = &sk->sk_v6_daddr; + saddr = &inet6_sk(sk)->saddr; + } else { + daddr = NULL; + saddr = NULL; + } + dst_confirm_neigh(dst, daddr); mtu = max_t(u32, mtu, IPV6_MIN_MTU); if (mtu >= dst_mtu(dst)) return; if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); - } else { - const struct in6_addr *daddr, *saddr; + } else if (daddr) { struct rt6_info *nrt6; - if (iph) { - daddr = &iph->daddr; - saddr = &iph->saddr; - } else if (sk) { - daddr = &sk->sk_v6_daddr; - saddr = &inet6_sk(sk)->saddr; - } else { - return; - } nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); @@ -1464,7 +1487,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_node *fn; /* Get the "current" route for this destination and - * check if the redirect has come from approriate router. + * check if the redirect has come from appropriate router. * * RFC 4861 specifies that redirects should only be * accepted if they come from the nexthop to the target. @@ -1897,7 +1920,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (cfg->fc_encap) { struct lwtunnel_state *lwtstate; - err = lwtunnel_build_state(dev, cfg->fc_encap_type, + err = lwtunnel_build_state(cfg->fc_encap_type, cfg->fc_encap, AF_INET6, cfg, &lwtstate); if (err) @@ -2143,6 +2166,58 @@ int ip6_del_rt(struct rt6_info *rt) return __ip6_del_rt(rt, &info); } +static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) +{ + struct nl_info *info = &cfg->fc_nlinfo; + struct net *net = info->nl_net; + struct sk_buff *skb = NULL; + struct fib6_table *table; + int err = -ENOENT; + + if (rt == net->ipv6.ip6_null_entry) + goto out_put; + table = rt->rt6i_table; + write_lock_bh(&table->tb6_lock); + + if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { + struct rt6_info *sibling, *next_sibling; + + /* prefer to send a single notification with all hops */ + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + if (skb) { + u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; + + if (rt6_fill_node(net, skb, rt, + NULL, NULL, 0, RTM_DELROUTE, + info->portid, seq, 0) < 0) { + kfree_skb(skb); + skb = NULL; + } else + info->skip_notify = 1; + } + + list_for_each_entry_safe(sibling, next_sibling, + &rt->rt6i_siblings, + rt6i_siblings) { + err = fib6_del(sibling, info); + if (err) + goto out_unlock; + } + } + + err = fib6_del(rt, info); +out_unlock: + write_unlock_bh(&table->tb6_lock); +out_put: + ip6_rt_put(rt); + + if (skb) { + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + } + return err; +} + static int ip6_route_del(struct fib6_config *cfg) { struct fib6_table *table; @@ -2179,7 +2254,11 @@ static int ip6_route_del(struct fib6_config *cfg) dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); - return __ip6_del_rt(rt, &cfg->fc_nlinfo); + /* if gateway was specified only delete the one hop */ + if (cfg->fc_flags & RTF_GATEWAY) + return __ip6_del_rt(rt, &cfg->fc_nlinfo); + + return __ip6_del_rt_siblings(rt, cfg); } } read_unlock_bh(&table->tb6_lock); @@ -2258,7 +2337,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * Look, redirects are sent only in response to data packets, * so that this nexthop apparently is reachable. --ANK */ - dst_confirm(&rt->dst); + dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); if (!neigh) @@ -2634,6 +2713,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->dst.output = ip6_output; rt->rt6i_idev = idev; + rt->rt6i_protocol = RTPROT_KERNEL; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) rt->rt6i_flags |= RTF_ANYCAST; @@ -2711,13 +2791,16 @@ struct arg_dev_net { struct net *net; }; +/* called with write lock held for table with rt */ static int fib6_ifdown(struct rt6_info *rt, void *arg) { const struct arg_dev_net *adn = arg; const struct net_device *dev = adn->dev; if ((rt->dst.dev == dev || !dev) && - rt != adn->net->ipv6.ip6_null_entry) + rt != adn->net->ipv6.ip6_null_entry && + (rt->rt6i_nsiblings == 0 || + !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) return -1; return 0; @@ -2768,7 +2851,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) old MTU is the lowest MTU in the path, update the route PMTU to reflect the increase. In this case if the other nodes' MTU also have the lowest MTU, TOO BIG MESSAGE will be lead to - PMTU discouvery. + PMTU discovery. 
*/ if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && @@ -2812,6 +2895,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_EXPIRES] = { .type = NLA_U32 }, [RTA_UID] = { .type = NLA_U32 }, + [RTA_MARK] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2896,6 +2980,11 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_MULTIPATH]) { cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); + + err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, + cfg->fc_mp_len); + if (err < 0) + goto errout; } if (tb[RTA_PREF]) { @@ -2909,9 +2998,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP]) cfg->fc_encap = tb[RTA_ENCAP]; - if (tb[RTA_ENCAP_TYPE]) + if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type); + if (err < 0) + goto errout; + } + if (tb[RTA_EXPIRES]) { unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); @@ -2938,7 +3032,7 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) struct rt6_nh *nh; list_for_each_entry(nh, rt6_nh_list, next) { - pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", + pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, nh->r_cfg.fc_ifindex); } @@ -2977,13 +3071,37 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, return 0; } +static void ip6_route_mpath_notify(struct rt6_info *rt, + struct rt6_info *rt_last, + struct nl_info *info, + __u16 nlflags) +{ + /* if this is an APPEND route, then rt points to the first route + * inserted and rt_last points to last route inserted. Userspace + * wants a consistent dump of the route which starts at the first + * nexthop. Since sibling routes are always added at the end of + * the list, find the first sibling of the last route appended + */ + if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { + rt = list_first_entry(&rt_last->rt6i_siblings, + struct rt6_info, + rt6i_siblings); + } + + if (rt) + inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); +} + static int ip6_route_multipath_add(struct fib6_config *cfg) { + struct rt6_info *rt_notif = NULL, *rt_last = NULL; + struct nl_info *info = &cfg->fc_nlinfo; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct rt6_info *rt; struct rt6_nh *err_nh; struct rt6_nh *nh, *nh_safe; + __u16 nlflags; int remaining; int attrlen; int err = 1; @@ -2992,6 +3110,10 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); LIST_HEAD(rt6_nh_list); + nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; + if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) + nlflags |= NLM_F_APPEND; + remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; @@ -3034,9 +3156,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) rtnh = rtnh_next(rtnh, &remaining); } + /* for add and replace send one notification with all nexthops. 
+ * Skip the notification in fib6_add_rt2node and send one with + * the full route when done + */ + info->skip_notify = 1; + err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { - err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); + rt_last = nh->rt6_info; + err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc); + /* save reference to first route for notification */ + if (!rt_notif && !err) + rt_notif = nh->rt6_info; + /* nh->rt6_info is used or freed at this point, reset to NULL*/ nh->rt6_info = NULL; if (err) { @@ -3058,9 +3191,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) nhn++; } + /* success ... tell user about new route */ + ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; add_errout: + /* send notification for routes that were added so that + * the delete notifications sent by ip6_route_del are + * coherent + */ + if (rt_notif) + ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); + /* Delete routes that were already added */ list_for_each_entry(nh, &rt6_nh_list, next) { if (err_nh == nh) @@ -3128,8 +3270,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) if (cfg.fc_mp) return ip6_route_multipath_del(&cfg); - else + else { + cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg); + } } static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -3147,8 +3291,20 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return ip6_route_add(&cfg); } -static inline size_t rt6_nlmsg_size(struct rt6_info *rt) +static size_t rt6_nlmsg_size(struct rt6_info *rt) { + int nexthop_len = 0; + + if (rt->rt6i_nsiblings) { + nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16) /* RTA_GATEWAY */ + + nla_total_size(4) /* RTA_OIF */ + + lwtunnel_get_encap_size(rt->dst.lwtstate); + + nexthop_len *= rt->rt6i_nsiblings; + } + return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ + nla_total_size(16) /* RTA_DST */ @@ -3162,14 +3318,69 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->dst.lwtstate); + + lwtunnel_get_encap_size(rt->dst.lwtstate) + + nexthop_len; +} + +static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, + unsigned int *flags) +{ + if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { + *flags |= RTNH_F_LINKDOWN; + if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + *flags |= RTNH_F_DEAD; + } + + if (rt->rt6i_flags & RTF_GATEWAY) { + if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) + goto nla_put_failure; + } + + if (rt->dst.dev && + nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + goto nla_put_failure; + + if (rt->dst.lwtstate && + lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) +{ + struct rtnexthop *rtnh; + unsigned int flags = 0; + + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); + if (!rtnh) + goto nla_put_failure; + + rtnh->rtnh_hops = 0; + rtnh->rtnh_ifindex = rt->dst.dev ? 
rt->dst.dev->ifindex : 0; + + if (rt6_nexthop_info(skb, rt, &flags) < 0) + goto nla_put_failure; + + rtnh->rtnh_flags = flags; + + /* length of rtnetlink header + attributes */ + rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; + + return 0; + +nla_put_failure: + return -EMSGSIZE; } static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct rt6_info *rt, struct in6_addr *dst, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, - int prefix, int nowait, unsigned int flags) + unsigned int flags) { u32 metrics[RTAX_MAX]; struct rtmsg *rtm; @@ -3177,13 +3388,6 @@ static int rt6_fill_node(struct net *net, long expires; u32 table; - if (prefix) { /* user wants prefix routes only */ - if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { - /* success since this is not a prefix route */ - return 1; - } - } - nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; @@ -3223,11 +3427,6 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; - if (!netif_carrier_ok(rt->dst.dev)) { - rtm->rtm_flags |= RTNH_F_LINKDOWN; - if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) - rtm->rtm_flags |= RTNH_F_DEAD; - } rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) @@ -3261,19 +3460,12 @@ static int rt6_fill_node(struct net *net, if (iif) { #ifdef CONFIG_IPV6_MROUTE if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { - int err = ip6mr_get_route(net, skb, rtm, nowait, - portid); - - if (err <= 0) { - if (!nowait) { - if (err == 0) - return 0; - goto nla_put_failure; - } else { - if (err == -EMSGSIZE) - goto nla_put_failure; - } - } + int err = ip6mr_get_route(net, skb, rtm, portid); + + if (err == 0) + return 0; + if (err < 0) + goto nla_put_failure; } else #endif if (nla_put_u32(skb, RTA_IIF, iif)) @@ -3298,17 +3490,35 @@ static int rt6_fill_node(struct net *net, if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (rt->rt6i_flags & RTF_GATEWAY) { - if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) - goto nla_put_failure; - } - - if (rt->dst.dev && - nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) - goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) goto nla_put_failure; + /* For multipath routes, walk the siblings list and add + * each as a nexthop within RTA_MULTIPATH. + */ + if (rt->rt6i_nsiblings) { + struct rt6_info *sibling, *next_sibling; + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + if (rt6_add_nexthop(skb, rt) < 0) + goto nla_put_failure; + + list_for_each_entry_safe(sibling, next_sibling, + &rt->rt6i_siblings, rt6i_siblings) { + if (rt6_add_nexthop(skb, sibling) < 0) + goto nla_put_failure; + } + + nla_nest_end(skb, mp); + } else { + if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags) < 0) + goto nla_put_failure; + } + expires = (rt->rt6i_flags & RTF_EXPIRES) ? 
rt->dst.expires - jiffies : 0; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) @@ -3317,7 +3527,6 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; - lwtunnel_fill_encap(skb, rt->dst.lwtstate); nlmsg_end(skb, nlh); return 0; @@ -3330,18 +3539,26 @@ nla_put_failure: int rt6_dump_route(struct rt6_info *rt, void *p_arg) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; - int prefix; + struct net *net = arg->net; + + if (rt == net->ipv6.ip6_null_entry) + return 0; if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); - prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0; - } else - prefix = 0; - return rt6_fill_node(arg->net, + /* user wants prefix routes only */ + if (rtm->rtm_flags & RTM_F_PREFIX && + !(rt->rt6i_flags & RTF_PREFIX_RT)) { + /* success since this is not a prefix route */ + return 1; + } + } + + return rt6_fill_node(net, arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, - prefix, 0, NLM_F_MULTI); + NLM_F_MULTI); } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) @@ -3422,17 +3639,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) goto errout; } - /* Reserve room for dummy headers, this skb can pass - through good chunk of routing engine. - */ - skb_reset_mac_header(skb); - skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); - skb_dst_set(skb, &rt->dst); err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, 0, 0); + nlh->nlmsg_seq, 0); if (err < 0) { kfree_skb(skb); goto errout; @@ -3459,7 +3670,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, 0, 0, nlm_flags); + event, info->portid, seq, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index b172d85c650a..a855eb325b03 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -176,6 +176,8 @@ static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info) val = nla_data(info->attrs[SEG6_ATTR_DST]); t_new = kmemdup(val, sizeof(*val), GFP_KERNEL); + if (!t_new) + return -ENOMEM; mutex_lock(&sdata->lock); diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index ef1c8a46e7ac..f950cb53d5e3 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -45,7 +45,7 @@ #include <net/seg6_hmac.h> #include <linux/random.h> -static char * __percpu *hmac_ring; +static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -174,7 +174,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, * hash function (RadioGatun) with up to 1216 bits */ - /* saddr(16) + first_seg(1) + cleanup(1) + keyid(4) + seglist(16n) */ + /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; /* this limit allows for 14 segments */ @@ -186,13 +186,13 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, * * 1. Source IPv6 address (128 bits) * 2. first_segment value (8 bits) - * 3. cleanup flag (8 bits: highest bit is cleanup value, others are 0) + * 3. Flags (8 bits) * 4. 
HMAC Key ID (32 bits) * 5. All segments in the segments list (n * 128 bits) */ local_bh_disable(); - ring = *this_cpu_ptr(hmac_ring); + ring = this_cpu_ptr(hmac_ring); off = ring; /* source address */ @@ -202,8 +202,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, /* first_segment value */ *off++ = hdr->first_segment; - /* cleanup flag */ - *off++ = !!(sr_has_cleanup(hdr)) << 7; + /* flags */ + *off++ = hdr->flags; /* HMAC Key ID */ memcpy(off, &hmackeyid, 4); @@ -353,27 +353,6 @@ out: } EXPORT_SYMBOL(seg6_push_hmac); -static int seg6_hmac_init_ring(void) -{ - int i; - - hmac_ring = alloc_percpu(char *); - - if (!hmac_ring) - return -ENOMEM; - - for_each_possible_cpu(i) { - char *ring = kzalloc(SEG6_HMAC_RING_SIZE, GFP_KERNEL); - - if (!ring) - return -ENOMEM; - - *per_cpu_ptr(hmac_ring, i) = ring; - } - - return 0; -} - static int seg6_hmac_init_algo(void) { struct seg6_hmac_algo *algo; @@ -400,7 +379,7 @@ static int seg6_hmac_init_algo(void) *p_tfm = tfm; } - p_tfm = this_cpu_ptr(algo->tfms); + p_tfm = raw_cpu_ptr(algo->tfms); tfm = *p_tfm; shsize = sizeof(*shash) + crypto_shash_descsize(tfm); @@ -410,7 +389,8 @@ static int seg6_hmac_init_algo(void) return -ENOMEM; for_each_possible_cpu(cpu) { - shash = kzalloc(shsize, GFP_KERNEL); + shash = kzalloc_node(shsize, GFP_KERNEL, + cpu_to_node(cpu)); if (!shash) return -ENOMEM; *per_cpu_ptr(algo->shashs, cpu) = shash; @@ -422,16 +402,7 @@ static int seg6_hmac_init_algo(void) int __init seg6_hmac_init(void) { - int ret; - - ret = seg6_hmac_init_ring(); - if (ret < 0) - goto out; - - ret = seg6_hmac_init_algo(); - -out: - return ret; + return seg6_hmac_init_algo(); } EXPORT_SYMBOL(seg6_hmac_init); @@ -450,13 +421,6 @@ void seg6_hmac_exit(void) struct seg6_hmac_algo *algo = NULL; int i, alg_count, cpu; - for_each_possible_cpu(i) { - char *ring = *per_cpu_ptr(hmac_ring, i); - - kfree(ring); - } - free_percpu(hmac_ring); - alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); for (i = 0; i < alg_count; i++) { algo = &hmac_algos[i]; diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index bbfca22c34ae..85582257d3af 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -55,8 +55,8 @@ static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = { [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY }, }; -int nla_put_srh(struct sk_buff *skb, int attrtype, - struct seg6_iptunnel_encap *tuninfo) +static int nla_put_srh(struct sk_buff *skb, int attrtype, + struct seg6_iptunnel_encap *tuninfo) { struct seg6_iptunnel_encap *data; struct nlattr *nla; @@ -235,7 +235,7 @@ static int seg6_do_srh(struct sk_buff *skb) return 0; } -int seg6_input(struct sk_buff *skb) +static int seg6_input(struct sk_buff *skb) { int err; @@ -251,7 +251,7 @@ int seg6_input(struct sk_buff *skb) return dst_input(skb); } -int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; @@ -265,7 +265,9 @@ int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); #ifdef CONFIG_DST_CACHE + preempt_disable(); dst = dst_cache_get(&slwt->cache); + preempt_enable(); #endif if (unlikely(!dst)) { @@ -286,7 +288,9 @@ int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) } #ifdef CONFIG_DST_CACHE + preempt_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + 
preempt_enable(); #endif } @@ -299,7 +303,7 @@ drop: return err; } -static int seg6_build_state(struct net_device *dev, struct nlattr *nla, +static int seg6_build_state(struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts) { @@ -418,6 +422,7 @@ static const struct lwtunnel_encap_ops seg6_iptun_ops = { .fill_encap = seg6_fill_encap_info, .get_encap_size = seg6_encap_nlsize, .cmp_encap = seg6_encap_cmp, + .owner = THIS_MODULE, }; int __init seg6_iptunnel_init(void) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index fad992ad4bc8..99853c6e33a8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1380,6 +1380,7 @@ static int ipip6_tunnel_init(struct net_device *dev) err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (err) { free_percpu(dev->tstats); + dev->tstats = NULL; return err; } diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index a4d49760bf43..895ff650db43 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,7 +16,7 @@ #include <linux/tcp.h> #include <linux/random.h> -#include <linux/cryptohash.h> +#include <linux/siphash.h> #include <linux/kernel.h> #include <net/ipv6.h> #include <net/tcp.h> @@ -24,7 +24,7 @@ #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; +static siphash_key_t syncookie6_secret[2] __read_mostly; /* RFC 2460, Section 8.3: * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] @@ -41,30 +41,27 @@ static __u16 const msstab[] = { 9000 - 60, }; -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); - -static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, +static u32 cookie_hash(const struct in6_addr *saddr, + const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp; + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + u32 count; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *saddr, + .daddr = *daddr, + .count = count, + .sport = sport, + .dport = dport + }; net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); - - tmp = this_cpu_ptr(ipv6_cookie_scratch); - - /* - * we have 320 bits of information to hash, copy in the remaining - * 192 bits required for sha_transform, from the syncookie6_secret - * and overwrite the digest with the secret - */ - memcpy(tmp + 10, syncookie6_secret[c], 44); - memcpy(tmp, saddr, 16); - memcpy(tmp + 4, daddr, 16); - tmp[8] = ((__force u32)sport << 16) + (__force u32)dport; - tmp[9] = count; - sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); - - return tmp[17]; + return siphash(&combined, offsetofend(typeof(combined), dport), + &syncookie6_secret[c]); } static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 73bc8fc68acd..60a5295a7de6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -122,7 +122,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct flowi6 fl6; struct dst_entry *dst; int addr_type; + u32 seq; int err; + struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -148,8 +150,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * connect() to INADDR_ANY means loopback (BSD'ism). 
*/ - if (ipv6_addr_any(&usin->sin6_addr)) - usin->sin6_addr.s6_addr[15] = 0x1; + if (ipv6_addr_any(&usin->sin6_addr)) { + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + &usin->sin6_addr); + else + usin->sin6_addr = in6addr_loopback; + } addr_type = ipv6_addr_type(&usin->sin6_addr); @@ -188,7 +195,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * TCP over IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED) { + if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; @@ -258,7 +265,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); - if (tcp_death_row.sysctl_tw_recycle && + if (tcp_death_row->sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) tcp_fetch_timewait_stamp(sk, dst); @@ -273,18 +280,26 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); - err = inet6_hash_connect(&tcp_death_row, sk); + err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; sk_set_txhash(sk); - if (!tp->write_seq && likely(!tp->repair)) - tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport, - &tp->tsoffset); + if (likely(!tp->repair)) { + seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport, + &tp->tsoffset); + if (!tp->write_seq) + tp->write_seq = seq; + } + + if (tcp_fastopen_defer_connect(sk, &err)) + return err; + if (err) + goto late_failure; err = tcp_connect(sk); if (err) @@ -294,7 +309,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: tcp_set_state(sk, TCP_CLOSE); - __sk_dst_reset(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; @@ -469,7 +483,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -840,7 +854,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); - ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -991,6 +1005,16 @@ drop: return 0; /* don't send reset */ } +static void tcp_v6_restore_cb(struct sk_buff *skb) +{ + /* We need to move header back to the beginning if xfrm6_policy_check() + * and tcp_v6_fill_cb() are going to be called again. + * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. 
+ */ + memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, + sizeof(struct inet6_skb_parm)); +} + static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, @@ -1142,10 +1166,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); - newtp->advmss = dst_metric_advmss(dst); - if (tcp_sk(sk)->rx_opt.user_mss && - tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) - newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); @@ -1182,8 +1203,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * sk_gfp_mask(sk, GFP_ATOMIC)); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) + if (newnp->pktoptions) { + tcp_v6_restore_cb(newnp->pktoptions); skb_set_owner_r(newnp->pktoptions, newsk); + } } } @@ -1198,16 +1221,6 @@ out: return NULL; } -static void tcp_v6_restore_cb(struct sk_buff *skb) -{ - /* We need to move header back to the beginning if xfrm6_policy_check() - * and tcp_v6_fill_cb() are going to be called again. - * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. - */ - memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, - sizeof(struct inet6_skb_parm)); -} - /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * @@ -1620,7 +1633,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), - .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, @@ -1651,7 +1663,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), - .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, @@ -1744,7 +1755,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; @@ -1888,6 +1899,7 @@ struct proto tcpv6_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, @@ -1948,7 +1960,7 @@ static void __net_exit tcpv6_net_exit(struct net *net) static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) { - inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6); + inet_twsk_purge(&tcp_hashinfo, AF_INET6); } static struct pernet_operations tcpv6_net_ops = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4d5c4eee4b3f..4e4c401e3bc6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -55,6 +55,16 @@ #include <trace/events/skb.h> #include "udp_impl.h" +static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if defined(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_udp_l3mdev_accept && + skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) + 
return true; +#endif + return false; +} + static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -103,7 +113,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; - return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); + return udp_lib_get_port(sk, snum, hash2_nulladdr); } static void udp_v6_rehash(struct sock *sk) @@ -118,7 +128,7 @@ static void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif) + int dif, bool exact_dif) { int score; struct inet_sock *inet; @@ -149,7 +159,7 @@ static int compute_score(struct sock *sk, struct net *net, score++; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score++; @@ -165,7 +175,7 @@ static int compute_score(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, + bool exact_dif, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; @@ -176,7 +186,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -212,6 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + bool exact_dif = udp6_lib_exact_dif_match(net, skb); int score, badness, matches = 0, reuseport = 0; u32 hash = 0; @@ -223,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net, goto begin; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; @@ -239,7 +250,8 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, skb); + exact_dif, hslot2, + skb); } return result; } @@ -247,7 +259,8 @@ begin: result = NULL; badness = -1; sk_for_each_rcu(sk, &hslot->head) { - score = compute_score(sk, net, saddr, sport, daddr, hnum, dif); + score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, + exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -441,7 +454,7 @@ try_again: return err; csum_copy_err: - if (!__sk_queue_drop_skb(sk, skb, flags)) { + if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) { if (is_udp4) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); @@ -1033,6 +1046,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; daddr = &sin6->sin6_addr; + if (ipv6_addr_any(daddr) && + ipv6_addr_v4mapped(&np->saddr)) + ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), + daddr); break; case AF_INET: goto do_udp_sendmsg; @@ -1295,7 +1312,8 @@ out: return err; do_confirm: - dst_confirm(dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git 
a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index b5789562aded..08a807b29298 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -33,6 +33,8 @@ EXPORT_SYMBOL(xfrm6_rcv_spi); int xfrm6_transport_finish(struct sk_buff *skb, int async) { + struct xfrm_offload *xo = xfrm_offload(skb); + skb_network_header(skb)[IP6CB(skb)->nhoff] = XFRM_MODE_SKB_CB(skb)->protocol; @@ -44,6 +46,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) ipv6_hdr(skb)->payload_len = htons(skb->len); __skb_push(skb, skb->data - skb_network_header(skb)); + if (xo && (xo->flags & XFRM_GRO)) { + skb_mac_header_rebuild(skb); + return -1; + } + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, ip6_rcv_finish); @@ -69,18 +76,9 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, struct xfrm_state *x = NULL; int i = 0; - /* Allocate new secpath or COW existing one. */ - if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { - struct sec_path *sp; - - sp = secpath_dup(skb->sp); - if (!sp) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); - goto drop; - } - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; + if (secpath_set(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); + goto drop; } if (1 + skb->sp->len == XFRM_MAX_DEPTH) { diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 4e344105b3fd..4439ee44c8b0 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -47,6 +47,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); + struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -55,7 +56,8 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); - skb_reset_transport_header(skb); + if (!xo || !(xo->flags & XFRM_GRO)) + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index e0f71c01d728..79651bc71bf0 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -25,8 +25,6 @@ #include <net/mip6.h> #endif -static struct xfrm_policy_afinfo xfrm6_policy_afinfo; - static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) @@ -220,7 +218,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); - xfrm6_policy_afinfo.garbage_collect(net); + xfrm_garbage_collect_deferred(net); return dst_entries_get_fast(ops) > ops->gc_thresh * 2; } @@ -291,8 +289,7 @@ static struct dst_ops xfrm6_dst_ops_template = { .gc_thresh = INT_MAX, }; -static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { - .family = AF_INET6, +static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, @@ -305,7 +302,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { static int __init xfrm6_policy_init(void) { - return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); + return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6); } static void xfrm6_policy_fini(void) diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index 
54d13f8dbbae..b2dc8ce49378 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -162,9 +162,8 @@ static const struct inet6_protocol ipcomp6_protocol = { .flags = INET6_PROTO_NOPOLICY, }; -static struct xfrm_input_afinfo xfrm6_input_afinfo = { +static const struct xfrm_input_afinfo xfrm6_input_afinfo = { .family = AF_INET6, - .owner = THIS_MODULE, .callback = xfrm6_rcv_cb, }; diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 1215693fdd22..35dbf3dc3d28 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -51,7 +51,7 @@ irnet_ctrl_write(irnet_socket * ap, char * next; /* Next command to process */ int length; /* Length of current command */ - DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); + DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count); /* Check for overflow... */ DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM, @@ -66,7 +66,7 @@ irnet_ctrl_write(irnet_socket * ap, /* Safe terminate the string */ command[count] = '\0'; - DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n", + DEBUG(CTRL_INFO, "Command line received is ``%s'' (%zd).\n", command, count); /* Check every commands in the command line */ @@ -285,7 +285,7 @@ irnet_ctrl_read(irnet_socket * ap, char event[75]; ssize_t ret = 0; - DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); + DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count); #ifdef INITIAL_DISCOVERY /* Check if we have read the log */ @@ -328,7 +328,7 @@ irnet_ctrl_read(irnet_socket * ap, if(ret != 0) { /* No, return the error code */ - DEXIT(CTRL_TRACE, " - ret %Zd\n", ret); + DEXIT(CTRL_TRACE, " - ret %zd\n", ret); return ret; } @@ -568,7 +568,7 @@ dev_irnet_write(struct file * file, { irnet_socket * ap = file->private_data; - DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", + DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n", file, ap, count); DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); @@ -592,7 +592,7 @@ dev_irnet_read(struct file * file, { irnet_socket * ap = file->private_data; - DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", + DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n", file, ap, count); DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c index acbe61c7e683..160dc89335e2 100644 --- a/net/irda/irqueue.c +++ b/net/irda/irqueue.c @@ -383,9 +383,6 @@ EXPORT_SYMBOL(hashbin_new); * for deallocating this structure if it's complex. If not the user can * just supply kfree, which should take care of the job. 
*/ -#ifdef CONFIG_LOCKDEP -static int hashbin_lock_depth = 0; -#endif int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) { irda_queue_t* queue; @@ -396,22 +393,27 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;); /* Synchronize */ - if ( hashbin->hb_type & HB_LOCK ) { - spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags, - hashbin_lock_depth++); - } + if (hashbin->hb_type & HB_LOCK) + spin_lock_irqsave(&hashbin->hb_spinlock, flags); /* * Free the entries in the hashbin, TODO: use hashbin_clear when * it has been shown to work */ for (i = 0; i < HASHBIN_SIZE; i ++ ) { - queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]); - while (queue ) { - if (free_func) - (*free_func)(queue); - queue = dequeue_first( - (irda_queue_t**) &hashbin->hb_queue[i]); + while (1) { + queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]); + + if (!queue) + break; + + if (free_func) { + if (hashbin->hb_type & HB_LOCK) + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + free_func(queue); + if (hashbin->hb_type & HB_LOCK) + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } } } @@ -420,12 +422,8 @@ int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) hashbin->magic = ~HB_MAGIC; /* Release lock */ - if ( hashbin->hb_type & HB_LOCK) { + if (hashbin->hb_type & HB_LOCK) spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); -#ifdef CONFIG_LOCKDEP - hashbin_lock_depth--; -#endif - } /* * Free the hashbin structure diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cfb9e5f4e28f..13190b38f22e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1044,7 +1044,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); - size_t headroom, linear; + size_t headroom = 0; + size_t linear; struct sk_buff *skb; struct iucv_message txmsg = {0}; struct cmsghdr *cmsg; @@ -1122,18 +1123,20 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, * this is fine for SOCK_SEQPACKET (unless we want to support * segmented records using the MSG_EOR flag), but * for SOCK_STREAM we might want to improve it in future */ - headroom = (iucv->transport == AF_IUCV_TRANS_HIPER) - ? 
sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0; - if (headroom + len < PAGE_SIZE) { + if (iucv->transport == AF_IUCV_TRANS_HIPER) { + headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN; linear = len; } else { - /* In nonlinear "classic" iucv skb, - * reserve space for iucv_array - */ - if (iucv->transport != AF_IUCV_TRANS_HIPER) - headroom += sizeof(struct iucv_array) * - (MAX_SKB_FRAGS + 1); - linear = PAGE_SIZE - headroom; + if (len < PAGE_SIZE) { + linear = len; + } else { + /* In nonlinear "classic" iucv skb, + * reserve space for iucv_array + */ + headroom = sizeof(struct iucv_array) * + (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } } skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear, noblock, &err, 0); diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 7e08a4d3d77d..a646f3481240 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -929,23 +929,25 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out_error; } - /* New message, alloc head skb */ - head = alloc_skb(0, sk->sk_allocation); - while (!head) { - kcm_push(kcm); - err = sk_stream_wait_memory(sk, &timeo); - if (err) - goto out_error; - + if (msg_data_left(msg)) { + /* New message, alloc head skb */ head = alloc_skb(0, sk->sk_allocation); - } + while (!head) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; - skb = head; + head = alloc_skb(0, sk->sk_allocation); + } - /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling - * csum_and_copy_from_iter from skb_do_copy_data_nocache. - */ - skb->ip_summed = CHECKSUM_UNNECESSARY; + skb = head; + + /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling + * csum_and_copy_from_iter from skb_do_copy_data_nocache. + */ + skb->ip_summed = CHECKSUM_UNNECESSARY; + } start: while (msg_data_left(msg)) { @@ -1018,10 +1020,12 @@ wait_for_memory: if (eor) { bool not_busy = skb_queue_empty(&sk->sk_write_queue); - /* Message complete, queue it on send buffer */ - __skb_queue_tail(&sk->sk_write_queue, head); - kcm->seq_skb = NULL; - KCM_STATS_INCR(kcm->stats.tx_msgs); + if (head) { + /* Message complete, queue it on send buffer */ + __skb_queue_tail(&sk->sk_write_queue, head); + kcm->seq_skb = NULL; + KCM_STATS_INCR(kcm->stats.tx_msgs); + } if (msg->msg_flags & MSG_BATCH) { kcm->tx_wait_more = true; @@ -1040,8 +1044,10 @@ wait_for_memory: } else { /* Message not complete, save state */ partial_message: - kcm->seq_skb = head; - kcm_tx_msg(head)->last_skb = skb; + if (head) { + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } } KCM_STATS_ADD(kcm->stats.tx_bytes, copied); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 85948c69b236..8adab6335ced 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1058,10 +1058,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, /* Debug */ if (session->send_seq) - l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes, ns=%u\n", + l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n", session->name, data_len, session->ns - 1); else - l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes\n", + l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n", session->name, data_len); if (session->debug & L2TP_MSG_DATA) { @@ -1317,6 +1317,9 @@ static void l2tp_tunnel_del_work(struct work_struct *work) struct sock *sk = NULL; tunnel = container_of(work, struct l2tp_tunnel, del_work); + + l2tp_tunnel_closeall(tunnel); + sk = l2tp_tunnel_sock_lookup(tunnel); if (!sk) goto out; @@ -1639,7 
+1642,6 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { l2tp_tunnel_inc_refcount(tunnel); - l2tp_tunnel_closeall(tunnel); if (false == queue_work(l2tp_wq, &tunnel->del_work)) { l2tp_tunnel_dec_refcount(tunnel); return 1; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 8f560f7140a0..aebf281d09ee 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -263,6 +263,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); +int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); /* Session reference counts. Incremented when code obtains a reference * to a session. diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index e2c6ae024565..8bf18a5f66e0 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -106,8 +106,8 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) +static void l2tp_eth_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct l2tp_eth *priv = netdev_priv(dev); @@ -117,10 +117,8 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev, stats->rx_bytes = atomic_long_read(&priv->rx_bytes); stats->rx_packets = atomic_long_read(&priv->rx_packets); stats->rx_errors = atomic_long_read(&priv->rx_errors); - return stats; } - static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 8938b6ba57a0..d25038cfd64e 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/ioctls.h> #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> @@ -47,23 +48,32 @@ static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) return (struct l2tp_ip_sock *)sk; } -static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id) +static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, + __be32 raddr, int dif, u32 tunnel_id) { struct sock *sk; sk_for_each_bound(sk, &l2tp_ip_bind_table) { - struct inet_sock *inet = inet_sk(sk); - struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); + const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); + const struct inet_sock *inet = inet_sk(sk); - if (l2tp == NULL) + if (!net_eq(sock_net(sk), net)) continue; - if ((l2tp->conn_id == tunnel_id) && - net_eq(sock_net(sk), net) && - !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - (!sk->sk_bound_dev_if || !dif || - sk->sk_bound_dev_if == dif)) - goto found; + if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif) + continue; + + if (inet->inet_rcv_saddr && laddr && + inet->inet_rcv_saddr != laddr) + continue; + + if (inet->inet_daddr && raddr && inet->inet_daddr != raddr) + continue; + + if (l2tp->conn_id != tunnel_id) + continue; + + goto found; } sk = NULL; @@ -71,15 +81,6 @@ found: return sk; } -static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id) -{ - struct sock *sk = __l2tp_ip_bind_lookup(net, laddr, dif, tunnel_id); - if (sk) - sock_hold(sk); - - return sk; -} - /* When processing receive frames, there are two cases to * consider. 
Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header @@ -183,8 +184,8 @@ pass_up: struct iphdr *iph = (struct iphdr *) skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); - sk = __l2tp_ip_bind_lookup(net, iph->daddr, inet_iif(skb), - tunnel_id); + sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, + inet_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&l2tp_ip_lock); goto discard; @@ -265,7 +266,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!sock_flag(sk, SOCK_ZAPPED)) goto out; - if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) + if (sk->sk_state != TCP_CLOSE) goto out; chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); @@ -280,7 +281,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ write_lock_bh(&l2tp_ip_lock); - if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, + if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0, sk->sk_bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&l2tp_ip_lock); ret = -EADDRINUSE; @@ -387,7 +388,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); - return -1; + return 0; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP @@ -560,6 +561,30 @@ out: return err ? err : copied; } +int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct sk_buff *skb; + int amount; + + switch (cmd) { + case SIOCOUTQ: + amount = sk_wmem_alloc_get(sk); + break; + case SIOCINQ: + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + amount = skb ? skb->len : 0; + spin_unlock_bh(&sk->sk_receive_queue.lock); + break; + + default: + return -ENOIOCTLCMD; + } + + return put_user(amount, (int __user *)arg); +} +EXPORT_SYMBOL(l2tp_ioctl); + static struct proto l2tp_ip_prot = { .name = "L2TP/IP", .owner = THIS_MODULE, @@ -568,7 +593,7 @@ static struct proto l2tp_ip_prot = { .bind = l2tp_ip_bind, .connect = l2tp_ip_connect, .disconnect = l2tp_ip_disconnect, - .ioctl = udp_ioctl, + .ioctl = l2tp_ioctl, .destroy = l2tp_ip_destroy_sock, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index f092ac441fdd..a4abcbc4c09a 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -57,25 +57,36 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) return (struct l2tp_ip6_sock *)sk; } -static struct sock *__l2tp_ip6_bind_lookup(struct net *net, - struct in6_addr *laddr, +static struct sock *__l2tp_ip6_bind_lookup(const struct net *net, + const struct in6_addr *laddr, + const struct in6_addr *raddr, int dif, u32 tunnel_id) { struct sock *sk; sk_for_each_bound(sk, &l2tp_ip6_bind_table) { - const struct in6_addr *addr = inet6_rcv_saddr(sk); - struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); + const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); + const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; + const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); - if (l2tp == NULL) + if (!net_eq(sock_net(sk), net)) continue; - if ((l2tp->conn_id == tunnel_id) && - net_eq(sock_net(sk), net) && - (!addr || ipv6_addr_equal(addr, laddr)) && - (!sk->sk_bound_dev_if || !dif || - sk->sk_bound_dev_if == dif)) - goto found; + if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif) + continue; + + if (sk_laddr && !ipv6_addr_any(sk_laddr) && + 
!ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr)) + continue; + + if (!ipv6_addr_any(sk_raddr) && raddr && + !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr)) + continue; + + if (l2tp->conn_id != tunnel_id) + continue; + + goto found; } sk = NULL; @@ -83,17 +94,6 @@ found: return sk; } -static inline struct sock *l2tp_ip6_bind_lookup(struct net *net, - struct in6_addr *laddr, - int dif, u32 tunnel_id) -{ - struct sock *sk = __l2tp_ip6_bind_lookup(net, laddr, dif, tunnel_id); - if (sk) - sock_hold(sk); - - return sk; -} - /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header @@ -197,8 +197,8 @@ pass_up: struct ipv6hdr *iph = ipv6_hdr(skb); read_lock_bh(&l2tp_ip6_lock); - sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, inet6_iif(skb), - tunnel_id); + sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, + inet6_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&l2tp_ip6_lock); goto discard; @@ -330,7 +330,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) rcu_read_unlock(); write_lock_bh(&l2tp_ip6_lock); - if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, bound_dev_if, + if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&l2tp_ip6_lock); err = -EADDRINUSE; @@ -658,7 +658,8 @@ out: return err < 0 ? err : len; do_confirm: - dst_confirm(dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; @@ -730,7 +731,7 @@ static struct proto l2tp_ip6_prot = { .bind = l2tp_ip6_bind, .connect = l2tp_ip6_connect, .disconnect = l2tp_ip6_disconnect, - .ioctl = udp_ioctl, + .ioctl = l2tp_ioctl, .destroy = l2tp_ip6_destroy_sock, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 3e821daf9dd4..8bc5a1bd2d45 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -821,7 +821,10 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) * another trick required to cope with how the PROCOM state * machine works. 
-acme */ + skb_orphan(skb); + sock_hold(sk); skb->sk = sk; + skb->destructor = sock_efree; } if (!sock_owned_by_user(sk)) llc_conn_rcv(sk, skb); diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index d0e1e804ebd7..5404d0d195cc 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -290,7 +290,10 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, ev->type = LLC_SAP_EV_TYPE_PDU; ev->reason = 0; + skb_orphan(skb); + sock_hold(sk); skb->sk = sk; + skb->destructor = sock_efree; llc_sap_state_process(sap, skb); } diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 3891cbd2adea..76e30f4797fb 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -6,6 +6,7 @@ config MAC80211 select CRYPTO_AES select CRYPTO_CCM select CRYPTO_GCM + select CRYPTO_CMAC select CRC32 ---help--- This option enables the hardware independent IEEE 802.11 diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index d0bd5fff5f0a..2fb65588490c 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -22,126 +22,50 @@ #define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */ #define AAD_LEN 20 +static const u8 zero[CMAC_TLEN_256]; -void gf_mulx(u8 *pad) -{ - int i, carry; - - carry = pad[0] & 0x80; - for (i = 0; i < AES_BLOCK_SIZE - 1; i++) - pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7); - pad[AES_BLOCK_SIZE - 1] <<= 1; - if (carry) - pad[AES_BLOCK_SIZE - 1] ^= 0x87; -} - -void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem, - const u8 *addr[], const size_t *len, u8 *mac, - size_t mac_len) -{ - u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE]; - const u8 *pos, *end; - size_t i, e, left, total_len; - - memset(cbc, 0, AES_BLOCK_SIZE); - - total_len = 0; - for (e = 0; e < num_elem; e++) - total_len += len[e]; - left = total_len; - - e = 0; - pos = addr[0]; - end = pos + len[0]; - - while (left >= AES_BLOCK_SIZE) { - for (i = 0; i < AES_BLOCK_SIZE; i++) { - cbc[i] ^= *pos++; - if (pos >= end) { - e++; - pos = addr[e]; - end = pos + len[e]; - } - } - if (left > AES_BLOCK_SIZE) - crypto_cipher_encrypt_one(tfm, cbc, cbc); - left -= AES_BLOCK_SIZE; - } - - memset(pad, 0, AES_BLOCK_SIZE); - crypto_cipher_encrypt_one(tfm, pad, pad); - gf_mulx(pad); - - if (left || total_len == 0) { - for (i = 0; i < left; i++) { - cbc[i] ^= *pos++; - if (pos >= end) { - e++; - pos = addr[e]; - end = pos + len[e]; - } - } - cbc[left] ^= 0x80; - gf_mulx(pad); - } - - for (i = 0; i < AES_BLOCK_SIZE; i++) - pad[i] ^= cbc[i]; - crypto_cipher_encrypt_one(tfm, pad, pad); - memcpy(mac, pad, mac_len); -} - - -void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, +void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, const u8 *data, size_t data_len, u8 *mic) { - const u8 *addr[3]; - size_t len[3]; - u8 zero[CMAC_TLEN]; + SHASH_DESC_ON_STACK(desc, tfm); + u8 out[AES_BLOCK_SIZE]; - memset(zero, 0, CMAC_TLEN); - addr[0] = aad; - len[0] = AAD_LEN; - addr[1] = data; - len[1] = data_len - CMAC_TLEN; - addr[2] = zero; - len[2] = CMAC_TLEN; + desc->tfm = tfm; - aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN); + crypto_shash_init(desc); + crypto_shash_update(desc, aad, AAD_LEN); + crypto_shash_update(desc, data, data_len - CMAC_TLEN); + crypto_shash_finup(desc, zero, CMAC_TLEN, out); + + memcpy(mic, out, CMAC_TLEN); } -void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, +void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad, const u8 *data, size_t data_len, u8 *mic) { - const u8 *addr[3]; - size_t len[3]; - u8 zero[CMAC_TLEN_256]; + 
SHASH_DESC_ON_STACK(desc, tfm); - memset(zero, 0, CMAC_TLEN_256); - addr[0] = aad; - len[0] = AAD_LEN; - addr[1] = data; - len[1] = data_len - CMAC_TLEN_256; - addr[2] = zero; - len[2] = CMAC_TLEN_256; + desc->tfm = tfm; - aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256); + crypto_shash_init(desc); + crypto_shash_update(desc, aad, AAD_LEN); + crypto_shash_update(desc, data, data_len - CMAC_TLEN_256); + crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic); } -struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], - size_t key_len) +struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[], + size_t key_len) { - struct crypto_cipher *tfm; + struct crypto_shash *tfm; - tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash("cmac(aes)", 0, 0); if (!IS_ERR(tfm)) - crypto_cipher_setkey(tfm, key, key_len); + crypto_shash_setkey(tfm, key, key_len); return tfm; } - -void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm) +void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm) { - crypto_free_cipher(tfm); + crypto_free_shash(tfm); } diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h index c827e1d5de8b..fef531f42003 100644 --- a/net/mac80211/aes_cmac.h +++ b/net/mac80211/aes_cmac.h @@ -10,17 +10,14 @@ #define AES_CMAC_H #include <linux/crypto.h> +#include <crypto/hash.h> -void gf_mulx(u8 *pad); -void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem, - const u8 *addr[], const size_t *len, u8 *mac, - size_t mac_len); -struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], - size_t key_len); -void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, +struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[], + size_t key_len); +void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, const u8 *data, size_t data_len, u8 *mic); -void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, +void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad, const u8 *data, size_t data_len, u8 *mic); -void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); +void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm); #endif /* AES_CMAC_H */ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 3b5fd4188f2a..4456559cb056 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -85,7 +85,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, ht_dbg(sta->sdata, "Rx BA session stop requested for %pM tid %u %s reason: %d\n", sta->sta.addr, tid, - initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator", + initiator == WLAN_BACK_RECIPIENT ? 
"recipient" : "initiator", (int)reason); if (drv_ampdu_action(local, sta->sdata, ¶ms)) @@ -398,6 +398,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, tid_agg_rx->timeout = timeout; tid_agg_rx->stored_mpdu_num = 0; tid_agg_rx->auto_seq = auto_seq; + tid_agg_rx->started = false; tid_agg_rx->reorder_buf_filtered = 0; status = WLAN_STATUS_SUCCESS; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e91e503bf992..ac879bb17870 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -208,8 +208,8 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy, if (changes & CFG80211_NAN_CONF_CHANGED_PREF) new_conf.master_pref = conf->master_pref; - if (changes & CFG80211_NAN_CONF_CHANGED_DUAL) - new_conf.dual = conf->dual; + if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) + new_conf.bands = conf->bands; ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); if (!ret) @@ -3563,6 +3563,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif, } EXPORT_SYMBOL(ieee80211_nan_func_match); +static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy, + struct net_device *dev, + const bool enabled) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + sdata->u.ap.multicast_to_unicast = enabled; + + return 0; +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -3653,4 +3664,5 @@ const struct cfg80211_ops mac80211_config_ops = { .nan_change_conf = ieee80211_nan_change_conf, .add_nan_func = ieee80211_add_nan_func, .del_nan_func = ieee80211_del_nan_func, + .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e75cbf6ecc26..89178b46b32f 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -231,9 +231,6 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata) !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) continue; - if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC)) - continue; - max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta)); } rcu_read_unlock(); @@ -1270,7 +1267,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata, *sdata_tmp; struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; struct ieee80211_chanctx *new_ctx = NULL; - int i, err, n_assigned, n_reserved, n_ready; + int err, n_assigned, n_reserved, n_ready; int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0; lockdep_assert_held(&local->mtx); @@ -1391,8 +1388,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) * Update all structures, values and pointers to point to new channel * context(s). 
*/ - - i = 0; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index e02ba42ca827..5fae001f286c 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -243,6 +243,38 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, return rv; } +static ssize_t misc_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + /* Max len of each line is 16 characters, plus 9 for 'pending:\n' */ + size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9; + char *buf; + char *pos, *end; + ssize_t rv; + int i; + int ln; + + buf = kzalloc(bufsz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + pos = buf; + end = buf + bufsz - 1; + + pos += scnprintf(pos, end - pos, "pending:\n"); + + for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { + ln = skb_queue_len(&local->pending[i]); + pos += scnprintf(pos, end - pos, "[%i] %d\n", + i, ln); + } + + rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); + kfree(buf); + return rv; +} + static ssize_t queues_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -263,6 +295,7 @@ static ssize_t queues_read(struct file *file, char __user *user_buf, DEBUGFS_READONLY_FILE_OPS(hwflags); DEBUGFS_READONLY_FILE_OPS(queues); +DEBUGFS_READONLY_FILE_OPS(misc); /* statistics stuff */ @@ -330,7 +363,9 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(total_ps_buffered); DEBUGFS_ADD(wep_iv); + DEBUGFS_ADD(rate_ctrl_alg); DEBUGFS_ADD(queues); + DEBUGFS_ADD(misc); #ifdef CONFIG_PM DEBUGFS_ADD_MODE(reset, 0200); #endif diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 1a05f85cb1f0..8f5fff8b2040 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -519,6 +519,8 @@ static ssize_t ieee80211_if_fmt_aqm( } IEEE80211_IF_FILE_R(aqm); +IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX); + /* IBSS attributes */ static ssize_t ieee80211_if_fmt_tsf( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) @@ -683,6 +685,7 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(dtim_count); DEBUGFS_ADD(num_buffered_multicast); DEBUGFS_ADD_MODE(tkip_mic_test, 0200); + DEBUGFS_ADD_MODE(multicast_to_unicast, 0600); } static void add_vlan_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index f6003b8c2c33..42601820db20 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -522,6 +522,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) return; DEBUGFS_ADD(flags); + DEBUGFS_ADD(aid); DEBUGFS_ADD(num_ps_buf_frames); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c index ecfdd97758a3..3cfb1e2ab7ac 100644 --- a/net/mac80211/fils_aead.c +++ b/net/mac80211/fils_aead.c @@ -9,66 +9,58 @@ #include <crypto/aes.h> #include <crypto/algapi.h> +#include <crypto/hash.h> #include <crypto/skcipher.h> #include "ieee80211_i.h" #include "aes_cmac.h" #include "fils_aead.h" -static int aes_s2v(struct crypto_cipher *tfm, +static void gf_mulx(u8 *pad) +{ + u64 a = get_unaligned_be64(pad); + u64 b = get_unaligned_be64(pad + 8); + + put_unaligned_be64((a << 1) | (b >> 63), pad); + put_unaligned_be64((b << 1) ^ ((a >> 63) ? 
0x87 : 0), pad + 8); +} + +static int aes_s2v(struct crypto_shash *tfm, size_t num_elem, const u8 *addr[], size_t len[], u8 *v) { - u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE]; + u8 d[AES_BLOCK_SIZE], tmp[AES_BLOCK_SIZE] = {}; + SHASH_DESC_ON_STACK(desc, tfm); size_t i; - const u8 *data[2]; - size_t data_len[2], data_elems; + + desc->tfm = tfm; /* D = AES-CMAC(K, <zero>) */ - memset(tmp, 0, AES_BLOCK_SIZE); - data[0] = tmp; - data_len[0] = AES_BLOCK_SIZE; - aes_cmac_vector(tfm, 1, data, data_len, d, AES_BLOCK_SIZE); + crypto_shash_digest(desc, tmp, AES_BLOCK_SIZE, d); for (i = 0; i < num_elem - 1; i++) { /* D = dbl(D) xor AES_CMAC(K, Si) */ gf_mulx(d); /* dbl */ - aes_cmac_vector(tfm, 1, &addr[i], &len[i], tmp, - AES_BLOCK_SIZE); + crypto_shash_digest(desc, addr[i], len[i], tmp); crypto_xor(d, tmp, AES_BLOCK_SIZE); } + crypto_shash_init(desc); + if (len[i] >= AES_BLOCK_SIZE) { /* len(Sn) >= 128 */ - size_t j; - const u8 *pos; - /* T = Sn xorend D */ - - /* Use a temporary buffer to perform xorend on Sn (addr[i]) to - * avoid modifying the const input argument. - */ - data[0] = addr[i]; - data_len[0] = len[i] - AES_BLOCK_SIZE; - pos = addr[i] + data_len[0]; - for (j = 0; j < AES_BLOCK_SIZE; j++) - tmp[j] = pos[j] ^ d[j]; - data[1] = tmp; - data_len[1] = AES_BLOCK_SIZE; - data_elems = 2; + crypto_shash_update(desc, addr[i], len[i] - AES_BLOCK_SIZE); + crypto_xor(d, addr[i] + len[i] - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); } else { /* len(Sn) < 128 */ /* T = dbl(D) xor pad(Sn) */ gf_mulx(d); /* dbl */ - memset(tmp, 0, AES_BLOCK_SIZE); - memcpy(tmp, addr[i], len[i]); - tmp[len[i]] = 0x80; - crypto_xor(d, tmp, AES_BLOCK_SIZE); - data[0] = d; - data_len[0] = sizeof(d); - data_elems = 1; + crypto_xor(d, addr[i], len[i]); + d[len[i]] ^= 0x80; } /* V = AES-CMAC(K, T) */ - aes_cmac_vector(tfm, data_elems, data, data_len, v, AES_BLOCK_SIZE); + crypto_shash_finup(desc, d, AES_BLOCK_SIZE, v); return 0; } @@ -80,7 +72,7 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len, size_t len[], u8 *out) { u8 v[AES_BLOCK_SIZE]; - struct crypto_cipher *tfm; + struct crypto_shash *tfm; struct crypto_skcipher *tfm2; struct skcipher_request *req; int res; @@ -95,14 +87,14 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len, /* S2V */ - tfm = crypto_alloc_cipher("aes", 0, 0); + tfm = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm)) return PTR_ERR(tfm); /* K1 for S2V */ - res = crypto_cipher_setkey(tfm, key, key_len); + res = crypto_shash_setkey(tfm, key, key_len); if (!res) res = aes_s2v(tfm, num_elem, addr, len, v); - crypto_free_cipher(tfm); + crypto_free_shash(tfm); if (res) return res; @@ -124,7 +116,7 @@ static int aes_siv_encrypt(const u8 *key, size_t key_len, /* CTR */ - tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, 0); + tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm2)) { kfree(tmp); return PTR_ERR(tfm2); @@ -157,7 +149,7 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len, size_t num_elem, const u8 *addr[], size_t len[], u8 *out) { - struct crypto_cipher *tfm; + struct crypto_shash *tfm; struct crypto_skcipher *tfm2; struct skcipher_request *req; struct scatterlist src[1], dst[1]; @@ -183,7 +175,7 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len, /* CTR */ - tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, 0); + tfm2 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm2)) return PTR_ERR(tfm2); /* K2 for CTR */ @@ -210,14 +202,14 @@ static int aes_siv_decrypt(const u8 *key, size_t key_len, /* S2V */ - tfm = 
crypto_alloc_cipher("aes", 0, 0); + tfm = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm)) return PTR_ERR(tfm); /* K1 for S2V */ - res = crypto_cipher_setkey(tfm, key, key_len); + res = crypto_shash_setkey(tfm, key, key_len); if (!res) res = aes_s2v(tfm, num_elem, addr, len, check); - crypto_free_cipher(tfm); + crypto_free_shash(tfm); if (res) return res; if (memcmp(check, frame_iv, AES_BLOCK_SIZE) != 0) @@ -272,7 +264,7 @@ int fils_encrypt_assoc_req(struct sk_buff *skb, crypt_len = skb->data + skb->len - encr; skb_put(skb, AES_BLOCK_SIZE); return aes_siv_encrypt(assoc_data->fils_kek, assoc_data->fils_kek_len, - encr, crypt_len, 1, addr, len, encr); + encr, crypt_len, 5, addr, len, encr); } int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index a31d30713d08..98999d3d5262 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -487,14 +487,14 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, struct beacon_data *presp, *old_presp; struct cfg80211_bss *cbss; const struct cfg80211_bss_ies *ies; - u16 capability = 0; + u16 capability = WLAN_CAPABILITY_IBSS; u64 tsf; int ret = 0; sdata_assert_lock(sdata); if (ifibss->privacy) - capability = WLAN_CAPABILITY_PRIVACY; + capability |= WLAN_CAPABILITY_PRIVACY; cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b2069fbd60f9..159a1a733725 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -297,6 +297,7 @@ struct ieee80211_if_ap { driver_smps_mode; /* smps mode request */ struct work_struct request_smps_work; + bool multicast_to_unicast; }; struct ieee80211_if_wds { @@ -624,8 +625,8 @@ struct ieee80211_mesh_sync_ops { struct ieee80211_rx_status *rx_status); /* should be called with beacon_data under RCU read lock */ - void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata, - struct beacon_data *beacon); + void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata, + struct beacon_data *beacon); /* add other framework functions here */ }; @@ -688,7 +689,6 @@ struct ieee80211_if_mesh { const struct ieee80211_mesh_sync_ops *sync_ops; s64 sync_offset_clockdrift_max; spinlock_t sync_offset_lock; - bool adjusting_tbtt; /* mesh power save */ enum nl80211_mesh_power_mode nonpeer_pm; int ps_peers_light_sleep; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 41497b670e2b..40813dd3301c 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -6,6 +6,7 @@ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1122,7 +1123,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev, return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } -static struct rtnl_link_stats64 * +static void ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { int i; @@ -1147,8 +1148,6 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->rx_bytes += rx_bytes; stats->tx_bytes += tx_bytes; } - - return stats; } static const struct net_device_ops ieee80211_dataif_ops = { @@ -1295,6 +1294,26 @@ static void ieee80211_iface_work(struct work_struct *work) 
} else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_VHT) { switch (mgmt->u.action.u.vht_group_notif.action_code) { + case WLAN_VHT_ACTION_OPMODE_NOTIF: { + struct ieee80211_rx_status *status; + enum nl80211_band band; + u8 opmode; + + status = IEEE80211_SKB_RXCB(skb); + band = status->band; + opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + + if (sta) + ieee80211_vht_handle_opmode(sdata, sta, + opmode, + band); + + mutex_unlock(&local->sta_mtx); + break; + } case WLAN_VHT_ACTION_GROUPID_MGMT: ieee80211_process_mu_groups(sdata, mgmt); break; diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 4aa20cef0859..ebdb80b85dc3 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -93,7 +93,7 @@ struct ieee80211_key { } ccmp; struct { u8 rx_pn[IEEE80211_CMAC_PN_LEN]; - struct crypto_cipher *tfm; + struct crypto_shash *tfm; u32 replays; /* dot11RSNAStatsCMACReplays */ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ } aes_cmac; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 1822c77f2b1c..56fb47953b72 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -913,12 +913,17 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; - if (sband->ht_cap.ht_supported) - local->rx_chains = - max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs), - local->rx_chains); + if (!sband->ht_cap.ht_supported) + continue; /* TODO: consider VHT for RX chains, hopefully it's the same */ + local->rx_chains = + max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs), + local->rx_chains); + + /* no need to mask, SM_PS_DISABLED has all bits set */ + sband->ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED << + IEEE80211_HT_CAP_SM_PS_SHIFT; } /* if low-level driver supports AP, we also support VLAN */ diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 42120d965263..6e7b6a07b7d5 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -279,10 +279,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */ *pos |= ifmsh->ps_peers_deep_sleep ? IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00; - *pos++ |= ifmsh->adjusting_tbtt ? 
- IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00; - *pos++ = 0x00; - return 0; } @@ -339,7 +335,7 @@ int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata, /* fast-forward to vendor IEs */ offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0); - if (offset) { + if (offset < ifmsh->ie_len) { len = ifmsh->ie_len - offset; data = ifmsh->ie + offset; if (skb_tailroom(skb) < len) @@ -685,7 +681,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + /* NULL SSID */ /* Channel Switch Announcement */ 2 + sizeof(struct ieee80211_channel_sw_ie) + - /* Mesh Channel Swith Parameters */ + /* Mesh Channel Switch Parameters */ 2 + sizeof(struct ieee80211_mesh_chansw_params_ie) + 2 + 8 + /* supported rates */ 2 + 3; /* DS params */ @@ -850,7 +846,6 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) ifmsh->mesh_cc_id = 0; /* Disabled */ /* register sync ops from extensible synchronization framework */ ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id); - ifmsh->adjusting_tbtt = false; ifmsh->sync_offset_clockdrift_max = 0; set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); ieee80211_mesh_root_setup(ifmsh); @@ -1349,7 +1344,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata) ieee80211_mesh_rootpath(sdata); if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags)) - mesh_sync_adjust_tbtt(sdata); + mesh_sync_adjust_tsf(sdata); if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags)) mesh_bss_info_changed(sdata); diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 26b9ccbe1fce..7e5f271e3c30 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -341,7 +341,7 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) } void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata); -void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata); +void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata); void ieee80211s_stop(void); #else static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 7fcdcf622655..fcba70e57073 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -505,12 +505,14 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, /* Userspace handles station allocation */ if (sdata->u.mesh.user_mpm || - sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) - cfg80211_notify_new_peer_candidate(sdata->dev, addr, - elems->ie_start, - elems->total_len, - GFP_KERNEL); - else + sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) { + if (mesh_peer_accepts_plinks(elems) && + mesh_plink_availables(sdata)) + cfg80211_notify_new_peer_candidate(sdata->dev, addr, + elems->ie_start, + elems->total_len, + GFP_KERNEL); + } else sta = __mesh_sta_info_alloc(sdata, addr); return sta; diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index faca22cd02b5..a435f094a82e 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -12,7 +12,7 @@ #include "mesh.h" #include "driver-ops.h" -/* This is not in the standard. It represents a tolerable tbtt drift below +/* This is not in the standard. It represents a tolerable tsf drift below * which we do no TSF adjustment. 
*/ #define TOFFSET_MINIMUM_ADJUSTMENT 10 @@ -46,7 +46,7 @@ static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie) IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0; } -void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) +void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; @@ -57,12 +57,12 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) spin_lock_bh(&ifmsh->sync_offset_lock); if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) { - msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n", + msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting\n", (long long) ifmsh->sync_offset_clockdrift_max); tsfdelta = -ifmsh->sync_offset_clockdrift_max; ifmsh->sync_offset_clockdrift_max = 0; } else { - msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting by %llu\n", + msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting by %llu\n", (long long) ifmsh->sync_offset_clockdrift_max, (unsigned long long) beacon_int_fraction); tsfdelta = -beacon_int_fraction; @@ -123,7 +123,6 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, */ if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) { - clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN); msync_dbg(sdata, "STA %pM : is adjusting TBTT\n", sta->sta.addr); goto no_sync; @@ -168,15 +167,13 @@ no_sync: rcu_read_unlock(); } -static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, +static void mesh_sync_offset_adjust_tsf(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - u8 cap; WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET); WARN_ON(!rcu_read_lock_held()); - cap = beacon->meshconf->meshconf_cap; spin_lock_bh(&ifmsh->sync_offset_lock); @@ -187,24 +184,16 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, * the tsf adjustment to the mesh tasklet */ msync_dbg(sdata, - "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n", + "TSF : kicking off TSF adjustment with clockdrift_max=%lld\n", ifmsh->sync_offset_clockdrift_max); set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags); - - ifmsh->adjusting_tbtt = true; } else { msync_dbg(sdata, - "TBTT : max clockdrift=%lld; too small to adjust\n", + "TSF : max clockdrift=%lld; too small to adjust\n", (long long)ifmsh->sync_offset_clockdrift_max); ifmsh->sync_offset_clockdrift_max = 0; - - ifmsh->adjusting_tbtt = false; } spin_unlock_bh(&ifmsh->sync_offset_lock); - - beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ? 
- IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap : - ~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap; } static const struct sync_method sync_methods[] = { @@ -212,7 +201,7 @@ static const struct sync_method sync_methods[] = { .method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET, .ops = { .rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp, - .adjust_tbtt = &mesh_sync_offset_adjust_tbtt, + .adjust_tsf = &mesh_sync_offset_adjust_tsf, } }, }; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 098ce9b179ee..6e90301154d5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1486,10 +1486,6 @@ void ieee80211_recalc_ps(struct ieee80211_local *local) if (count == 1 && ieee80211_powersave_allowed(found)) { u8 dtimper = found->u.mgd.dtim_period; - s32 beaconint_us; - - beaconint_us = ieee80211_tu_to_usec( - found->vif.bss_conf.beacon_int); timeout = local->dynamic_ps_forced_timeout; if (timeout < 0) @@ -3423,14 +3419,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, - GFP_KERNEL); + sig, GFP_KERNEL); } else if (sig > thold && (last_event == 0 || sig > last_event + hyst)) { ifmgd->last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, - GFP_KERNEL); + sig, GFP_KERNEL); } } @@ -5045,13 +5041,14 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, enum nl80211_cqm_rssi_threshold_event rssi_event, + s32 rssi_level, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - trace_api_cqm_rssi_notify(sdata, rssi_event); + trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level); - cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp); + cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp); } EXPORT_SYMBOL(ieee80211_cqm_rssi_notify); diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 28a3a0957c9e..76a8bcd8ef11 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -168,6 +168,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) break; } + flush_delayed_work(&sdata->dec_tailroom_needed_wk); drv_remove_interface(local, sdata); } diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 14c5ba3a1b1c..3ebe4405a2d4 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -159,21 +159,23 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) { + unsigned int cur_prob; + if (unlikely(mrs->attempts > 0)) { mrs->sample_skipped = 0; - mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); + cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); if (unlikely(!mrs->att_hist)) { - mrs->prob_ewma = mrs->cur_prob; + mrs->prob_ewma = cur_prob; } else { /* update exponential weighted moving variance */ - mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd, - mrs->cur_prob, - mrs->prob_ewma, - EWMA_LEVEL); + mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv, + cur_prob, + mrs->prob_ewma, + EWMA_LEVEL); /*update exponential weighted moving avarage */ mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, - mrs->cur_prob, + cur_prob, EWMA_LEVEL); } mrs->att_hist += mrs->attempts; @@ -365,6 +367,11 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, return; #endif + /* Don't use EAPOL frames for sampling on non-mrr hw */ + if (mp->hw->max_rates == 1 && + (info->control.flags & 
IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) + return; + delta = (mi->total_packets * sampling_ratio / 100) - (mi->sample_packets + mi->sample_deferred / 2); diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index c230bbe93262..be6c3f35f48b 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -14,7 +14,7 @@ #define SAMPLE_COLUMNS 10 /* number of columns in sample table */ /* scaled fraction values */ -#define MINSTREL_SCALE 16 +#define MINSTREL_SCALE 12 #define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) #define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) @@ -36,21 +36,16 @@ minstrel_ewma(int old, int new, int weight) } /* - * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation + * Perform EWMV (Exponentially Weighted Moving Variance) calculation */ static inline int -minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight) +minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight) { - int diff, incr, tmp_var; + int diff, incr; - /* calculate exponential weighted moving variance */ - diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000); + diff = cur_prob - prob_ewma; incr = (EWMA_DIV - weight) * diff / EWMA_DIV; - tmp_var = old_ewmsd * old_ewmsd; - tmp_var = weight * (tmp_var + diff * incr / 1000000) / EWMA_DIV; - - /* return standard deviation */ - return (u16) int_sqrt(tmp_var); + return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV; } struct minstrel_rate_stats { @@ -59,15 +54,13 @@ struct minstrel_rate_stats { u16 success, last_success; /* total attempts/success counters */ - u64 att_hist, succ_hist; + u32 att_hist, succ_hist; /* statistis of packet delivery probability - * cur_prob - current prob within last update intervall * prob_ewma - exponential weighted moving average of prob * prob_ewmsd - exp. 
weighted moving standard deviation of prob */ - unsigned int cur_prob; - unsigned int prob_ewma; - u16 prob_ewmsd; + u16 prob_ewma; + u16 prob_ewmv; /* maximum retry counts */ u8 retry_count; @@ -153,6 +146,14 @@ struct minstrel_debugfs_info { char buf[]; }; +/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */ +static inline int +minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs) +{ + unsigned int ewmv = mrs->prob_ewmv; + return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000)); +} + extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); void minstrel_remove_sta_debugfs(void *priv, void *priv_sta); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index 820b0abc9c0d..36fc971deb86 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -75,7 +75,7 @@ minstrel_stats_open(struct inode *inode, struct file *file) { struct minstrel_sta_info *mi = inode->i_private; struct minstrel_debugfs_info *ms; - unsigned int i, tp_max, tp_avg, prob, eprob; + unsigned int i, tp_max, tp_avg, eprob; char *p; ms = kmalloc(2048, GFP_KERNEL); @@ -86,13 +86,14 @@ minstrel_stats_open(struct inode *inode, struct file *file) p = ms->buf; p += sprintf(p, "\n"); p += sprintf(p, - "best __________rate_________ ________statistics________ ________last_______ ______sum-of________\n"); + "best __________rate_________ ________statistics________ ____last_____ ______sum-of________\n"); p += sprintf(p, - "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); + "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n"); for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; struct minstrel_rate_stats *mrs = &mi->r[i].stats; + unsigned int prob_ewmsd; *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; @@ -107,17 +108,16 @@ minstrel_stats_open(struct inode *inode, struct file *file) tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + prob_ewmsd = minstrel_get_ewmsd10(mrs); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" - " %3u.%1u %3u %3u %-3u " + " %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, - prob / 10, prob % 10, + prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, @@ -148,7 +148,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) { struct minstrel_sta_info *mi = inode->i_private; struct minstrel_debugfs_info *ms; - unsigned int i, tp_max, tp_avg, prob, eprob; + unsigned int i, tp_max, tp_avg, eprob; char *p; ms = kmalloc(2048, GFP_KERNEL); @@ -161,6 +161,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; struct minstrel_rate_stats *mrs = &mi->r[i].stats; + unsigned int prob_ewmsd; p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : "")); p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? 
"B" : "")); @@ -175,16 +176,15 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + prob_ewmsd = minstrel_get_ewmsd10(mrs); - p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," + p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," "%llu,%llu,%d,%d\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, - prob / 10, prob % 10, + prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 30fbabf4bcbc..8e783e197e93 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -14,6 +14,7 @@ #include <linux/ieee80211.h> #include <net/mac80211.h> #include "rate.h" +#include "sta_info.h" #include "rc80211_minstrel.h" #include "rc80211_minstrel_ht.h" @@ -154,67 +155,47 @@ MODULE_PARM_DESC(minstrel_vht_only, const struct mcs_group minstrel_mcs_groups[] = { MCS_GROUP(1, 0, BW_20), MCS_GROUP(2, 0, BW_20), -#if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 0, BW_20), -#endif MCS_GROUP(1, 1, BW_20), MCS_GROUP(2, 1, BW_20), -#if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 1, BW_20), -#endif MCS_GROUP(1, 0, BW_40), MCS_GROUP(2, 0, BW_40), -#if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 0, BW_40), -#endif MCS_GROUP(1, 1, BW_40), MCS_GROUP(2, 1, BW_40), -#if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 1, BW_40), -#endif CCK_GROUP, #ifdef CONFIG_MAC80211_RC_MINSTREL_VHT VHT_GROUP(1, 0, BW_20), VHT_GROUP(2, 0, BW_20), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 0, BW_20), -#endif VHT_GROUP(1, 1, BW_20), VHT_GROUP(2, 1, BW_20), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 1, BW_20), -#endif VHT_GROUP(1, 0, BW_40), VHT_GROUP(2, 0, BW_40), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 0, BW_40), -#endif VHT_GROUP(1, 1, BW_40), VHT_GROUP(2, 1, BW_40), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 1, BW_40), -#endif VHT_GROUP(1, 0, BW_80), VHT_GROUP(2, 0, BW_80), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 0, BW_80), -#endif VHT_GROUP(1, 1, BW_80), VHT_GROUP(2, 1, BW_80), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 1, BW_80), #endif -#endif }; static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; @@ -301,7 +282,7 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, break; /* short preamble */ - if (!(mi->groups[group].supported & BIT(idx))) + if (!(mi->supported[group] & BIT(idx))) idx += 4; } return &mi->groups[group].rates[idx]; @@ -486,7 +467,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) MCS_GROUP_RATES].streams; for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; - if (!mg->supported || group == MINSTREL_CCK_GROUP) + if (!mi->supported[group] || group == MINSTREL_CCK_GROUP) continue; tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; @@ -540,7 +521,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; - if (!mg->supported) + if (!mi->supported[group]) continue; mi->sample_count++; @@ -550,7 +531,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) tmp_group_tp_rate[j] = group; for (i = 0; i < MCS_GROUP_RATES; i++) { - if 
(!(mg->supported & BIT(i))) + if (!(mi->supported[group] & BIT(i))) continue; index = MCS_GROUP_RATES * group + i; @@ -636,7 +617,7 @@ minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi) mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups); mg = &mi->groups[mi->sample_group]; - if (!mg->supported) + if (!mi->supported[mi->sample_group]) continue; if (++mg->index >= MCS_GROUP_RATES) { @@ -657,7 +638,7 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary) while (group > 0) { group--; - if (!mi->groups[group].supported) + if (!mi->supported[group]) continue; if (minstrel_mcs_groups[group].streams > @@ -994,7 +975,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) sample_idx = sample_table[mg->column][mg->index]; minstrel_set_next_sample_idx(mi); - if (!(mg->supported & BIT(sample_idx))) + if (!(mi->supported[sample_group] & BIT(sample_idx))) return -1; mrs = &mg->rates[sample_idx]; @@ -1049,22 +1030,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) } static void -minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp, - struct minstrel_ht_sta *mi, bool val) -{ - u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported; - - if (!supported || !mi->cck_supported_short) - return; - - if (supported & (mi->cck_supported_short << (val * 4))) - return; - - supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4); - mi->groups[MINSTREL_CCK_GROUP].supported = supported; -} - -static void minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct ieee80211_tx_rate_control *txrc) { @@ -1087,7 +1052,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, minstrel_aggr_check(sta, txrc->skb); info->flags |= mi->tx_flags; - minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble); #ifdef CONFIG_MAC80211_DEBUGFS if (mp->fixed_rate_idx != -1) @@ -1154,7 +1118,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, mi->cck_supported_short |= BIT(i); } - mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported; + mi->supported[MINSTREL_CCK_GROUP] = mi->cck_supported; } static void @@ -1168,6 +1132,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; u16 sta_cap = sta->ht_cap.cap; struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; + struct sta_info *sinfo = container_of(sta, struct sta_info, sta); int use_vht; int n_supported = 0; int ack_dur; @@ -1224,7 +1189,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, u32 gflags = minstrel_mcs_groups[i].flags; int bw, nss; - mi->groups[i].supported = 0; + mi->supported[i] = 0; if (i == MINSTREL_CCK_GROUP) { minstrel_ht_update_cck(mp, mi, sband, sta); continue; @@ -1256,8 +1221,8 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, if (use_vht && minstrel_vht_only) continue; #endif - mi->groups[i].supported = mcs->rx_mask[nss - 1]; - if (mi->groups[i].supported) + mi->supported[i] = mcs->rx_mask[nss - 1]; + if (mi->supported[i]) n_supported++; continue; } @@ -1283,16 +1248,19 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, else bw = BW_20; - mi->groups[i].supported = minstrel_get_valid_vht_rates(bw, nss, + mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss, vht_cap->vht_mcs.tx_mcs_map); - if (mi->groups[i].supported) + if (mi->supported[i]) n_supported++; } if (!n_supported) goto use_legacy; + if 
(test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE)) + mi->cck_supported_short |= mi->cck_supported_short << 4; + /* create an initial rate table with the lowest supported rates */ minstrel_ht_update_stats(mp, mi); minstrel_ht_update_rates(mp, mi); diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index e8b52a94d24b..de1646c42e82 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -52,9 +52,6 @@ struct minstrel_mcs_group_data { u8 index; u8 column; - /* bitfield of supported MCS rates of this group */ - u16 supported; - /* sorted rate set within a MCS group*/ u16 max_group_tp_rate[MAX_THR_RATES]; u16 max_group_prob_rate; @@ -101,6 +98,9 @@ struct minstrel_ht_sta { u8 cck_supported; u8 cck_supported_short; + /* Bitfield of supported MCS rates of all groups */ + u16 supported[MINSTREL_GROUPS_NB]; + /* MCS rate group info and statistics */ struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB]; }; diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 5320e35ed3d0..7d969e300fb3 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -19,12 +19,12 @@ static char * minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) { const struct mcs_group *mg; - unsigned int j, tp_max, tp_avg, prob, eprob, tx_time; + unsigned int j, tp_max, tp_avg, eprob, tx_time; char htmode = '2'; char gimode = 'L'; u32 gflags; - if (!mi->groups[i].supported) + if (!mi->supported[i]) return p; mg = &minstrel_mcs_groups[i]; @@ -41,8 +41,9 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; static const int bitrates[4] = { 10, 20, 55, 110 }; int idx = i * MCS_GROUP_RATES + j; + unsigned int prob_ewmsd; - if (!(mi->groups[i].supported & BIT(j))) + if (!(mi->supported[i] & BIT(j))) continue; if (gflags & IEEE80211_TX_RC_MCS) { @@ -83,17 +84,16 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + prob_ewmsd = minstrel_get_ewmsd10(mrs); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" - " %3u.%1u %3u %3u %-3u " + " %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, - prob / 10, prob % 10, + prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, @@ -130,9 +130,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "\n"); p += sprintf(p, - " best ____________rate__________ ________statistics________ ________last_______ ______sum-of________\n"); + " best ____________rate__________ ________statistics________ _____last____ ______sum-of________\n"); p += sprintf(p, - "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); + "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n"); p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); for (i = 0; i < MINSTREL_CCK_GROUP; i++) @@ -165,12 +165,12 @@ static char * minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) { const struct mcs_group *mg; - unsigned int j, tp_max, tp_avg, 
prob, eprob, tx_time; + unsigned int j, tp_max, tp_avg, eprob, tx_time; char htmode = '2'; char gimode = 'L'; u32 gflags; - if (!mi->groups[i].supported) + if (!mi->supported[i]) return p; mg = &minstrel_mcs_groups[i]; @@ -187,8 +187,9 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; static const int bitrates[4] = { 10, 20, 55, 110 }; int idx = i * MCS_GROUP_RATES + j; + unsigned int prob_ewmsd; - if (!(mi->groups[i].supported & BIT(j))) + if (!(mi->supported[i] & BIT(j))) continue; if (gflags & IEEE80211_TX_RC_MCS) { @@ -226,16 +227,15 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + prob_ewmsd = minstrel_get_ewmsd10(mrs); - p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," + p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," "%u,%llu,%llu,", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, - prob / 10, prob % 10, + prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3e289a64ed43..e48724a6725e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright(c) 2015 - 2016 Intel Deutschland GmbH + * Copyright(c) 2015 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1034,6 +1034,18 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata buf_size = tid_agg_rx->buf_size; head_seq_num = tid_agg_rx->head_seq_num; + /* + * If the current MPDU's SN is smaller than the SSN, it shouldn't + * be reordered. 
+ */ + if (unlikely(!tid_agg_rx->started)) { + if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { + ret = false; + goto out; + } + tid_agg_rx->started = true; + } + /* frame with out of date sequence number */ if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { dev_kfree_skb(skb); @@ -1391,7 +1403,7 @@ EXPORT_SYMBOL(ieee80211_sta_pspoll); void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - u8 ac = ieee802_1d_to_ac[tid & 7]; + int ac = ieee80211_ac_from_tid(tid); /* * If this AC is not trigger-enabled do nothing unless the @@ -1908,7 +1920,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) unsigned int frag, seq; struct ieee80211_fragment_entry *entry; struct sk_buff *skb; - struct ieee80211_rx_status *status; hdr = (struct ieee80211_hdr *)rx->skb->data; fc = hdr->frame_control; @@ -2034,9 +2045,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) dev_kfree_skb(skb); } - /* Complete frame has been reassembled - process it now */ - status = IEEE80211_SKB_RXCB(rx->skb); - out: ieee80211_led_rx(rx->local); out_no_led: @@ -2472,7 +2480,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (!ifmsh->mshcfg.dot11MeshForwarding) goto out; - fwd_skb = skb_copy_expand(skb, local->tx_headroom, 0, GFP_ATOMIC); + fwd_skb = skb_copy_expand(skb, local->tx_headroom + + sdata->encrypt_headroom, 0, GFP_ATOMIC); if (!fwd_skb) { net_info_ratelimited("%s: failed to clone mesh frame\n", sdata->name); @@ -2880,17 +2889,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.u.vht_opmode_notif.action_code) { case WLAN_VHT_ACTION_OPMODE_NOTIF: { - u8 opmode; - /* verify opmode is present */ if (len < IEEE80211_MIN_ACTION_SIZE + 2) goto invalid; - - opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; - - ieee80211_vht_handle_opmode(rx->sdata, rx->sta, - opmode, status->band); - goto handled; + goto queue; } case WLAN_VHT_ACTION_GROUPID_MGMT: { if (len < IEEE80211_MIN_ACTION_SIZE + 25) @@ -3890,6 +3892,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, stats->last_rate = sta_stats_encode_rate(status); stats->fragments++; + stats->packets++; if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { stats->last_signal = status->signal; @@ -3942,21 +3945,31 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, u64_stats_update_end(&stats->syncp); if (fast_rx->internal_forward) { - struct sta_info *dsta = sta_info_get(rx->sdata, skb->data); + struct sk_buff *xmit_skb = NULL; + bool multicast = is_multicast_ether_addr(skb->data); - if (dsta) { + if (multicast) { + xmit_skb = skb_copy(skb, GFP_ATOMIC); + } else if (sta_info_get(rx->sdata, skb->data)) { + xmit_skb = skb; + skb = NULL; + } + + if (xmit_skb) { /* * Send to wireless media and increase priority by 256 * to keep the received priority instead of * reclassifying the frame (see cfg80211_classify8021d). 
*/ - skb->priority += 256; - skb->protocol = htons(ETH_P_802_3); - skb_reset_network_header(skb); - skb_reset_mac_header(skb); - dev_queue_xmit(skb); - return true; + xmit_skb->priority += 256; + xmit_skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(xmit_skb); + skb_reset_mac_header(xmit_skb); + dev_queue_xmit(xmit_skb); } + + if (!skb) + return true; } /* deliver to local stack */ @@ -4073,15 +4086,17 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, ieee80211_is_beacon(hdr->frame_control))) ieee80211_scan_rx(local, skb); - if (pubsta) { - rx.sta = container_of(pubsta, struct sta_info, sta); - rx.sdata = rx.sta->sdata; - if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) - return; - goto out; - } else if (ieee80211_is_data(fc)) { + if (ieee80211_is_data(fc)) { struct sta_info *sta, *prev_sta; + if (pubsta) { + rx.sta = container_of(pubsta, struct sta_info, sta); + rx.sdata = rx.sta->sdata; + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) + return; + goto out; + } + prev_sta = NULL; for_each_sta_info(local, hdr->addr2, sta, tmp) { diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 23d8ac829279..faab3c490d2b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -1120,7 +1120,6 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; - size_t len; iebufsz = local->scan_ies_len + req->ie_len; @@ -1145,10 +1144,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_prepare_scan_chandef(&chandef, req->scan_width); - len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, - &sched_scan_ies, req->ie, - req->ie_len, bands_used, - rate_masks, &chandef); + ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, + &sched_scan_ies, req->ie, + req->ie_len, bands_used, rate_masks, &chandef); ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index b6cfcf038c11..3323a2fb289b 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -513,23 +513,23 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - struct station_info *sinfo; + struct station_info *sinfo = NULL; int err = 0; lockdep_assert_held(&local->sta_mtx); - sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); - if (!sinfo) { - err = -ENOMEM; - goto out_err; - } - /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; goto out_err; } + sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); + if (!sinfo) { + err = -ENOMEM; + goto out_err; + } + local->num_sta++; local->sta_generation++; smp_mb(); @@ -688,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } /* No need to do anything if the driver does all */ - if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) + if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) @@ -1264,7 +1264,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) sta_info_recalc_tim(sta); ps_dbg(sdata, - "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n", + "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); @@ -1501,8 +1501,8 @@ 
ieee80211_sta_ps_deliver_response(struct sta_info *sta, /* This will evaluate to 1, 3, 5 or 7. */ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++) - if (ignored_acs & BIT(ac)) - continue; + if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac])) + break; tid = 7 - 2 * ac; ieee80211_send_null_response(sta, tid, reason, true, false); @@ -2051,16 +2051,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; - struct rate_control_ref *ref = NULL; u32 thr = 0; int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta); - if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) - ref = local->rate_ctrl; - sinfo->generation = sdata->local->sta_generation; /* do before driver, so beacon filtering drivers have a diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index dd06ef0b8861..15599c70a38f 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -189,6 +189,7 @@ struct tid_ampdu_tx { * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and * and ssn. * @removed: this session is removed (but might have been found due to RCU) + * @started: this session has started (head ssn or higher was received) * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. @@ -212,8 +213,9 @@ struct tid_ampdu_rx { u16 ssn; u16 buf_size; u16 timeout; - bool auto_seq; - bool removed; + u8 auto_seq:1, + removed:1, + started:1; }; /** diff --git a/net/mac80211/status.c b/net/mac80211/status.c index ddf71c648cab..83b8b11f24ea 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -51,7 +51,8 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct ieee80211_hdr *hdr = (void *)skb->data; int ac; - if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) { + if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | + IEEE80211_TX_CTL_AMPDU)) { ieee80211_free_txskb(&local->hw, skb); return; } @@ -95,7 +96,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, */ if (*p & IEEE80211_QOS_CTL_EOSP) *p &= ~IEEE80211_QOS_CTL_EOSP; - ac = ieee802_1d_to_ac[tid & 7]; + ac = ieee80211_ac_from_tid(tid); } else { ac = IEEE80211_AC_BE; } @@ -462,9 +463,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, unsigned long flags; spin_lock_irqsave(&local->ack_status_lock, flags); - skb = idr_find(&local->ack_status_frames, info->ack_frame_id); - if (skb) - idr_remove(&local->ack_status_frames, info->ack_frame_id); + skb = idr_remove(&local->ack_status_frames, info->ack_frame_id); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (!skb) @@ -541,6 +540,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, } else if (info->ack_frame_id) { ieee80211_report_ack_skb(local, info, acked, dropped); } + + if (!dropped && skb->destructor) { + skb->wifi_acked_valid = 1; + skb->wifi_acked = acked; + } } /* @@ -633,10 +637,9 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_supported_band *sband; int retry_count; - int rates_idx; bool acked, noack_success; - rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); + ieee80211_tx_get_rates(hw, info, &retry_count); sband = hw->wiphy->bands[info->band]; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 92a47afaa989..0d645bc148d0 100644 --- a/net/mac80211/trace.h 
+++ b/net/mac80211/trace.h @@ -1736,21 +1736,21 @@ TRACE_EVENT(drv_start_nan, LOCAL_ENTRY VIF_ENTRY __field(u8, master_pref) - __field(u8, dual) + __field(u8, bands) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->master_pref = conf->master_pref; - __entry->dual = conf->dual; + __entry->bands = conf->bands; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT - ", master preference: %u, dual: %d", + ", master preference: %u, bands: 0x%0x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, - __entry->dual + __entry->bands ) ); @@ -1787,7 +1787,7 @@ TRACE_EVENT(drv_nan_change_conf, LOCAL_ENTRY VIF_ENTRY __field(u8, master_pref) - __field(u8, dual) + __field(u8, bands) __field(u32, changes) ), @@ -1795,15 +1795,15 @@ TRACE_EVENT(drv_nan_change_conf, LOCAL_ASSIGN; VIF_ASSIGN; __entry->master_pref = conf->master_pref; - __entry->dual = conf->dual; + __entry->bands = conf->bands; __entry->changes = changes; ), TP_printk( LOCAL_PR_FMT VIF_PR_FMT - ", master preference: %u, dual: %d, changes: 0x%x", + ", master preference: %u, bands: 0x%0x, changes: 0x%x", LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, - __entry->dual, __entry->changes + __entry->bands, __entry->changes ) ); @@ -1996,23 +1996,26 @@ TRACE_EVENT(api_connection_loss, TRACE_EVENT(api_cqm_rssi_notify, TP_PROTO(struct ieee80211_sub_if_data *sdata, - enum nl80211_cqm_rssi_threshold_event rssi_event), + enum nl80211_cqm_rssi_threshold_event rssi_event, + s32 rssi_level), - TP_ARGS(sdata, rssi_event), + TP_ARGS(sdata, rssi_event, rssi_level), TP_STRUCT__entry( VIF_ENTRY __field(u32, rssi_event) + __field(s32, rssi_level) ), TP_fast_assign( VIF_ASSIGN; __entry->rssi_event = rssi_event; + __entry->rssi_level = rssi_level; ), TP_printk( - VIF_PR_FMT " event:%d", - VIF_PR_ARG, __entry->rssi_event + VIF_PR_FMT " event:%d rssi:%d", + VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level ) ); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2c21b7039136..ba8d7db0a071 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -16,6 +16,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/skbuff.h> +#include <linux/if_vlan.h> #include <linux/etherdevice.h> #include <linux/bitmap.h> #include <linux/rcupdate.h> @@ -63,6 +64,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct ieee80211_chanctx_conf *chanctx_conf; u32 rate_flags = 0; + /* assume HW handles this */ + if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) + return 0; + rcu_read_lock(); chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf); if (chanctx_conf) { @@ -71,10 +76,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, } rcu_read_unlock(); - /* assume HW handles this */ - if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) - return 0; - /* uh huh? 
*/ if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; @@ -1243,7 +1244,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, + struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; @@ -1257,10 +1258,13 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, if (!ieee80211_is_data(hdr->frame_control)) return NULL; - if (pubsta) { + if (sta) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; - txq = pubsta->txq[tid]; + if (!sta->uploaded) + return NULL; + + txq = sta->sta.txq[tid]; } else if (vif) { txq = vif->txq; } @@ -1410,7 +1414,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, txqi->txq.sta = &sta->sta; sta->sta.txq[tid] = &txqi->txq; txqi->txq.tid = tid; - txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; + txqi->txq.ac = ieee80211_ac_from_tid(tid); } else { sdata->vif.txq = &txqi->txq; txqi->txq.tid = 0; @@ -1503,23 +1507,17 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, struct fq *fq = &local->fq; struct ieee80211_vif *vif; struct txq_info *txqi; - struct ieee80211_sta *pubsta; if (!local->ops->wake_tx_queue || sdata->vif.type == NL80211_IFTYPE_MONITOR) return false; - if (sta && sta->uploaded) - pubsta = &sta->sta; - else - pubsta = NULL; - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); vif = &sdata->vif; - txqi = ieee80211_get_txq(local, vif, pubsta, skb); + txqi = ieee80211_get_txq(local, vif, sta, skb); if (!txqi) return false; @@ -3287,7 +3285,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); int hw_headroom = sdata->local->hw.extra_tx_headroom; struct ethhdr eth; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_info *info; struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; struct ieee80211_tx_data tx; ieee80211_tx_result r; @@ -3351,6 +3349,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN); memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN); + info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->band = fast_tx->band; info->control.vif = &sdata->vif; @@ -3573,6 +3572,115 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, rcu_read_unlock(); } +static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta) +{ + struct ethhdr *eth; + int err; + + err = skb_ensure_writable(skb, ETH_HLEN); + if (unlikely(err)) + return err; + + eth = (void *)skb->data; + ether_addr_copy(eth->h_dest, sta->sta.addr); + + return 0; +} + +static bool ieee80211_multicast_to_unicast(struct sk_buff *skb, + struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + const struct ethhdr *eth = (void *)skb->data; + const struct vlan_ethhdr *ethvlan = (void *)skb->data; + __be16 ethertype; + + if (likely(!is_multicast_ether_addr(eth->h_dest))) + return false; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + if (sdata->u.vlan.sta) + return false; + if (sdata->wdev.use_4addr) + return false; + /* fall through */ + case NL80211_IFTYPE_AP: + /* check runtime toggle for this bss */ + if (!sdata->bss->multicast_to_unicast) + return false; + break; + default: + return false; + } + + /* multicast to unicast conversion only for some 
payload */ + ethertype = eth->h_proto; + if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN) + ethertype = ethvlan->h_vlan_encapsulated_proto; + switch (ethertype) { + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + break; + default: + return false; + } + + return true; +} + +static void +ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev, + struct sk_buff_head *queue) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + const struct ethhdr *eth = (struct ethhdr *)skb->data; + struct sta_info *sta, *first = NULL; + struct sk_buff *cloned_skb; + + rcu_read_lock(); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata) + /* AP-VLAN mismatch */ + continue; + if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr))) + /* do not send back to source */ + continue; + if (!first) { + first = sta; + continue; + } + cloned_skb = skb_clone(skb, GFP_ATOMIC); + if (!cloned_skb) + goto multicast; + if (unlikely(ieee80211_change_da(cloned_skb, sta))) { + dev_kfree_skb(cloned_skb); + goto multicast; + } + __skb_queue_tail(queue, cloned_skb); + } + + if (likely(first)) { + if (unlikely(ieee80211_change_da(skb, first))) + goto multicast; + __skb_queue_tail(queue, skb); + } else { + /* no STA connected, drop */ + kfree_skb(skb); + skb = NULL; + } + + goto out; +multicast: + __skb_queue_purge(queue); + __skb_queue_tail(queue, skb); +out: + rcu_read_unlock(); +} + /** * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs * @skb: packet to be sent @@ -3583,7 +3691,17 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) { - __ieee80211_subif_start_xmit(skb, dev, 0); + if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) { + struct sk_buff_head queue; + + __skb_queue_head_init(&queue); + ieee80211_convert_to_unicast(skb, dev, &queue); + while ((skb = __skb_dequeue(&queue))) + __ieee80211_subif_start_xmit(skb, dev, 0); + } else { + __ieee80211_subif_start_xmit(skb, dev, 0); + } + return NETDEV_TX_OK; } @@ -4076,7 +4194,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, } if (ifmsh->sync_ops) - ifmsh->sync_ops->adjust_tbtt(sdata, beacon); + ifmsh->sync_ops->adjust_tsf(sdata, beacon); skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + @@ -4541,7 +4659,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, enum nl80211_band band) { - int ac = ieee802_1d_to_ac[tid & 7]; + int ac = ieee80211_ac_from_tid(tid); skb_reset_mac_header(skb); skb_set_queue_mapping(skb, ac); diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 6832bf6ab69f..19ec2189d3ac 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -436,14 +436,10 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum nl80211_band band) { - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; enum ieee80211_sta_rx_bandwidth new_bw; u32 changed = 0; u8 nss; - sband = local->hw.wiphy->bands[band]; - /* ignore - no support for BF yet */ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) return 0; @@ -527,8 +523,10 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band); - if (changed > 0) + if (changed > 0) { + ieee80211_recalc_min_chandef(sdata); 
rate_control_rate_update(local, sband, sta, changed); + } } void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index efa3f48f1ec5..73e8f347802e 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -293,7 +293,8 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); /* remove ICV */ - if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) + if (!(status->flag & RX_FLAG_ICV_STRIPPED) && + pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) return RX_DROP_UNUSABLE; } diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 8af6dd388d11..c1ef22df865f 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -294,7 +294,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; /* Trim ICV */ - skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN); + if (!(status->flag & RX_FLAG_ICV_STRIPPED)) + skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN); /* Remove IV */ memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 15fe97644ffe..3818686182b2 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -7,7 +7,9 @@ #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/mpls.h> +#include <linux/netconf.h> #include <linux/vmalloc.h> +#include <linux/percpu.h> #include <net/ip.h> #include <net/dst.h> #include <net/sock.h> @@ -17,8 +19,8 @@ #include <net/netns/generic.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> -#include <net/addrconf.h> #endif +#include <net/addrconf.h> #include <net/nexthop.h> #include "internal.h" @@ -48,11 +50,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) return rt; } -static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) -{ - return rcu_dereference_rtnl(dev->mpls_ptr); -} - bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); @@ -98,18 +95,44 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); -static u32 mpls_multipath_hash(struct mpls_route *rt, - struct sk_buff *skb, bool bos) +void mpls_stats_inc_outucastpkts(struct net_device *dev, + const struct sk_buff *skb) +{ + struct mpls_dev *mdev; + + if (skb->protocol == htons(ETH_P_MPLS_UC)) { + mdev = mpls_dev_get(dev); + if (mdev) + MPLS_INC_STATS_LEN(mdev, skb->len, + tx_packets, + tx_bytes); + } else if (skb->protocol == htons(ETH_P_IP)) { + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + struct inet6_dev *in6dev = __in6_dev_get(dev); + + if (in6dev) + IP6_UPD_PO_STATS(dev_net(dev), in6dev, + IPSTATS_MIB_OUT, skb->len); +#endif + } +} +EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts); + +static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) { struct mpls_entry_decoded dec; + unsigned int mpls_hdr_len = 0; struct mpls_shim_hdr *hdr; bool eli_seen = false; int label_index; u32 hash = 0; - for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos; + for (label_index = 0; label_index < MAX_MP_SELECT_LABELS; label_index++) { - if (!pskb_may_pull(skb, sizeof(*hdr) * label_index)) + mpls_hdr_len += sizeof(*hdr); + if (!pskb_may_pull(skb, mpls_hdr_len)) break; /* Read and decode the current label */ @@ -134,37 +157,38 @@ static u32 mpls_multipath_hash(struct mpls_route 
*rt, eli_seen = true; } - bos = dec.bos; - if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index + - sizeof(struct iphdr))) { + if (!dec.bos) + continue; + + /* found bottom label; does skb have room for a header? */ + if (pskb_may_pull(skb, mpls_hdr_len + sizeof(struct iphdr))) { const struct iphdr *v4hdr; - v4hdr = (const struct iphdr *)(mpls_hdr(skb) + - label_index); + v4hdr = (const struct iphdr *)(hdr + 1); if (v4hdr->version == 4) { hash = jhash_3words(ntohl(v4hdr->saddr), ntohl(v4hdr->daddr), v4hdr->protocol, hash); } else if (v4hdr->version == 6 && - pskb_may_pull(skb, sizeof(*hdr) * label_index + - sizeof(struct ipv6hdr))) { + pskb_may_pull(skb, mpls_hdr_len + + sizeof(struct ipv6hdr))) { const struct ipv6hdr *v6hdr; - v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) + - label_index); - + v6hdr = (const struct ipv6hdr *)(hdr + 1); hash = __ipv6_addr_jhash(&v6hdr->saddr, hash); hash = __ipv6_addr_jhash(&v6hdr->daddr, hash); hash = jhash_1word(v6hdr->nexthdr, hash); } } + + break; } return hash; } static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, - struct sk_buff *skb, bool bos) + struct sk_buff *skb) { int alive = ACCESS_ONCE(rt->rt_nhn_alive); u32 hash = 0; @@ -180,7 +204,7 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, if (alive <= 0) return NULL; - hash = mpls_multipath_hash(rt, skb, bos); + hash = mpls_multipath_hash(rt, skb); nh_index = hash % alive; if (alive == rt->rt_nhn) goto out; @@ -253,6 +277,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct mpls_nh *nh; struct mpls_entry_decoded dec; struct net_device *out_dev; + struct mpls_dev *out_mdev; struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; @@ -262,56 +287,66 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ mdev = mpls_dev_get(dev); - if (!mdev || !mdev->input_enabled) + if (!mdev) goto drop; - if (skb->pkt_type != PACKET_HOST) + MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets, + rx_bytes); + + if (!mdev->input_enabled) { + MPLS_INC_STATS(mdev, rx_dropped); goto drop; + } + + if (skb->pkt_type != PACKET_HOST) + goto err; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) - goto drop; + goto err; if (!pskb_may_pull(skb, sizeof(*hdr))) - goto drop; + goto err; /* Read and decode the label */ hdr = mpls_hdr(skb); dec = mpls_entry_decode(hdr); - /* Pop the label */ - skb_pull(skb, sizeof(*hdr)); - skb_reset_network_header(skb); - - skb_orphan(skb); - rt = mpls_route_input_rcu(net, dec.label); - if (!rt) + if (!rt) { + MPLS_INC_STATS(mdev, rx_noroute); goto drop; + } - nh = mpls_select_multipath(rt, skb, dec.bos); + nh = mpls_select_multipath(rt, skb); if (!nh) - goto drop; + goto err; - /* Find the output device */ - out_dev = rcu_dereference(nh->nh_dev); - if (!mpls_output_possible(out_dev)) - goto drop; + /* Pop the label */ + skb_pull(skb, sizeof(*hdr)); + skb_reset_network_header(skb); + + skb_orphan(skb); if (skb_warn_if_lro(skb)) - goto drop; + goto err; skb_forward_csum(skb); /* Verify ttl is valid */ if (dec.ttl <= 1) - goto drop; + goto err; dec.ttl -= 1; + /* Find the output device */ + out_dev = rcu_dereference(nh->nh_dev); + if (!mpls_output_possible(out_dev)) + goto tx_err; + /* Verify the destination can hold the packet */ new_header_size = mpls_nh_header_size(nh); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) - goto drop; + goto tx_err; hh_len = LL_RESERVED_SPACE(out_dev); if 
(!out_dev->header_ops) @@ -319,7 +354,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Ensure there is enough space for the headers in the skb */ if (skb_cow(skb, hh_len + new_header_size)) - goto drop; + goto tx_err; skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); @@ -327,7 +362,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(rt, skb, dec)) - goto drop; + goto err; } else { bool bos; int i; @@ -343,6 +378,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, } } + mpls_stats_inc_outucastpkts(out_dev, skb); + /* If via wasn't specified then send out using device address */ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, @@ -355,6 +392,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, __func__, err); return 0; +tx_err: + out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; + if (out_mdev) + MPLS_INC_STATS(out_mdev, tx_errors); + goto drop; +err: + MPLS_INC_STATS(mdev, rx_errors); drop: kfree_skb(skb); return NET_RX_DROP; @@ -853,15 +897,279 @@ errout: return err; } +static void mpls_get_stats(struct mpls_dev *mdev, + struct mpls_link_stats *stats) +{ + struct mpls_pcpu_stats *p; + int i; + + memset(stats, 0, sizeof(*stats)); + + for_each_possible_cpu(i) { + struct mpls_link_stats local; + unsigned int start; + + p = per_cpu_ptr(mdev->stats, i); + do { + start = u64_stats_fetch_begin(&p->syncp); + local = p->stats; + } while (u64_stats_fetch_retry(&p->syncp, start)); + + stats->rx_packets += local.rx_packets; + stats->rx_bytes += local.rx_bytes; + stats->tx_packets += local.tx_packets; + stats->tx_bytes += local.tx_bytes; + stats->rx_errors += local.rx_errors; + stats->tx_errors += local.tx_errors; + stats->rx_dropped += local.rx_dropped; + stats->tx_dropped += local.tx_dropped; + stats->rx_noroute += local.rx_noroute; + } +} + +static int mpls_fill_stats_af(struct sk_buff *skb, + const struct net_device *dev) +{ + struct mpls_link_stats *stats; + struct mpls_dev *mdev; + struct nlattr *nla; + + mdev = mpls_dev_get(dev); + if (!mdev) + return -ENODATA; + + nla = nla_reserve_64bit(skb, MPLS_STATS_LINK, + sizeof(struct mpls_link_stats), + MPLS_STATS_UNSPEC); + if (!nla) + return -EMSGSIZE; + + stats = nla_data(nla); + mpls_get_stats(mdev, stats); + + return 0; +} + +static size_t mpls_get_stats_af_size(const struct net_device *dev) +{ + struct mpls_dev *mdev; + + mdev = mpls_dev_get(dev); + if (!mdev) + return 0; + + return nla_total_size_64bit(sizeof(struct mpls_link_stats)); +} + +static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev, + u32 portid, u32 seq, int event, + unsigned int flags, int type) +{ + struct nlmsghdr *nlh; + struct netconfmsg *ncm; + bool all = false; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), + flags); + if (!nlh) + return -EMSGSIZE; + + if (type == NETCONFA_ALL) + all = true; + + ncm = nlmsg_data(nlh); + ncm->ncm_family = AF_MPLS; + + if (nla_put_s32(skb, NETCONFA_IFINDEX, mdev->dev->ifindex) < 0) + goto nla_put_failure; + + if ((all || type == NETCONFA_INPUT) && + nla_put_s32(skb, NETCONFA_INPUT, + mdev->input_enabled) < 0) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mpls_netconf_msgsize_devconf(int type) +{ + int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + + 
nla_total_size(4); /* NETCONFA_IFINDEX */ + bool all = false; + + if (type == NETCONFA_ALL) + all = true; + + if (all || type == NETCONFA_INPUT) + size += nla_total_size(4); + + return size; +} + +static void mpls_netconf_notify_devconf(struct net *net, int type, + struct mpls_dev *mdev) +{ + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(mpls_netconf_msgsize_devconf(type), GFP_KERNEL); + if (!skb) + goto errout; + + err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, RTM_NEWNETCONF, + 0, type); + if (err < 0) { + /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err); +} + +static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = { + [NETCONFA_IFINDEX] = { .len = sizeof(int) }, +}; + +static int mpls_netconf_get_devconf(struct sk_buff *in_skb, + struct nlmsghdr *nlh) +{ + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NETCONFA_MAX + 1]; + struct netconfmsg *ncm; + struct net_device *dev; + struct mpls_dev *mdev; + struct sk_buff *skb; + int ifindex; + int err; + + err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, + devconf_mpls_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + if (!tb[NETCONFA_IFINDEX]) + goto errout; + + ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); + dev = __dev_get_by_index(net, ifindex); + if (!dev) + goto errout; + + mdev = mpls_dev_get(dev); + if (!mdev) + goto errout; + + err = -ENOBUFS; + skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); + if (!skb) + goto errout; + + err = mpls_netconf_fill_devconf(skb, mdev, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, RTM_NEWNETCONF, 0, + NETCONFA_ALL); + if (err < 0) { + /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +errout: + return err; +} + +static int mpls_netconf_dump_devconf(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + struct net_device *dev; + struct mpls_dev *mdev; + int idx, s_idx; + int h, s_h; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + cb->seq = net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + mdev = mpls_dev_get(dev); + if (!mdev) + goto cont; + if (mpls_netconf_fill_devconf(skb, mdev, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + NETCONFA_ALL) < 0) { + rcu_read_unlock(); + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + rcu_read_unlock(); + } +done: + cb->args[0] = h; + cb->args[1] = idx; + + return skb->len; +} + #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ (&((struct mpls_dev *)0)->field) +static int mpls_conf_proc(struct ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int oval = *(int *)ctl->data; + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write) { + struct mpls_dev *mdev = ctl->extra1; + int i = (int *)ctl->data - (int *)mdev; + struct net *net = ctl->extra2; + int val = *(int *)ctl->data; + + if (i == offsetof(struct mpls_dev, input_enabled) && + val != oval) { + 
mpls_netconf_notify_devconf(net, + NETCONFA_INPUT, + mdev); + } + } + + return ret; +} + static const struct ctl_table mpls_dev_table[] = { { .procname = "input", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, { } @@ -871,6 +1179,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, struct mpls_dev *mdev) { char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + struct net *net = dev_net(dev); struct ctl_table *table; int i; @@ -881,8 +1190,11 @@ static int mpls_dev_sysctl_register(struct net_device *dev, /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ - for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) + for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) { table[i].data = (char *)mdev + (uintptr_t)table[i].data; + table[i].extra1 = mdev; + table[i].extra2 = net; + } snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); @@ -911,6 +1223,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) { struct mpls_dev *mdev; int err = -ENOMEM; + int i; ASSERT_RTNL(); @@ -918,19 +1231,40 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) if (!mdev) return ERR_PTR(err); + mdev->stats = alloc_percpu(struct mpls_pcpu_stats); + if (!mdev->stats) + goto free; + + for_each_possible_cpu(i) { + struct mpls_pcpu_stats *mpls_stats; + + mpls_stats = per_cpu_ptr(mdev->stats, i); + u64_stats_init(&mpls_stats->syncp); + } + err = mpls_dev_sysctl_register(dev, mdev); if (err) goto free; + mdev->dev = dev; rcu_assign_pointer(dev->mpls_ptr, mdev); return mdev; free: + free_percpu(mdev->stats); kfree(mdev); return ERR_PTR(err); } +static void mpls_dev_destroy_rcu(struct rcu_head *head) +{ + struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu); + + free_percpu(mdev->stats); + kfree(mdev); +} + static void mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; @@ -1045,7 +1379,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, if (mdev) { mpls_dev_sysctl_unregister(mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); - kfree_rcu(mdev, rcu); + call_rcu(&mdev->rcu, mpls_dev_destroy_rcu); } break; case NETDEV_CHANGENAME: @@ -1706,6 +2040,12 @@ static struct pernet_operations mpls_net_ops = { .exit = mpls_net_exit, }; +static struct rtnl_af_ops mpls_af_ops __read_mostly = { + .family = AF_MPLS, + .fill_stats_af = mpls_fill_stats_af, + .get_stats_af_size = mpls_get_stats_af_size, +}; + static int __init mpls_init(void) { int err; @@ -1722,9 +2062,13 @@ static int __init mpls_init(void) dev_add_pack(&mpls_packet_type); + rtnl_af_register(&mpls_af_ops); + rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); + rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, + mpls_netconf_dump_devconf, NULL); err = 0; out: return err; @@ -1738,6 +2082,7 @@ module_init(mpls_init); static void __exit mpls_exit(void) { rtnl_unregister_all(PF_MPLS); + rtnl_af_unregister(&mpls_af_ops); dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); diff --git a/net/mpls/internal.h b/net/mpls/internal.h index bdfef6c3271a..76360d8b9579 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -9,13 +9,58 @@ struct mpls_entry_decoded { u8 bos; 
}; +struct mpls_pcpu_stats { + struct mpls_link_stats stats; + struct u64_stats_sync syncp; +}; + struct mpls_dev { - int input_enabled; + int input_enabled; + struct net_device *dev; + struct mpls_pcpu_stats __percpu *stats; - struct ctl_table_header *sysctl; - struct rcu_head rcu; + struct ctl_table_header *sysctl; + struct rcu_head rcu; }; +#if BITS_PER_LONG == 32 + +#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ + do { \ + __typeof__(*(mdev)->stats) *ptr = \ + raw_cpu_ptr((mdev)->stats); \ + local_bh_disable(); \ + u64_stats_update_begin(&ptr->syncp); \ + ptr->stats.pkts_field++; \ + ptr->stats.bytes_field += (len); \ + u64_stats_update_end(&ptr->syncp); \ + local_bh_enable(); \ + } while (0) + +#define MPLS_INC_STATS(mdev, field) \ + do { \ + __typeof__(*(mdev)->stats) *ptr = \ + raw_cpu_ptr((mdev)->stats); \ + local_bh_disable(); \ + u64_stats_update_begin(&ptr->syncp); \ + ptr->stats.field++; \ + u64_stats_update_end(&ptr->syncp); \ + local_bh_enable(); \ + } while (0) + +#else + +#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ + do { \ + this_cpu_inc((mdev)->stats->stats.pkts_field); \ + this_cpu_add((mdev)->stats->stats.bytes_field, (len)); \ + } while (0) + +#define MPLS_INC_STATS(mdev, field) \ + this_cpu_inc((mdev)->stats->stats.field) + +#endif + struct sk_buff; #define LABEL_NOT_SPECIFIED (1 << 20) @@ -114,6 +159,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } +static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->mpls_ptr); +} + int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels, @@ -123,5 +173,7 @@ int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); +void mpls_stats_inc_outucastpkts(struct net_device *dev, + const struct sk_buff *skb); #endif /* MPLS_INTERNAL_H */ diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 2f7ccd934416..e4e4424f9eb1 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -48,11 +48,15 @@ static int mpls_xmit(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; + struct mpls_dev *out_mdev; int err = 0; bool bos; int i; unsigned int ttl; + /* Find the output device */ + out_dev = dst->dev; + /* Obtain the ttl */ if (dst->ops->family == AF_INET) { ttl = ip_hdr(skb)->ttl; @@ -66,8 +70,6 @@ static int mpls_xmit(struct sk_buff *skb) skb_orphan(skb); - /* Find the output device */ - out_dev = dst->dev; if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; @@ -109,6 +111,8 @@ static int mpls_xmit(struct sk_buff *skb) bos = false; } + mpls_stats_inc_outucastpkts(out_dev, skb); + if (rt) err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, skb); @@ -122,11 +126,14 @@ static int mpls_xmit(struct sk_buff *skb) return LWTUNNEL_XMIT_DONE; drop: + out_mdev = out_dev ? 
mpls_dev_get(out_dev) : NULL; + if (out_mdev) + MPLS_INC_STATS(out_mdev, tx_errors); kfree_skb(skb); return -EINVAL; } -static int mpls_build_state(struct net_device *dev, struct nlattr *nla, +static int mpls_build_state(struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts) { @@ -215,6 +222,7 @@ static const struct lwtunnel_encap_ops mpls_iptun_ops = { .fill_encap = mpls_fill_encap_info, .get_encap_size = mpls_encap_nlsize, .cmp_encap = mpls_encap_cmp, + .owner = THIS_MODULE, }; static int __init mpls_iptunnel_init(void) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 63729b489c2c..9b28864cc36a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -162,6 +162,7 @@ config NF_CT_PROTO_SCTP bool 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default y + select LIBCRC32C help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -397,7 +398,6 @@ config NF_NAT_PROTO_SCTP bool default NF_NAT && NF_CT_PROTO_SCTP depends on NF_NAT && NF_CT_PROTO_SCTP - select LIBCRC32C config NF_NAT_AMANDA tristate @@ -467,10 +467,10 @@ config NF_TABLES_NETDEV This option enables support for the "netdev" table. config NFT_EXTHDR - tristate "Netfilter nf_tables IPv6 exthdr module" + tristate "Netfilter nf_tables exthdr module" help This option adds the "exthdr" expression that you can use to match - IPv6 extension headers. + IPv6 extension headers and tcp options. config NFT_META tristate "Netfilter nf_tables meta module" @@ -494,7 +494,7 @@ config NFT_CT depends on NF_CONNTRACK tristate "Netfilter nf_tables conntrack module" help - This option adds the "meta" expression that you can use to match + This option adds the "ct" expression that you can use to match connection tracking information such as the flow state. config NFT_SET_RBTREE @@ -509,6 +509,12 @@ config NFT_SET_HASH This option adds the "hash" set type that is used to build one-way mappings between matchings and actions. +config NFT_SET_BITMAP + tristate "Netfilter nf_tables bitmap set module" + help + This option adds the "bitmap" set type that is used to build sets + whose keys are smaller or equal to 16 bits. 
+ config NFT_COUNTER tristate "Netfilter nf_tables counter module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index ca30d1960f1d..c9b78e7b342f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -7,7 +7,6 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o -nf_conntrack-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o obj-$(CONFIG_NETFILTER) = netfilter.o @@ -47,7 +46,6 @@ nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ # NAT protocols (nf_nat) nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o -nf_nat-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o # generic transport layer logging obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o @@ -95,6 +93,7 @@ obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o +obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index ce6adfae521a..a87a6f8a74d8 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -375,7 +375,7 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { void (*attach)(struct sk_buff *, const struct sk_buff *); - if (skb->nfct) { + if (skb->_nfct) { rcu_read_lock(); attach = rcu_dereference(ip_ct_attach); if (attach) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 1b05d4a7d5a1..f236c0bc7b3f 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -897,7 +897,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, continue; data = ahash_data(n, j, dsize); memcpy(tmp->value + k * dsize, data, dsize); - set_bit(j, tmp->used); + set_bit(k, tmp->used); k++; } tmp->pos = k; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 51077c53d76b..178d4eba013b 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -260,11 +260,14 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, else prev = e; } + + /* If before/after is used on an empty set */ + if ((d->before > 0 && !next) || + (d->before < 0 && !prev)) + return -IPSET_ERR_REF_EXIST; + /* Re-add already existing element */ if (n) { - if ((d->before > 0 && !next) || - (d->before < 0 && !prev)) - return -IPSET_ERR_REF_EXIST; if (!flag_exist) return -IPSET_ERR_EXIST; /* Update extensions */ diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 096a45103f14..e6a2753dff9e 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1429,7 +1429,7 @@ int __init ip_vs_conn_init(void) "(size=%d, memory=%ldKbytes)\n", ip_vs_conn_tab_size, (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); - IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); for (idx = 0; idx < ip_vs_conn_tab_size; idx++) diff --git 
a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 55e0169caa4c..5aeb0dde6ccc 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -426,10 +426,9 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); - if (svc == NULL - && protocol == IPPROTO_TCP - && atomic_read(&ipvs->ftpsvc_counter) - && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + if (!svc && protocol == IPPROTO_TCP && + atomic_read(&ipvs->ftpsvc_counter) && + (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. @@ -711,7 +710,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, dest->vport == svc->port))) { /* HIT */ list_del(&dest->t_list); - ip_vs_dest_hold(dest); goto out; } } @@ -741,7 +739,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest) * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, and the refcnt of each destination in the trash must - * be 0, so we simply release them here. + * be 1, so we simply release them here. */ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { @@ -1080,11 +1078,10 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, if (list_empty(&ipvs->dest_trash) && !cleanup) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); - /* dest lives in trash without reference */ + /* dest lives in trash with reference */ list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); - ip_vs_dest_put(dest); } @@ -1160,7 +1157,7 @@ static void ip_vs_dest_trash_expire(unsigned long data) spin_lock(&ipvs->dest_trash_lock); list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { - if (atomic_read(&dest->refcnt) > 0) + if (atomic_read(&dest->refcnt) > 1) continue; if (dest->idle_start) { if (time_before(now, dest->idle_start + diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 6be5c538b71e..75f798f8e83b 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -163,7 +163,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc) return -ENOMEM; svc->sched_data = s; - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); @@ -183,7 +183,7 @@ static void ip_vs_dh_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(s, rcu_head); - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index cccf4d637412..5824927cf8e0 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -356,7 +356,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) return -ENOMEM; svc->sched_data = tbl; - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(*tbl)); /* @@ -393,7 +393,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(tbl, rcu_head); - 
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n", sizeof(*tbl)); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 796d70e47ddd..703f11877bee 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -519,7 +519,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) return -ENOMEM; svc->sched_data = tbl; - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(*tbl)); /* @@ -556,7 +556,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(tbl, rcu_head); - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n", sizeof(*tbl)); } diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 1e373a5e44e3..16aaac6eedc9 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -239,7 +239,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc) return -ENOMEM; svc->sched_data = s; - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); @@ -259,7 +259,7 @@ static void ip_vs_sh_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(s, rcu_head); - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 9350530c16c1..b03c28084f81 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1791,7 +1791,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, u16 mtu, min_mtu; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); if (!ipvs->sync_state) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3a073cd9fcf4..071b97fcbefb 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -85,11 +85,11 @@ static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; /* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ -#define GC_MAX_BUCKETS_DIV 64u -/* upper bound of scan intervals */ -#define GC_INTERVAL_MAX (2 * HZ) -/* maximum conntracks to evict per gc run */ -#define GC_MAX_EVICTS 256u +#define GC_MAX_BUCKETS_DIV 128u +/* upper bound of full table scan */ +#define GC_MAX_SCAN_JIFFIES (16u * HZ) +/* desired ratio of entries found to be expired */ +#define GC_EVICT_RATIO 50u static struct conntrack_gc_work conntrack_gc_work; @@ -350,16 +350,31 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) spin_unlock(&pcpu->lock); } +#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) + /* Released via destroy_conntrack() */ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags) { - struct nf_conn *tmpl; + struct nf_conn *tmpl, *p; - tmpl = kzalloc(sizeof(*tmpl), flags); - if (tmpl == 
NULL) - return NULL; + if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { + tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); + if (!tmpl) + return NULL; + + p = tmpl; + tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); + if (tmpl != p) { + tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); + tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; + } + } else { + tmpl = kzalloc(sizeof(*tmpl), flags); + if (!tmpl) + return NULL; + } tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); @@ -374,7 +389,11 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); nf_ct_ext_free(tmpl); - kfree(tmpl); + + if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) + kfree((char *)tmpl - tmpl->proto.tmpl_padto); + else + kfree(tmpl); } EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); @@ -686,12 +705,12 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, !nfct_nat(ct) && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { - nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); - nf_conntrack_put(skb->nfct); - /* Assign conntrack already in hashes to this skbuff. Don't - * modify skb->nfctinfo to ensure consistent stateful filtering. - */ - skb->nfct = &ct->ct_general; + enum ip_conntrack_info oldinfo; + struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); + + nf_ct_acct_merge(ct, ctinfo, loser_ct); + nf_conntrack_put(&loser_ct->ct_general); + nf_ct_set(skb, ct, oldinfo); return NF_ACCEPT; } NF_CT_STAT_INC(net, drop); @@ -938,6 +957,7 @@ static noinline int early_drop(struct net *net, unsigned int _hash) static void gc_worker(struct work_struct *work) { + unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); unsigned int i, goal, buckets = 0, expired_count = 0; struct conntrack_gc_work *gc_work; unsigned int ratio, scanned = 0; @@ -979,8 +999,7 @@ static void gc_worker(struct work_struct *work) */ rcu_read_unlock(); cond_resched_rcu_qs(); - } while (++buckets < goal && - expired_count < GC_MAX_EVICTS); + } while (++buckets < goal); if (gc_work->exiting) return; @@ -997,27 +1016,25 @@ static void gc_worker(struct work_struct *work) * 1. Minimize time until we notice a stale entry * 2. Maximize scan intervals to not waste cycles * - * Normally, expired_count will be 0, this increases the next_run time - * to priorize 2) above. + * Normally, expire ratio will be close to 0. * - * As soon as a timed-out entry is found, move towards 1) and increase - * the scan frequency. - * In case we have lots of evictions next scan is done immediately. + * As soon as a sizeable fraction of the entries have expired + * increase scan frequency. */ ratio = scanned ? 
expired_count * 100 / scanned : 0; - if (ratio >= 90 || expired_count == GC_MAX_EVICTS) { - gc_work->next_gc_run = 0; - next_run = 0; - } else if (expired_count) { - gc_work->next_gc_run /= 2U; - next_run = msecs_to_jiffies(1); + if (ratio > GC_EVICT_RATIO) { + gc_work->next_gc_run = min_interval; } else { - if (gc_work->next_gc_run < GC_INTERVAL_MAX) - gc_work->next_gc_run += msecs_to_jiffies(1); + unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; + + BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); - next_run = gc_work->next_gc_run; + gc_work->next_gc_run += min_interval; + if (gc_work->next_gc_run > max) + gc_work->next_gc_run = max; } + next_run = gc_work->next_gc_run; gc_work->last_bucket = i; queue_delayed_work(system_long_wq, &gc_work->dwork, next_run); } @@ -1025,7 +1042,7 @@ static void gc_worker(struct work_struct *work) static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); - gc_work->next_gc_run = GC_INTERVAL_MAX; + gc_work->next_gc_run = HZ; gc_work->exiting = false; } @@ -1220,7 +1237,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; } -/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */ static inline struct nf_conn * resolve_normal_ct(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, @@ -1279,8 +1296,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, } *set_reply = 0; } - skb->nfct = &ct->ct_general; - skb->nfctinfo = *ctinfo; + nf_ct_set(skb, ct, *ctinfo); return ct; } @@ -1288,7 +1304,7 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { - struct nf_conn *ct, *tmpl = NULL; + struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -1298,14 +1314,14 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, int set_reply = 0; int ret; - if (skb->nfct) { + tmpl = nf_ct_get(skb, &ctinfo); + if (tmpl) { /* Previously seen (loopback or untracked)? Ignore. */ - tmpl = (struct nf_conn *)skb->nfct; if (!nf_ct_is_template(tmpl)) { NF_CT_STAT_INC_ATOMIC(net, ignore); return NF_ACCEPT; } - skb->nfct = NULL; + skb->_nfct = 0; } /* rcu_read_lock()ed by nf_hook_thresh */ @@ -1326,8 +1342,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, * inverse of the return code tells to the netfilter * core what to do with the packet. */ if (l4proto->error != NULL) { - ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo, - pf, hooknum); + ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum); if (ret <= 0) { NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); @@ -1335,7 +1350,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, goto out; } /* ICMP[v6] protocol trackers may assign one conntrack. */ - if (skb->nfct) + if (skb->_nfct) goto out; } repeat: @@ -1355,7 +1370,7 @@ repeat: goto out; } - NF_CT_ASSERT(skb->nfct); + NF_CT_ASSERT(skb_nfct(skb)); /* Decide what timeout policy we want to apply to this flow. 
*/ timeouts = nf_ct_timeout_lookup(net, ct, l4proto); @@ -1365,8 +1380,8 @@ repeat: /* Invalid: inverse of the return code tells * the netfilter core what to do */ pr_debug("nf_conntrack_in: Can't track with proto module\n"); - nf_conntrack_put(skb->nfct); - skb->nfct = NULL; + nf_conntrack_put(&ct->ct_general); + skb->_nfct = 0; NF_CT_STAT_INC_ATOMIC(net, invalid); if (ret == -NF_DROP) NF_CT_STAT_INC_ATOMIC(net, drop); @@ -1524,9 +1539,8 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) ctinfo = IP_CT_RELATED; /* Attach to new skbuff, and increment count */ - nskb->nfct = &ct->ct_general; - nskb->nfctinfo = ctinfo; - nf_conntrack_get(nskb->nfct); + nf_ct_set(nskb, ct, ctinfo); + nf_conntrack_get(skb_nfct(nskb)); } /* Bring out ya dead! */ @@ -1862,7 +1876,8 @@ int nf_conntrack_init_start(void) nf_conntrack_max = max_factor * nf_conntrack_htable_size; nf_conntrack_cachep = kmem_cache_create("nf_conntrack", - sizeof(struct nf_conn), 0, + sizeof(struct nf_conn), + NFCT_INFOMASK + 1, SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; @@ -1917,7 +1932,7 @@ int nf_conntrack_init_start(void) nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); conntrack_gc_work_init(&conntrack_gc_work); - queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, GC_INTERVAL_MAX); + queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ); return 0; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index f8dbacf66795..4b2e1fb28bb4 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -353,7 +353,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp) } EXPORT_SYMBOL_GPL(nf_ct_expect_put); -static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) +static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; @@ -380,7 +380,6 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) add_timer(&exp->timeout); NF_CT_STAT_INC(net, expect_create); - return 0; } /* Race with expectations being used means we could have none to find; OK. */ @@ -411,7 +410,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) struct net *net = nf_ct_exp_net(expect); struct hlist_node *next; unsigned int h; - int ret = 1; + int ret = 0; if (!master_help) { ret = -ESHUTDOWN; @@ -461,15 +460,14 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect); - if (ret <= 0) - goto out; - - ret = nf_ct_expect_insert(expect); if (ret < 0) goto out; + + nf_ct_expect_insert(expect); + spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); - return ret; + return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); return ret; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index e3ed20060878..4aecef4a89fb 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -300,7 +300,7 @@ static int find_pattern(const char *data, size_t dlen, { size_t i = plen; - pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); + pr_debug("find_pattern `%s': dlen = %zu\n", pattern, dlen); if (dlen <= plen) { /* Short packet: try for partial? 
*/ diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 7341adf7059d..6dc44d9b4190 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -188,6 +188,26 @@ nf_ct_helper_ext_add(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); +static struct nf_conntrack_helper * +nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) +{ + if (!net->ct.sysctl_auto_assign_helper) { + if (net->ct.auto_assign_helper_warned) + return NULL; + if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) + return NULL; + pr_info("nf_conntrack: default automatic helper assignment " + "has been turned off for security reasons and CT-based " + " firewall rule not found. Use the iptables CT target " + "to attach helpers instead.\n"); + net->ct.auto_assign_helper_warned = 1; + return NULL; + } + + return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); +} + + int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { @@ -213,21 +233,14 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } help = nfct_help(ct); - if (net->ct.sysctl_auto_assign_helper && helper == NULL) { - helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - if (unlikely(!net->ct.auto_assign_helper_warned && helper)) { - pr_info("nf_conntrack: automatic helper " - "assignment is deprecated and it will " - "be removed soon. Use the iptables CT target " - "to attach helpers instead.\n"); - net->ct.auto_assign_helper_warned = true; - } - } if (helper == NULL) { - if (help) - RCU_INIT_POINTER(help->helper, NULL); - return 0; + helper = nf_ct_lookup_helper(ct, net); + if (helper == NULL) { + if (help) + RCU_INIT_POINTER(help->helper, NULL); + return 0; + } } if (help == NULL) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 27540455dc62..6806b5e73567 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1478,14 +1478,23 @@ static int ctnetlink_change_helper(struct nf_conn *ct, struct nlattr *helpinfo = NULL; int err; - /* don't change helper of sibling connections */ - if (ct->master) - return -EBUSY; - err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); if (err < 0) return err; + /* don't change helper of sibling connections */ + if (ct->master) { + /* If we try to change the helper to the same thing twice, + * treat the second attempt as a no-op instead of returning + * an error. + */ + if (help && help->helper && + !strcmp(help->helper->name, helpname)) + return 0; + else + return -EBUSY; + } + if (!strcmp(helpname, "")) { if (help && help->helper) { /* we had a helper before ... */ @@ -2270,6 +2279,30 @@ nla_put_failure: } static int +ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); + unsigned long d = ct->status ^ status; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EBUSY; + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EBUSY; + + /* This check is less strict than ctnetlink_change_status() + * because callers often flip IPS_EXPECTED bits when sending + * an NFQA_CT attribute to the kernel. So ignore the + * unchangeable bits but do not error out. 
+ */ + ct->status = (status & ~IPS_UNCHANGEABLE_MASK) | + (ct->status & IPS_UNCHANGEABLE_MASK); + return 0; +} + +static int ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) { int err; @@ -2280,7 +2313,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) return err; } if (cda[CTA_STATUS]) { - err = ctnetlink_change_status(ct, cda); + err = ctnetlink_update_status(ct, cda); if (err < 0) return err; } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b68ce6ac13b3..93dd1c5b7bff 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -561,7 +561,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, static int dccp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { struct dccp_hdr _dh, *dh; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index a0efde38da44..33279aab583d 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -22,7 +22,9 @@ #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/interrupt.h> +#include <net/sctp/checksum.h> +#include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> @@ -505,6 +507,34 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } +static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, + unsigned int dataoff, + u8 pf, unsigned int hooknum) +{ + const struct sctphdr *sh; + struct sctphdr _sctph; + const char *logmsg; + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (!sh) { + logmsg = "nf_ct_sctp: short packet "; + goto out_invalid; + } + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + skb->ip_summed == CHECKSUM_NONE) { + if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { + logmsg = "nf_ct_sctp: bad CRC "; + goto out_invalid; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return NF_ACCEPT; +out_invalid: + if (LOG_INVALID(net, IPPROTO_SCTP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", logmsg); + return -NF_ACCEPT; +} + #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> @@ -752,6 +782,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .packet = sctp_packet, .get_timeouts = sctp_get_timeouts, .new = sctp_new, + .error = sctp_error, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = sctp_to_nlattr, @@ -786,6 +817,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .packet = sctp_packet, .get_timeouts = sctp_get_timeouts, .new = sctp_new, + .error = sctp_error, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = sctp_to_nlattr, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 69f687740c76..b122e9dacfed 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -750,7 +750,6 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| static int tcp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { diff --git 
a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 20f35ed68030..f6ebce6178ca 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -108,8 +108,60 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } +#ifdef CONFIG_NF_CT_PROTO_UDPLITE +static int udplite_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + u8 pf, unsigned int hooknum) +{ + unsigned int udplen = skb->len - dataoff; + const struct udphdr *hdr; + struct udphdr _hdr; + unsigned int cscov; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (!hdr) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: short packet "); + return -NF_ACCEPT; + } + + cscov = ntohs(hdr->len); + if (cscov == 0) { + cscov = udplen; + } else if (cscov < sizeof(*hdr) || cscov > udplen) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: invalid checksum coverage "); + return -NF_ACCEPT; + } + + /* UDPLITE mandates checksums */ + if (!hdr->check) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: checksum missing "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, + pf)) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: bad UDPLite checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} +#endif + static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int dataoff, enum ip_conntrack_info *ctinfo, + unsigned int dataoff, u_int8_t pf, unsigned int hooknum) { @@ -290,6 +342,41 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); +#ifdef CONFIG_NF_CT_PROTO_UDPLITE +struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_UDPLITE, + .name = "udplite", + .allow_clash = true, + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .packet = udp_packet, + .get_timeouts = udp_get_timeouts, + .new = udp_new, + .error = udplite_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udp_timeout_nlattr_to_obj, + .obj_to_nlattr = udp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDP_MAX, + .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, + .nla_policy = udp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = udp_init_net, + .get_net_proto = udp_get_net_proto, +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4); +#endif + struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = { .l3proto = PF_INET6, @@ -322,3 +409,38 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .get_net_proto = udp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); + +#ifdef CONFIG_NF_CT_PROTO_UDPLITE +struct nf_conntrack_l4proto 
nf_conntrack_l4proto_udplite6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_UDPLITE, + .name = "udplite", + .allow_clash = true, + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .packet = udp_packet, + .get_timeouts = udp_get_timeouts, + .new = udp_new, + .error = udplite_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udp_timeout_nlattr_to_obj, + .obj_to_nlattr = udp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDP_MAX, + .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, + .nla_policy = udp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = udp_init_net, + .get_net_proto = udp_get_net_proto, +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); +#endif diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c deleted file mode 100644 index c35f7bf05d8c..000000000000 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ /dev/null @@ -1,324 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2007 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/udp.h> -#include <linux/seq_file.h> -#include <linux/skbuff.h> -#include <linux/ipv6.h> -#include <net/ip6_checksum.h> -#include <net/checksum.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv6.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_ecache.h> -#include <net/netfilter/nf_log.h> - -static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = { - [UDPLITE_CT_UNREPLIED] = 30*HZ, - [UDPLITE_CT_REPLIED] = 180*HZ, -}; - -static inline struct nf_udplite_net *udplite_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.udplite; -} - -static bool udplite_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct net *net, - struct nf_conntrack_tuple *tuple) -{ - const struct udphdr *hp; - struct udphdr _hdr; - - /* Actually only need first 4 bytes to get ports. */ - hp = skb_header_pointer(skb, dataoff, 4, &_hdr); - if (hp == NULL) - return false; - - tuple->src.u.udp.port = hp->source; - tuple->dst.u.udp.port = hp->dest; - return true; -} - -static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.udp.port = orig->dst.u.udp.port; - tuple->dst.u.udp.port = orig->src.u.udp.port; - return true; -} - -/* Print out the per-protocol part of the tuple. 
*/ -static void udplite_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.udp.port), - ntohs(tuple->dst.u.udp.port)); -} - -static unsigned int *udplite_get_timeouts(struct net *net) -{ - return udplite_pernet(net)->timeouts; -} - -/* Returns verdict for packet, and may modify conntracktype */ -static int udplite_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - u_int8_t pf, - unsigned int hooknum, - unsigned int *timeouts) -{ - /* If we've seen traffic both ways, this is some kind of UDP - stream. Extend timeout. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDPLITE_CT_REPLIED]); - /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_ASSURED, ct); - } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDPLITE_CT_UNREPLIED]); - } - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) -{ - return true; -} - -static int udplite_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info *ctinfo, - u_int8_t pf, - unsigned int hooknum) -{ - unsigned int udplen = skb->len - dataoff; - const struct udphdr *hdr; - struct udphdr _hdr; - unsigned int cscov; - - /* Header is too small? */ - hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hdr == NULL) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: short packet "); - return -NF_ACCEPT; - } - - cscov = ntohs(hdr->len); - if (cscov == 0) - cscov = udplen; - else if (cscov < sizeof(*hdr) || cscov > udplen) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: invalid checksum coverage "); - return -NF_ACCEPT; - } - - /* UDPLITE mandates checksums */ - if (!hdr->check) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: checksum missing "); - return -NF_ACCEPT; - } - - /* Checksum invalid? Ignore. */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, - pf)) { - if (LOG_INVALID(net, IPPROTO_UDPLITE)) - nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, - "nf_ct_udplite: bad UDPLite checksum "); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_cttimeout.h> - -static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - unsigned int *timeouts = data; - struct nf_udplite_net *un = udplite_pernet(net); - - /* set default timeouts for UDPlite. 
*/ - timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED]; - timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED]; - - if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) { - timeouts[UDPLITE_CT_UNREPLIED] = - ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ; - } - if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) { - timeouts[UDPLITE_CT_REPLIED] = - ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ; - } - return 0; -} - -static int -udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeouts = data; - - if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED, - htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) || - nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED, - htonl(timeouts[UDPLITE_CT_REPLIED] / HZ))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = { - [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 }, - [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - -#ifdef CONFIG_SYSCTL -static struct ctl_table udplite_sysctl_table[] = { - { - .procname = "nf_conntrack_udplite_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "nf_conntrack_udplite_timeout_stream", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_udplite_net *un) -{ -#ifdef CONFIG_SYSCTL - if (pn->ctl_table) - return 0; - - pn->ctl_table = kmemdup(udplite_sysctl_table, - sizeof(udplite_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED]; - pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED]; -#endif - return 0; -} - -static int udplite_init_net(struct net *net, u_int16_t proto) -{ - struct nf_udplite_net *un = udplite_pernet(net); - struct nf_proto_net *pn = &un->pn; - - if (!pn->users) { - int i; - - for (i = 0 ; i < UDPLITE_CT_MAX; i++) - un->timeouts[i] = udplite_timeouts[i]; - } - - return udplite_kmemdup_sysctl_table(pn, un); -} - -struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = -{ - .l3proto = PF_INET, - .l4proto = IPPROTO_UDPLITE, - .name = "udplite", - .allow_clash = true, - .pkt_to_tuple = udplite_pkt_to_tuple, - .invert_tuple = udplite_invert_tuple, - .print_tuple = udplite_print_tuple, - .packet = udplite_packet, - .get_timeouts = udplite_get_timeouts, - .new = udplite_new, - .error = udplite_error, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - .ctnl_timeout = { - .nlattr_to_obj = udplite_timeout_nlattr_to_obj, - .obj_to_nlattr = udplite_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX, - .obj_size = sizeof(unsigned int) * - CTA_TIMEOUT_UDPLITE_MAX, - .nla_policy = udplite_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .init_net = udplite_init_net, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4); - -struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = -{ - .l3proto = PF_INET6, - .l4proto = IPPROTO_UDPLITE, - .name = "udplite", - .allow_clash = true, - 
.pkt_to_tuple = udplite_pkt_to_tuple, - .invert_tuple = udplite_invert_tuple, - .print_tuple = udplite_print_tuple, - .packet = udplite_packet, - .get_timeouts = udplite_get_timeouts, - .new = udplite_new, - .error = udplite_error, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - .ctnl_timeout = { - .nlattr_to_obj = udplite_timeout_nlattr_to_obj, - .obj_to_nlattr = udplite_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX, - .obj_size = sizeof(unsigned int) * - CTA_TIMEOUT_UDPLITE_MAX, - .nla_policy = udplite_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .init_net = udplite_init_net, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index c3fc14e021ec..24174c520239 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -809,13 +809,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct, exp->tuple.dst.protonum != proto || exp->tuple.dst.u.udp.port != port) continue; - if (!del_timer(&exp->timeout)) - continue; - exp->flags &= ~NF_CT_EXPECT_INACTIVE; - exp->timeout.expires = jiffies + expires * HZ; - add_timer(&exp->timeout); - found = 1; - break; + if (mod_timer_pending(&exp->timeout, jiffies + expires * HZ)) { + exp->flags &= ~NF_CT_EXPECT_INACTIVE; + found = 1; + break; + } } spin_unlock_bh(&nf_conntrack_expect_lock); return found; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index d009ae663453..2256147dcaad 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -642,6 +642,9 @@ static int __init nf_conntrack_standalone_init(void) if (ret < 0) goto out_start; + BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK); + BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER); + #ifdef CONFIG_SYSCTL nf_ct_netfilter_header = register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 3dca90dc24ad..8d85a0598b60 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -13,9 +13,11 @@ /* Internal logging interface, which relies on the real LOG target modules */ -#define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 +int sysctl_nf_log_all_netns __read_mostly; +EXPORT_SYMBOL(sysctl_nf_log_all_netns); + static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); @@ -414,6 +416,18 @@ static const struct file_operations nflog_file_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table_header *nf_log_sysctl_fhdr; + +static struct ctl_table nf_log_sysctl_ftable[] = { + { + .procname = "nf_log_all_netns", + .data = &sysctl_nf_log_all_netns, + .maxlen = sizeof(sysctl_nf_log_all_netns), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; static int nf_log_proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -483,6 +497,10 @@ static int netfilter_log_sysctl_init(struct net *net) nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; } + nf_log_sysctl_fhdr = register_net_sysctl(net, 
"net/netfilter", + nf_log_sysctl_ftable); + if (!nf_log_sysctl_fhdr) + goto err_freg; } for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) @@ -499,6 +517,9 @@ static int netfilter_log_sysctl_init(struct net *net) err_reg: if (!net_eq(net, &init_net)) kfree(table); + else + unregister_net_sysctl_table(nf_log_sysctl_fhdr); +err_freg: err_alloc: return -ENOMEM; } @@ -511,6 +532,8 @@ static void netfilter_log_sysctl_exit(struct net *net) unregister_net_sysctl_table(net->nf.nf_log_dir_header); if (!net_eq(net, &init_net)) kfree(table); + else + unregister_net_sysctl_table(nf_log_sysctl_fhdr); } #else static int netfilter_log_sysctl_init(struct net *net) diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 2840abb5bb99..211661cb2c90 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -60,7 +60,7 @@ static void mangle_contents(struct sk_buff *skb, __skb_trim(skb, skb->len + rep_len - match_len); } - if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) { + if (nf_ct_l3num((struct nf_conn *)skb_nfct(skb)) == NFPROTO_IPV4) { /* fix IP hdr checksum information */ ip_hdr(skb)->tot_len = htons(skb->len); ip_send_check(ip_hdr(skb)); diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c index b1e627227b6e..edd4a77dc09a 100644 --- a/net/netfilter/nf_nat_proto_udp.c +++ b/net/netfilter/nf_nat_proto_udp.c @@ -30,20 +30,15 @@ udp_unique_tuple(const struct nf_nat_l3proto *l3proto, &udp_port_rover); } -static bool -udp_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) +static void +__udp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, struct udphdr *hdr, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, bool do_csum) { - struct udphdr *hdr; __be16 *portptr, newport; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - hdr = (struct udphdr *)(skb->data + hdroff); - if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.udp.port; @@ -53,7 +48,7 @@ udp_manip_pkt(struct sk_buff *skb, newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } - if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { + if (do_csum) { l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, @@ -62,9 +57,68 @@ udp_manip_pkt(struct sk_buff *skb, hdr->check = CSUM_MANGLED_0; } *portptr = newport; +} + +static bool udp_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct udphdr *hdr; + bool do_csum; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct udphdr *)(skb->data + hdroff); + do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL; + + __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum); + return true; +} + +#ifdef CONFIG_NF_NAT_PROTO_UDPLITE +static u16 udplite_port_rover; + +static bool udplite_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct udphdr *hdr; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr 
= (struct udphdr *)(skb->data + hdroff); + __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true); return true; } +static void +udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, + &udplite_port_rover); +} + +const struct nf_nat_l4proto nf_nat_l4proto_udplite = { + .l4proto = IPPROTO_UDPLITE, + .manip_pkt = udplite_manip_pkt, + .in_range = nf_nat_l4proto_in_range, + .unique_tuple = udplite_unique_tuple, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; +#endif /* CONFIG_NF_NAT_PROTO_UDPLITE */ + const struct nf_nat_l4proto nf_nat_l4proto_udp = { .l4proto = IPPROTO_UDP, .manip_pkt = udp_manip_pkt, diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c deleted file mode 100644 index 366bfbfd82a1..000000000000 --- a/net/netfilter/nf_nat_proto_udplite.c +++ /dev/null @@ -1,73 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/netfilter/nf_nat_l4proto.h> - -static u16 udplite_port_rover; - -static void -udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct, - &udplite_port_rover); -} - -static bool -udplite_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct udphdr *hdr; - __be16 *portptr, newport; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - hdr = (struct udphdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - /* Get rid of source port */ - newport = tuple->src.u.udp.port; - portptr = &hdr->source; - } else { - /* Get rid of dst port */ - newport = tuple->dst.u.udp.port; - portptr = &hdr->dest; - } - - l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); - if (!hdr->check) - hdr->check = CSUM_MANGLED_0; - - *portptr = newport; - return true; -} - -const struct nf_nat_l4proto nf_nat_l4proto_udplite = { - .l4proto = IPPROTO_UDPLITE, - .manip_pkt = udplite_manip_pkt, - .in_range = nf_nat_l4proto_in_range, - .unique_tuple = udplite_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif -}; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a019a87e58ee..ff7304ae58ac 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -240,6 +240,10 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, if (trans == NULL) return NULL; + if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != 
NULL) { + nft_trans_rule_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); + } nft_trans_rule(trans) = rule; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -576,6 +580,28 @@ err: return err; } +static void _nf_tables_table_disable(struct net *net, + const struct nft_af_info *afi, + struct nft_table *table, + u32 cnt) +{ + struct nft_chain *chain; + u32 i = 0; + + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + if (cnt && i++ == cnt) + break; + + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); + } +} + static int nf_tables_table_enable(struct net *net, const struct nft_af_info *afi, struct nft_table *table) @@ -598,18 +624,8 @@ static int nf_tables_table_enable(struct net *net, } return 0; err: - list_for_each_entry(chain, &table->chains, list) { - if (!nft_is_active_next(net, chain)) - continue; - if (!(chain->flags & NFT_BASE_CHAIN)) - continue; - - if (i-- <= 0) - break; - - nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, - afi->nops); - } + if (i) + _nf_tables_table_disable(net, afi, table, i); return err; } @@ -617,17 +633,7 @@ static void nf_tables_table_disable(struct net *net, const struct nft_af_info *afi, struct nft_table *table) { - struct nft_chain *chain; - - list_for_each_entry(chain, &table->chains, list) { - if (!nft_is_active_next(net, chain)) - continue; - if (!(chain->flags & NFT_BASE_CHAIN)) - continue; - - nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, - afi->nops); - } + _nf_tables_table_disable(net, afi, table, 0); } static int nf_tables_updtable(struct nft_ctx *ctx) @@ -696,10 +702,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); - table = NULL; - } - - if (table != NULL) { + } else { if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) @@ -928,7 +931,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, } static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { - [NFTA_CHAIN_TABLE] = { .type = NLA_STRING }, + [NFTA_CHAIN_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 }, [NFTA_CHAIN_NAME] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, @@ -1854,7 +1858,8 @@ static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { - [NFTA_RULE_TABLE] = { .type = NLA_STRING }, + [NFTA_RULE_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_RULE_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, @@ -2115,7 +2120,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, * is called on error from nf_tables_newrule(). 
*/ expr = nft_expr_first(rule); - while (expr->ops && expr != nft_expr_last(rule)) { + while (expr != nft_expr_last(rule) && expr->ops) { nf_tables_expr_destroy(ctx, expr); expr = nft_expr_next(expr); } @@ -2292,6 +2297,22 @@ err1: return err; } +static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_rule *rule = nft_trans_rule(trans); + + if (trans->msg_type == NFT_MSG_NEWRULE && + id == nft_trans_rule_id(trans)) + return rule; + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_delrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2330,6 +2351,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, return PTR_ERR(rule); err = nft_delrule(&ctx, rule); + } else if (nla[NFTA_RULE_ID]) { + rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); + if (IS_ERR(rule)) + return PTR_ERR(rule); + + err = nft_delrule(&ctx, rule); } else { err = nft_delrule_by_chain(&ctx); } @@ -2397,12 +2424,14 @@ nft_select_set_ops(const struct nlattr * const nla[], features = 0; if (nla[NFTA_SET_FLAGS] != NULL) { features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); - features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT; + features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT | + NFT_SET_OBJECT; } - bops = NULL; - best.size = ~0; - best.class = ~0; + bops = NULL; + best.size = ~0; + best.lookup = ~0; + best.space = ~0; list_for_each_entry(ops, &nf_tables_set_ops, list) { if ((ops->features & features) != features) @@ -2412,16 +2441,27 @@ nft_select_set_ops(const struct nlattr * const nla[], switch (policy) { case NFT_SET_POL_PERFORMANCE: - if (est.class < best.class) - break; - if (est.class == best.class && est.size < best.size) + if (est.lookup < best.lookup) break; + if (est.lookup == best.lookup) { + if (!desc->size) { + if (est.space < best.space) + break; + } else if (est.size < best.size) { + break; + } + } continue; case NFT_SET_POL_MEMORY: - if (est.size < best.size) - break; - if (est.size == best.size && est.class < best.class) + if (!desc->size) { + if (est.space < best.space) + break; + if (est.space == best.space && + est.lookup < best.lookup) + break; + } else if (est.size < best.size) { break; + } continue; default: break; @@ -2443,7 +2483,8 @@ nft_select_set_ops(const struct nlattr * const nla[], } static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { - [NFTA_SET_TABLE] = { .type = NLA_STRING }, + [NFTA_SET_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, @@ -2963,10 +3004,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) return PTR_ERR(set); - set = NULL; - } - - if (set != NULL) { + } else { if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) @@ -3084,9 +3122,9 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, } static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, const struct nft_set_iter *iter, - const struct nft_set_elem *elem) + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); enum nft_registers dreg; @@ -3122,6 +3160,7 @@ int 
nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; + iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) @@ -3192,8 +3231,10 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { - [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING }, - [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING }, + [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, + [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; @@ -3303,9 +3344,9 @@ struct nft_set_dump_args { }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, const struct nft_set_iter *iter, - const struct nft_set_elem *elem) + struct nft_set_elem *elem) { struct nft_set_dump_args *args; @@ -3317,7 +3358,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u8 genmask = nft_genmask_cur(net); - const struct nft_set *set; + struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; @@ -3373,6 +3414,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; + args.iter.flush = false; set->ops->walk(&ctx, set, &args.iter); nla_nest_end(skb, nest); @@ -3740,10 +3782,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err5; } + if (set->size && + !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { + err = -ENFILE; + goto err6; + } + nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; +err6: + set->ops->remove(ctx->net, set, &elem); err5: kfree(trans); err4: @@ -3790,15 +3840,9 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - if (set->size && - !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) - return -ENFILE; - err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); - if (err < 0) { - atomic_dec(&set->nelems); + if (err < 0) break; - } } return err; } @@ -3883,9 +3927,9 @@ err1: } static int nft_flush_set(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, const struct nft_set_iter *iter, - const struct nft_set_elem *elem) + struct nft_set_elem *elem) { struct nft_trans *trans; int err; @@ -3895,13 +3939,14 @@ static int nft_flush_set(const struct nft_ctx *ctx, if (!trans) return -ENOMEM; - if (!set->ops->deactivate_one(ctx->net, set, elem->priv)) { + if (!set->ops->flush(ctx->net, set, elem->priv)) { err = -ENOENT; goto err1; } + set->ndeact++; - nft_trans_elem_set(trans) = (struct nft_set *)set; - nft_trans_elem(trans) = *((struct nft_set_elem *)elem); + nft_trans_elem_set(trans) = set; + nft_trans_elem(trans) = *elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -3932,15 +3977,14 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, return -EBUSY; if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) { - struct nft_set_dump_args args = { - .iter = { - .genmask = genmask, - .fn = nft_flush_set, - }, + struct 
nft_set_iter iter = { + .genmask = genmask, + .fn = nft_flush_set, + .flush = true, }; - set->ops->walk(&ctx, set, &args.iter); + set->ops->walk(&ctx, set, &iter); - return args.iter.err; + return iter.err; } nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { @@ -4032,8 +4076,10 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { - [NFTA_OBJ_TABLE] = { .type = NLA_STRING }, - [NFTA_OBJ_NAME] = { .type = NLA_STRING }, + [NFTA_OBJ_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, + [NFTA_OBJ_NAME] = { .type = NLA_STRING, + .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, }; @@ -4153,10 +4199,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, if (err != -ENOENT) return err; - obj = NULL; - } - - if (obj != NULL) { + } else { if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; @@ -4262,10 +4305,11 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (filter->table[0] && + if (filter && filter->table[0] && strcmp(filter->table, table->name)) goto cont; - if (filter->type != NFT_OBJECT_UNSPEC && + if (filter && + filter->type != NFT_OBJECT_UNSPEC && obj->type->type != filter->type) goto cont; @@ -4800,7 +4844,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_DELSETELEM, 0); - te->set->ops->remove(te->set, &te->elem); + te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); te->set->ndeact--; break; @@ -4921,7 +4965,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->remove(te->set, &te->elem); + te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: @@ -4955,6 +4999,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) return 0; } +static bool nf_tables_valid_genid(struct net *net, u32 genid) +{ + return net->nft.base_seq == genid; +} + static const struct nfnetlink_subsystem nf_tables_subsys = { .name = "nf_tables", .subsys_id = NFNL_SUBSYS_NFTABLES, @@ -4962,6 +5011,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .cb = nf_tables_cb, .commit = nf_tables_commit, .abort = nf_tables_abort, + .valid_genid = nf_tables_valid_genid, }; int nft_chain_validate_dependency(const struct nft_chain *chain, @@ -5009,9 +5059,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, const struct nft_chain *chain); static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, const struct nft_set_iter *iter, - const struct nft_set_elem *elem) + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); const struct nft_data *data; @@ -5035,7 +5085,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, { const struct nft_rule *rule; const struct nft_expr *expr, *last; - const struct nft_set *set; + struct nft_set *set; struct nft_set_binding *binding; struct nft_set_iter iter; @@ -5087,6 +5137,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, iter.count = 0; iter.err = 0; iter.fn = nf_tables_loop_check_setelem; + iter.flush = false; 
set->ops->walk(ctx, set, &iter); if (iter.err < 0) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a09fa9fd8f3d..68eda920160e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -3,7 +3,7 @@ * * (C) 2001 by Jay Schulist <jschlst@samba.org>, * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> - * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial netfilter messages via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) @@ -100,9 +100,9 @@ int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) } EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); -static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) +static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type) { - u_int8_t subsys_id = NFNL_SUBSYS_ID(type); + u8 subsys_id = NFNL_SUBSYS_ID(type); if (subsys_id >= NFNL_SUBSYS_COUNT) return NULL; @@ -111,9 +111,9 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t } static inline const struct nfnl_callback * -nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) +nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss) { - u_int8_t cb_id = NFNL_MSG_TYPE(type); + u8 cb_id = NFNL_MSG_TYPE(type); if (cb_id >= ss->cb_count) return NULL; @@ -185,7 +185,7 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; @@ -273,13 +273,13 @@ enum { }; static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, - u_int16_t subsys_id) + u16 subsys_id, u32 genid) { struct sk_buff *oskb = skb; struct net *net = sock_net(skb->sk); const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; - static LIST_HEAD(err_list); + LIST_HEAD(err_list); u32 status; int err; @@ -315,6 +315,12 @@ replay: return kfree_skb(skb); } + if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) { + nfnl_unlock(subsys_id); + netlink_ack(oskb, nlh, -ERESTART); + return kfree_skb(skb); + } + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -365,7 +371,7 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; @@ -436,11 +442,51 @@ done: kfree_skb(skb); } +static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = { + [NFNL_BATCH_GENID] = { .type = NLA_U32 }, +}; + +static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + struct nlattr *attr = (void *)nlh + min_len; + struct nlattr *cda[NFNL_BATCH_MAX + 1]; + int attrlen = nlh->nlmsg_len - min_len; + struct nfgenmsg *nfgenmsg; + int msglen, err; + u32 gen_id = 0; + u16 res_id; + + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) + return; + + err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy); + if (err < 0) { + 
netlink_ack(skb, nlh, err); + return; + } + if (cda[NFNL_BATCH_GENID]) + gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID])); + + nfgenmsg = nlmsg_data(nlh); + skb_pull(skb, msglen); + /* Work around old nft using host byte order */ + if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + res_id = NFNL_SUBSYS_NFTABLES; + else + res_id = ntohs(nfgenmsg->res_id); + + nfnetlink_rcv_batch(skb, nlh, res_id, gen_id); +} + static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - u_int16_t res_id; - int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) @@ -451,28 +497,10 @@ static void nfnetlink_rcv(struct sk_buff *skb) return; } - if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) { - struct nfgenmsg *nfgenmsg; - - msglen = NLMSG_ALIGN(nlh->nlmsg_len); - if (msglen > skb->len) - msglen = skb->len; - - if (nlh->nlmsg_len < NLMSG_HDRLEN || - skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) - return; - - nfgenmsg = nlmsg_data(nlh); - skb_pull(skb, msglen); - /* Work around old nft using host byte order */ - if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) - res_id = NFNL_SUBSYS_NFTABLES; - else - res_id = ntohs(nfgenmsg->res_id); - nfnetlink_rcv_batch(skb, nlh, res_id); - } else { + if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) + nfnetlink_rcv_skb_batch(skb, nlh); + else netlink_rcv_skb(skb, &nfnetlink_rcv_msg); - } } #ifdef CONFIG_MODULES diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 3b79f34b5095..de8782345c86 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -48,7 +48,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, if (helper == NULL) return NF_DROP; - /* This is an user-space helper not yet configured, skip. */ + /* This is a user-space helper not yet configured, skip. 
*/ if ((helper->flags & (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == NF_CT_HELPER_F_USERSPACE) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index e6baeaebe653..bf548a7a71ec 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -32,6 +32,11 @@ struct nft_ct { }; }; +#ifdef CONFIG_NF_CONNTRACK_ZONES +static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); +static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; +#endif + static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, enum nft_ct_keys k, enum ip_conntrack_dir d) @@ -129,12 +134,40 @@ static void nft_ct_get_eval(const struct nft_expr *expr, memcpy(dest, &count, sizeof(count)); return; } + case NFT_CT_AVGPKT: { + const struct nf_conn_acct *acct = nf_conn_acct_find(ct); + u64 avgcnt = 0, bcnt = 0, pcnt = 0; + + if (acct) { + pcnt = nft_ct_get_eval_counter(acct->counter, + NFT_CT_PKTS, priv->dir); + bcnt = nft_ct_get_eval_counter(acct->counter, + NFT_CT_BYTES, priv->dir); + if (pcnt != 0) + avgcnt = div64_u64(bcnt, pcnt); + } + + memcpy(dest, &avgcnt, sizeof(avgcnt)); + return; + } case NFT_CT_L3PROTOCOL: *dest = nf_ct_l3num(ct); return; case NFT_CT_PROTOCOL: *dest = nf_ct_protonum(ct); return; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: { + const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + + if (priv->dir < IP_CT_DIR_MAX) + *dest = nf_ct_zone_id(zone, priv->dir); + else + *dest = zone->id; + + return; + } +#endif default: break; } @@ -163,6 +196,53 @@ err: regs->verdict.code = NFT_BREAK; } +#ifdef CONFIG_NF_CONNTRACK_ZONES +static void nft_ct_set_zone_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR }; + const struct nft_ct *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + enum ip_conntrack_info ctinfo; + u16 value = regs->data[priv->sreg]; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) /* already tracked */ + return; + + zone.id = value; + + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + zone.dir = NF_CT_ZONE_DIR_ORIG; + break; + case IP_CT_DIR_REPLY: + zone.dir = NF_CT_ZONE_DIR_REPL; + break; + default: + break; + } + + ct = this_cpu_read(nft_ct_pcpu_template); + + if (likely(atomic_read(&ct->ct_general.use) == 1)) { + nf_ct_zone_add(ct, &zone); + } else { + /* previous skb got queued to userspace */ + ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); + if (!ct) { + regs->verdict.code = NF_DROP; + return; + } + } + + atomic_inc(&ct->ct_general.use); + nf_ct_set(skb, ct, IP_CT_NEW); +} +#endif + static void nft_ct_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -241,6 +321,45 @@ static void nft_ct_netns_put(struct net *net, uint8_t family) nf_ct_netns_put(net, family); } +#ifdef CONFIG_NF_CONNTRACK_ZONES +static void nft_ct_tmpl_put_pcpu(void) +{ + struct nf_conn *ct; + int cpu; + + for_each_possible_cpu(cpu) { + ct = per_cpu(nft_ct_pcpu_template, cpu); + if (!ct) + break; + nf_ct_put(ct); + per_cpu(nft_ct_pcpu_template, cpu) = NULL; + } +} + +static bool nft_ct_tmpl_alloc_pcpu(void) +{ + struct nf_conntrack_zone zone = { .id = 0 }; + struct nf_conn *tmp; + int cpu; + + if (nft_ct_pcpu_template_refcnt) + return true; + + for_each_possible_cpu(cpu) { + tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL); + if (!tmp) { + nft_ct_tmpl_put_pcpu(); + return false; + } + + atomic_set(&tmp->ct_general.use, 1); + per_cpu(nft_ct_pcpu_template, cpu) = tmp; + } + 
+ return true; +} +#endif + static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -250,6 +369,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + priv->dir = IP_CT_DIR_MAX; switch (priv->key) { case NFT_CT_DIRECTION: if (tb[NFTA_CT_DIRECTION] != NULL) @@ -316,11 +436,14 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, break; case NFT_CT_BYTES: case NFT_CT_PKTS: - /* no direction? return sum of original + reply */ - if (tb[NFTA_CT_DIRECTION] == NULL) - priv->dir = IP_CT_DIR_MAX; + case NFT_CT_AVGPKT: len = sizeof(u64); break; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: + len = sizeof(u16); + break; +#endif default: return -EOPNOTSUPP; } @@ -346,21 +469,41 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; - if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS) + if (priv->key == NFT_CT_BYTES || + priv->key == NFT_CT_PKTS || + priv->key == NFT_CT_AVGPKT) nf_ct_set_acct(ctx->net, true); return 0; } +static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) +{ + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_put(ctx->net); + break; +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: + if (--nft_ct_pcpu_template_refcnt == 0) + nft_ct_tmpl_put_pcpu(); +#endif + default: + break; + } +} + static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); - bool label_got = false; unsigned int len; int err; + priv->dir = IP_CT_DIR_MAX; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK @@ -378,13 +521,31 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); if (err) return err; - label_got = true; + break; +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: + if (!nft_ct_tmpl_alloc_pcpu()) + return -ENOMEM; + nft_ct_pcpu_template_refcnt++; + len = sizeof(u16); break; #endif default: return -EOPNOTSUPP; } + if (tb[NFTA_CT_DIRECTION]) { + priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + case IP_CT_DIR_REPLY: + break; + default: + return -EINVAL; + } + } + priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) @@ -397,8 +558,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, return 0; err1: - if (label_got) - nf_connlabels_put(ctx->net); + __nft_ct_set_destroy(ctx, priv); return err; } @@ -413,16 +573,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, { struct nft_ct *priv = nft_expr_priv(expr); - switch (priv->key) { -#ifdef CONFIG_NF_CONNTRACK_LABELS - case NFT_CT_LABELS: - nf_connlabels_put(ctx->net); - break; -#endif - default: - break; - } - + __nft_ct_set_destroy(ctx, priv); nft_ct_netns_put(ctx->net, ctx->afi->family); } @@ -445,6 +596,8 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) break; case NFT_CT_BYTES: case NFT_CT_PKTS: + case NFT_CT_AVGPKT: + case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; @@ -467,6 +620,17 @@ static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto 
nla_put_failure; + + switch (priv->key) { + case NFT_CT_ZONE: + if (priv->dir < IP_CT_DIR_MAX && + nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + break; + default: + break; + } + return 0; nla_put_failure: @@ -492,6 +656,17 @@ static const struct nft_expr_ops nft_ct_set_ops = { .dump = nft_ct_set_dump, }; +#ifdef CONFIG_NF_CONNTRACK_ZONES +static const struct nft_expr_ops nft_ct_set_zone_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_set_zone_eval, + .init = nft_ct_set_init, + .destroy = nft_ct_set_destroy, + .dump = nft_ct_set_dump, +}; +#endif + static const struct nft_expr_ops * nft_ct_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -505,8 +680,13 @@ nft_ct_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_CT_DREG]) return &nft_ct_get_ops; - if (tb[NFTA_CT_SREG]) + if (tb[NFTA_CT_SREG]) { +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE)) + return &nft_ct_set_zone_ops; +#endif return &nft_ct_set_ops; + } return ERR_PTR(-EINVAL); } @@ -534,8 +714,7 @@ static void nft_notrack_eval(const struct nft_expr *expr, ct = nf_ct_untracked_get(); atomic_inc(&ct->ct_general.use); - skb->nfct = &ct->ct_general; - skb->nfctinfo = IP_CT_NEW; + nf_ct_set(skb, ct, IP_CT_NEW); } static struct nft_expr_type nft_notrack_type; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 7de2f46734a4..049ad2d9ee66 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -98,7 +98,8 @@ out: } static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { - [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING }, + [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, [NFTA_DYNSET_OP] = { .type = NLA_U32 }, [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 47beb3abcc9d..c308920b194c 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -15,19 +15,29 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> -// FIXME: -#include <net/ipv6.h> +#include <net/tcp.h> struct nft_exthdr { u8 type; u8 offset; u8 len; + u8 op; enum nft_registers dreg:8; + u8 flags; }; -static void nft_exthdr_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static unsigned int optlen(const u8 *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0) + return 1; + else + return opt[offset + 1]; +} + +static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; @@ -35,8 +45,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr, int err; err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); - if (err < 0) + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = (err >= 0); + return; + } else if (err < 0) { goto err; + } offset += priv->offset; dest[priv->len / NFT_REG32_SIZE] = 0; @@ -47,11 +61,59 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_tcp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; + struct nft_exthdr *priv = nft_expr_priv(expr); +
unsigned int i, optl, tcphdr_len, offset; + u32 *dest = &regs->data[priv->dreg]; + struct tcphdr *tcph; + u8 *opt; + + if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) + goto err; + + tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff); + if (!tcph) + goto err; + + tcphdr_len = __tcp_hdrlen(tcph); + if (tcphdr_len < sizeof(*tcph)) + goto err; + + tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff); + if (!tcph) + goto err; + + opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { + optl = optlen(opt, i); + + if (priv->type != opt[i]) + continue; + + if (i + optl > tcphdr_len || priv->len + priv->offset > optl) + goto err; + + offset = i + priv->offset; + dest[priv->len / NFT_REG32_SIZE] = 0; + memcpy(dest, opt + offset, priv->len); + + return; + } + +err: + regs->verdict.code = NFT_BREAK; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, + [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, @@ -59,13 +121,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 offset, len; + u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; int err; - if (tb[NFTA_EXTHDR_DREG] == NULL || - tb[NFTA_EXTHDR_TYPE] == NULL || - tb[NFTA_EXTHDR_OFFSET] == NULL || - tb[NFTA_EXTHDR_LEN] == NULL) + if (!tb[NFTA_EXTHDR_DREG] || + !tb[NFTA_EXTHDR_TYPE] || + !tb[NFTA_EXTHDR_OFFSET] || + !tb[NFTA_EXTHDR_LEN]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); @@ -76,10 +138,27 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (tb[NFTA_EXTHDR_FLAGS]) { + err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags); + if (err < 0) + return err; + + if (flags & ~NFT_EXTHDR_F_PRESENT) + return -EINVAL; + } + + if (tb[NFTA_EXTHDR_OP]) { + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); + if (err < 0) + return err; + } + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); + priv->flags = flags; + priv->op = op; return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, priv->len); @@ -97,6 +176,10 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op))) + goto nla_put_failure; return 0; nla_put_failure: @@ -104,17 +187,45 @@ nla_put_failure: } static struct nft_expr_type nft_exthdr_type; -static const struct nft_expr_ops nft_exthdr_ops = { +static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), - .eval = nft_exthdr_eval, + .eval = nft_exthdr_ipv6_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_tcp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_tcp_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + +static const struct nft_expr_ops * +nft_exthdr_select_ops(const struct nft_ctx 
*ctx, + const struct nlattr * const tb[]) +{ + u32 op; + + if (!tb[NFTA_EXTHDR_OP]) + return &nft_exthdr_ipv6_ops; + + op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP])); + switch (op) { + case NFT_EXTHDR_OP_TCPOPT: + return &nft_exthdr_tcp_ops; + case NFT_EXTHDR_OP_IPV6: + return &nft_exthdr_ipv6_ops; + } + + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_exthdr_type __read_mostly = { .name = "exthdr", - .ops = &nft_exthdr_ops, + .select_ops = &nft_exthdr_select_ops, .policy = nft_exthdr_policy, .maxattr = NFTA_EXTHDR_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 6271e40a3dd6..6f6e64423643 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -39,7 +39,8 @@ static void nft_log_eval(const struct nft_expr *expr, static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_GROUP] = { .type = NLA_U16 }, - [NFTA_LOG_PREFIX] = { .type = NLA_STRING }, + [NFTA_LOG_PREFIX] = { .type = NLA_STRING, + .len = NF_LOG_PREFIXLEN - 1 }, [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, [NFTA_LOG_LEVEL] = { .type = NLA_U32 }, diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index d4f97fa7e21d..e21aea7e5ec8 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -49,7 +49,8 @@ static void nft_lookup_eval(const struct nft_expr *expr, } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { - [NFTA_LOOKUP_SET] = { .type = NLA_STRING }, + [NFTA_LOOKUP_SET] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 66c7f4b4c49b..e1f5ca9b423b 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -154,13 +154,36 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = PACKET_BROADCAST; break; case NFPROTO_IPV6: - if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + *dest = PACKET_MULTICAST; + break; + case NFPROTO_NETDEV: + switch (skb->protocol) { + case htons(ETH_P_IP): { + int noff = skb_network_offset(skb); + struct iphdr *iph, _iph; + + iph = skb_header_pointer(skb, noff, + sizeof(_iph), &_iph); + if (!iph) + goto err; + + if (ipv4_is_multicast(iph->daddr)) + *dest = PACKET_MULTICAST; + else + *dest = PACKET_BROADCAST; + + break; + } + case htons(ETH_P_IPV6): *dest = PACKET_MULTICAST; - else - *dest = PACKET_BROADCAST; + break; + default: + WARN_ON_ONCE(1); + goto err; + } break; default: - WARN_ON(1); + WARN_ON_ONCE(1); goto err; } break; diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 415a65ba2b85..1ae8c49ca4a1 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -193,10 +193,12 @@ nft_objref_select_ops(const struct nft_ctx *ctx, } static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { - [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING }, + [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING, + .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 }, [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 }, - [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING }, + [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 36d2b1096546..7d699bbd45b0 100644 --- a/net/netfilter/nft_payload.c +++ 
b/net/netfilter/nft_payload.c @@ -250,6 +250,22 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, return 0; } +static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, + __wsum fsum, __wsum tsum, int csum_offset) +{ + __sum16 sum; + + if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + nft_csum_replace(&sum, fsum, tsum); + if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + return 0; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -259,7 +275,6 @@ static void nft_payload_set_eval(const struct nft_expr *expr, const u32 *src = &regs->data[priv->sreg]; int offset, csum_offset; __wsum fsum, tsum; - __sum16 sum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: @@ -282,18 +297,14 @@ static void nft_payload_set_eval(const struct nft_expr *expr, csum_offset = offset + priv->csum_offset; offset += priv->offset; - if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && + if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) && (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER || skb->ip_summed != CHECKSUM_PARTIAL)) { - if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) - goto err; - fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); - nft_csum_replace(&sum, fsum, tsum); - if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || - skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && + nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset)) goto err; if (priv->csum_flags && diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 3e19fa1230dc..dbb6aaff67ec 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -38,7 +38,7 @@ static void nft_queue_eval(const struct nft_expr *expr, if (priv->queues_total > 1) { if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); queue = priv->queuenum + cpu % priv->queues_total; } else { diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index bd6efc53f26d..2d6fe3559912 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -110,30 +110,32 @@ static int nft_quota_obj_init(const struct nlattr * const tb[], static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, bool reset) { + u64 consumed, consumed_cap; u32 flags = priv->flags; - u64 consumed; - - if (reset) { - consumed = atomic64_xchg(&priv->consumed, 0); - if (test_and_clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) - flags |= NFT_QUOTA_F_DEPLETED; - } else { - consumed = atomic64_read(&priv->consumed); - } /* Since we inconditionally increment consumed quota for each packet * that we see, don't go over the quota boundary in what we send to * userspace. 
*/ - if (consumed > priv->quota) - consumed = priv->quota; + consumed = atomic64_read(&priv->consumed); + if (consumed >= priv->quota) { + consumed_cap = priv->quota; + flags |= NFT_QUOTA_F_DEPLETED; + } else { + consumed_cap = consumed; + } if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), NFTA_QUOTA_PAD) || - nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed), + nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap), NFTA_QUOTA_PAD) || nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) goto nla_put_failure; + + if (reset) { + atomic64_sub(consumed, &priv->consumed); + clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); + } return 0; nla_put_failure: diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c new file mode 100644 index 000000000000..152d226552c1 --- /dev/null +++ b/net/netfilter/nft_set_bitmap.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +/* This bitmap uses two bits to represent one element. These two bits determine + * the element state in the current and the future generation. + * + * An element can be in three states. The generation cursor is represented using + * the ^ character, note that this cursor shifts on every succesful transaction. + * If no transaction is going on, we observe all elements are in the following + * state: + * + * 11 = this element is active in the current generation. In case of no updates, + * ^ it stays active in the next generation. + * 00 = this element is inactive in the current generation. In case of no + * ^ updates, it stays inactive in the next generation. + * + * On transaction handling, we observe these two temporary states: + * + * 01 = this element is inactive in the current generation and it becomes active + * ^ in the next one. This happens when the element is inserted but commit + * path has not yet been executed yet, so activation is still pending. On + * transaction abortion, the element is removed. + * 10 = this element is active in the current generation and it becomes inactive + * ^ in the next one. This happens when the element is deactivated but commit + * path has not yet been executed yet, so removal is still pending. On + * transation abortion, the next generation bit is reset to go back to + * restore its previous state. + */ +struct nft_bitmap { + u16 bitmap_size; + u8 bitmap[]; +}; + +static inline void nft_bitmap_location(u32 key, u32 *idx, u32 *off) +{ + u32 k = (key << 1); + + *idx = k / BITS_PER_BYTE; + *off = k % BITS_PER_BYTE; +} + +/* Fetch the two bits that represent the element and check if it is active based + * on the generation mask. 
+ */ +static inline bool +nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) +{ + return (bitmap[idx] & (0x3 << off)) & (genmask << off); +} + +static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + u32 idx, off; + + nft_bitmap_location(*key, &idx, &off); + + return nft_bitmap_active(priv->bitmap, idx, off, genmask); +} + +static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **_ext) +{ + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_set_ext *ext = elem->priv; + u8 genmask = nft_genmask_next(net); + u32 idx, off; + + nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) + return -EEXIST; + + /* Enter 01 state. */ + priv->bitmap[idx] |= (genmask << off); + + return 0; +} + +static void nft_bitmap_remove(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_set_ext *ext = elem->priv; + u8 genmask = nft_genmask_next(net); + u32 idx, off; + + nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + /* Enter 00 state. */ + priv->bitmap[idx] &= ~(genmask << off); +} + +static void nft_bitmap_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_set_ext *ext = elem->priv; + u8 genmask = nft_genmask_next(net); + u32 idx, off; + + nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + /* Enter 11 state. */ + priv->bitmap[idx] |= (genmask << off); +} + +static bool nft_bitmap_flush(const struct net *net, + const struct nft_set *set, void *ext) +{ + struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_next(net); + u32 idx, off; + + nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + /* Enter 10 state, similar to deactivation. */ + priv->bitmap[idx] &= ~(genmask << off); + + return true; +} + +static struct nft_set_ext *nft_bitmap_ext_alloc(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_set_ext_tmpl tmpl; + struct nft_set_ext *ext; + + nft_set_ext_prepare(&tmpl); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + + ext = kzalloc(tmpl.len, GFP_KERNEL); + if (!ext) + return NULL; + + nft_set_ext_init(ext, &tmpl); + memcpy(nft_set_ext_key(ext), elem->key.val.data, set->klen); + + return ext; +} + +static void *nft_bitmap_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_next(net); + struct nft_set_ext *ext; + u32 idx, off, key = 0; + + memcpy(&key, elem->key.val.data, set->klen); + nft_bitmap_location(key, &idx, &off); + + if (!nft_bitmap_active(priv->bitmap, idx, off, genmask)) + return NULL; + + /* We have no real set extension since this is a bitmap, allocate this + * dummy object that is released from the commit/abort path. + */ + ext = nft_bitmap_ext_alloc(set, elem); + if (!ext) + return NULL; + + /* Enter 10 state. 
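To make the two-bits-per-element scheme above concrete: lookup only ever tests the current-generation bit, while insert/activate/remove/flush/deactivate only touch the next-generation bit, and a commit is nothing more than the global generation flip. Below is a deliberately simplified userspace model of that addressing and of the 00 -> 01 -> 11 and 11 -> 10 -> 00 transitions; it mimics the idea, not the exact nftables genmask encoding, and all names are mine.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define KEYS 256				/* all possible 8-bit keys */

static uint8_t bitmap[KEYS * 2 / 8];		/* two bits per possible key */
static unsigned int gen;			/* current generation: 0 or 1 */

static void location(uint8_t key, uint32_t *idx, uint32_t *off)
{
	uint32_t k = (uint32_t)key << 1;	/* each element spans two bits */

	*idx = k / 8;
	*off = k % 8;
}

static bool lookup(uint8_t key)			/* test the current-generation bit */
{
	uint32_t idx, off;

	location(key, &idx, &off);
	return bitmap[idx] & (1u << (off + gen));
}

static void set_next(uint8_t key, bool on)	/* touch only the next-generation bit */
{
	uint32_t idx, off;

	location(key, &idx, &off);
	if (on)
		bitmap[idx] |= 1u << (off + !gen);
	else
		bitmap[idx] &= ~(1u << (off + !gen));
}

int main(void)
{
	set_next(42, true);			/* pending insertion: 01 */
	printf("before commit: %d\n", lookup(42));
	gen = !gen;				/* commit flips the generation */
	set_next(42, true);			/* activation keeps it active: 11 */
	printf("after commit:  %d\n", lookup(42));

	set_next(42, false);			/* pending removal: 10 */
	printf("before commit: %d\n", lookup(42));
	gen = !gen;
	set_next(42, false);			/* removal clears the last bit: 00 */
	printf("after commit:  %d\n", lookup(42));
	return 0;
}

Pending changes are invisible to lookups until the flip, and an abort can simply undo the next-generation bit, which is the property the comment block in the new file documents.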
*/ + priv->bitmap[idx] &= ~(genmask << off); + + return ext; +} + +static void nft_bitmap_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + struct nft_set_ext_tmpl tmpl; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int idx, off; + u16 key; + + nft_set_ext_prepare(&tmpl); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + + for (idx = 0; idx < priv->bitmap_size; idx++) { + for (off = 0; off < BITS_PER_BYTE; off += 2) { + if (iter->count < iter->skip) + goto cont; + + if (!nft_bitmap_active(priv->bitmap, idx, off, + iter->genmask)) + goto cont; + + ext = kzalloc(tmpl.len, GFP_KERNEL); + if (!ext) { + iter->err = -ENOMEM; + return; + } + nft_set_ext_init(ext, &tmpl); + key = ((idx * BITS_PER_BYTE) + off) >> 1; + memcpy(nft_set_ext_key(ext), &key, set->klen); + + elem.priv = ext; + iter->err = iter->fn(ctx, set, iter, &elem); + + /* On set flush, this dummy extension object is released + * from the commit/abort path. + */ + if (!iter->flush) + kfree(ext); + + if (iter->err < 0) + return; +cont: + iter->count++; + } + } +} + +/* The bitmap size is pow(2, key length in bits) / bits per byte. This is + * multiplied by two since each element takes two bits. For 8 bit keys, the + * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes. + */ +static inline u32 nft_bitmap_size(u32 klen) +{ + return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1; +} + +static inline u32 nft_bitmap_total_size(u32 klen) +{ + return sizeof(struct nft_bitmap) + nft_bitmap_size(klen); +} + +static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[]) +{ + u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); + + return nft_bitmap_total_size(klen); +} + +static int nft_bitmap_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_bitmap *priv = nft_set_priv(set); + + priv->bitmap_size = nft_bitmap_size(set->klen); + + return 0; +} + +static void nft_bitmap_destroy(const struct nft_set *set) +{ +} + +static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + /* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. 
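The size comment above works out as follows: a klen-byte key has 2^(8*klen) possible values and each takes two bits, so the raw bitmap is 2^(8*klen) / 4 bytes, i.e. 64 bytes for 8-bit keys and 16384 bytes for 16-bit keys; the quoted 66 and 16388 byte totals add the small struct nft_bitmap header (exact header size depends on structure layout). A quick standalone check of the same expression:

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE 8

/* Same expression as nft_bitmap_size(): 2^(klen*8) values, two bits each. */
static uint32_t bitmap_size(uint32_t klen)
{
	return ((2u << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
}

int main(void)
{
	printf("klen=1: %u bytes of bitmap\n", bitmap_size(1));	/* 64    */
	printf("klen=2: %u bytes of bitmap\n", bitmap_size(2));	/* 16384 */
	return 0;
}

The estimate callback that follows refuses keys longer than two bytes, which is what keeps the whole set at or below roughly 16 Kbytes.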
*/ + if (desc->klen > 2) + return false; + + est->size = nft_bitmap_total_size(desc->klen); + est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_1; + + return true; +} + +static struct nft_set_ops nft_bitmap_ops __read_mostly = { + .privsize = nft_bitmap_privsize, + .estimate = nft_bitmap_estimate, + .init = nft_bitmap_init, + .destroy = nft_bitmap_destroy, + .insert = nft_bitmap_insert, + .remove = nft_bitmap_remove, + .deactivate = nft_bitmap_deactivate, + .flush = nft_bitmap_flush, + .activate = nft_bitmap_activate, + .lookup = nft_bitmap_lookup, + .walk = nft_bitmap_walk, + .owner = THIS_MODULE, +}; + +static int __init nft_bitmap_module_init(void) +{ + return nft_register_set(&nft_bitmap_ops); +} + +static void __exit nft_bitmap_module_exit(void) +{ + nft_unregister_set(&nft_bitmap_ops); +} + +module_init(nft_bitmap_module_init); +module_exit(nft_bitmap_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 1e20e2bbb6d9..5f652720fc78 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -167,8 +167,8 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, nft_set_elem_clear_busy(&he->ext); } -static bool nft_hash_deactivate_one(const struct net *net, - const struct nft_set *set, void *priv) +static bool nft_hash_flush(const struct net *net, + const struct nft_set *set, void *priv) { struct nft_hash_elem *he = priv; @@ -195,7 +195,7 @@ static void *nft_hash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); if (he != NULL && - !nft_hash_deactivate_one(net, set, he)) + !nft_hash_flush(net, set, he)) he = NULL; rcu_read_unlock(); @@ -203,7 +203,8 @@ static void *nft_hash_deactivate(const struct net *net, return he; } -static void nft_hash_remove(const struct nft_set *set, +static void nft_hash_remove(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); @@ -212,7 +213,7 @@ static void nft_hash_remove(const struct nft_set *set, rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); } -static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, +static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_hash *priv = nft_set_priv(set); @@ -383,7 +384,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, est->size = esize + 2 * sizeof(struct nft_hash_elem *); } - est->class = NFT_SET_CLASS_O_1; + est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_N; return true; } @@ -397,12 +399,12 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .insert = nft_hash_insert, .activate = nft_hash_activate, .deactivate = nft_hash_deactivate, - .deactivate_one = nft_hash_deactivate_one, + .flush = nft_hash_flush, .remove = nft_hash_remove, .lookup = nft_hash_lookup, .update = nft_hash_update, .walk = nft_hash_walk, - .features = NFT_SET_MAP | NFT_SET_TIMEOUT, + .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 08376e50f6cd..71e8fb886a73 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -151,7 +151,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, 
return err; } -static void nft_rbtree_remove(const struct nft_set *set, +static void nft_rbtree_remove(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree *priv = nft_set_priv(set); @@ -171,8 +172,8 @@ static void nft_rbtree_activate(const struct net *net, nft_set_elem_change_active(net, set, &rbe->ext); } -static bool nft_rbtree_deactivate_one(const struct net *net, - const struct nft_set *set, void *priv) +static bool nft_rbtree_flush(const struct net *net, + const struct nft_set *set, void *priv) { struct nft_rbtree_elem *rbe = priv; @@ -213,7 +214,7 @@ static void *nft_rbtree_deactivate(const struct net *net, parent = parent->rb_right; continue; } - nft_rbtree_deactivate_one(net, set, rbe); + nft_rbtree_flush(net, set, rbe); return rbe; } } @@ -221,7 +222,7 @@ static void *nft_rbtree_deactivate(const struct net *net, } static void nft_rbtree_walk(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, struct nft_set_iter *iter) { const struct nft_rbtree *priv = nft_set_priv(set); @@ -290,7 +291,8 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, else est->size = nsize; - est->class = NFT_SET_CLASS_O_LOG_N; + est->lookup = NFT_SET_CLASS_O_LOG_N; + est->space = NFT_SET_CLASS_O_N; return true; } @@ -304,11 +306,11 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = { .insert = nft_rbtree_insert, .remove = nft_rbtree_remove, .deactivate = nft_rbtree_deactivate, - .deactivate_one = nft_rbtree_deactivate_one, + .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, - .features = NFT_SET_INTERVAL | NFT_SET_MAP, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, .owner = THIS_MODULE, }; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2ff499680cc6..14857afc9937 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -262,6 +262,60 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) } EXPORT_SYMBOL_GPL(xt_request_find_target); + +static int xt_obj_to_user(u16 __user *psize, u16 size, + void __user *pname, const char *name, + u8 __user *prev, u8 rev) +{ + if (put_user(size, psize)) + return -EFAULT; + if (copy_to_user(pname, name, strlen(name) + 1)) + return -EFAULT; + if (put_user(rev, prev)) + return -EFAULT; + + return 0; +} + +#define XT_OBJ_TO_USER(U, K, TYPE, C_SIZE) \ + xt_obj_to_user(&U->u.TYPE##_size, C_SIZE ? : K->u.TYPE##_size, \ + U->u.user.name, K->u.kernel.TYPE->name, \ + &U->u.user.revision, K->u.kernel.TYPE->revision) + +int xt_data_to_user(void __user *dst, const void *src, + int usersize, int size) +{ + usersize = usersize ? : size; + if (copy_to_user(dst, src, usersize)) + return -EFAULT; + if (usersize != size && clear_user(dst + usersize, size - usersize)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(xt_data_to_user); + +#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ + xt_data_to_user(U->data, K->data, \ + K->u.kernel.TYPE->usersize, \ + C_SIZE ? 
: K->u.kernel.TYPE->TYPE##size) + +int xt_match_to_user(const struct xt_entry_match *m, + struct xt_entry_match __user *u) +{ + return XT_OBJ_TO_USER(u, m, match, 0) || + XT_DATA_TO_USER(u, m, match, 0); +} +EXPORT_SYMBOL_GPL(xt_match_to_user); + +int xt_target_to_user(const struct xt_entry_target *t, + struct xt_entry_target __user *u) +{ + return XT_OBJ_TO_USER(u, t, target, 0) || + XT_DATA_TO_USER(u, t, target, 0); +} +EXPORT_SYMBOL_GPL(xt_target_to_user); + static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_match *m; @@ -565,17 +619,14 @@ int xt_compat_match_to_user(const struct xt_entry_match *m, int off = xt_compat_match_offset(match); u_int16_t msize = m->u.user.match_size - off; - if (copy_to_user(cm, m, sizeof(*cm)) || - put_user(msize, &cm->u.user.match_size) || - copy_to_user(cm->u.user.name, m->u.kernel.match->name, - strlen(m->u.kernel.match->name) + 1)) + if (XT_OBJ_TO_USER(cm, m, match, msize)) return -EFAULT; if (match->compat_to_user) { if (match->compat_to_user((void __user *)cm->data, m->data)) return -EFAULT; } else { - if (copy_to_user(cm->data, m->data, msize - sizeof(*cm))) + if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) return -EFAULT; } @@ -616,7 +667,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset) return -EINVAL; - /* compat_xt_entry match has less strict aligment requirements, + /* compat_xt_entry match has less strict alignment requirements, * otherwise they are identical. In case of padding differences * we need to add compat version of xt_check_entry_match. */ @@ -923,17 +974,14 @@ int xt_compat_target_to_user(const struct xt_entry_target *t, int off = xt_compat_target_offset(target); u_int16_t tsize = t->u.user.target_size - off; - if (copy_to_user(ct, t, sizeof(*ct)) || - put_user(tsize, &ct->u.user.target_size) || - copy_to_user(ct->u.user.name, t->u.kernel.target->name, - strlen(t->u.kernel.target->name) + 1)) + if (XT_OBJ_TO_USER(ct, t, target, tsize)) return -EFAULT; if (target->compat_to_user) { if (target->compat_to_user((void __user *)ct->data, t->data)) return -EFAULT; } else { - if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct))) + if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) return -EFAULT; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 95c750358747..b008db0184b8 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -23,15 +23,14 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) { /* Previously seen (loopback)? Ignore. 
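Aside on the new xt_data_to_user()/XT_DATA_TO_USER helpers above: they copy only the first usersize bytes of a match/target private area to userspace and clear the remainder, so kernel-internal state stored after the user-supplied configuration (the members named in the .usersize = offsetof(...) initialisers added later in this series) is never exported when rules are dumped. A userspace sketch of that copy-and-pad behaviour, with a made-up structure standing in for an xt info blob:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for an xt info blob: user config followed by kernel-only state. */
struct demo_info {
	uint32_t user_cfg;	/* what userspace originally passed in */
	void *kernel_priv;	/* kernel-only pointer, must not leak */
};

/* Mimic xt_data_to_user(): copy usersize bytes, zero the rest of size. */
static int data_to_user(void *dst, const void *src, size_t usersize, size_t size)
{
	if (!usersize)
		usersize = size;	/* no usersize given: copy everything */
	memcpy(dst, src, usersize);
	if (usersize != size)
		memset((char *)dst + usersize, 0, size - usersize);
	return 0;
}

int main(void)
{
	struct demo_info kern = { .user_cfg = 42, .kernel_priv = &kern };
	struct demo_info dump;

	data_to_user(&dump, &kern, offsetof(struct demo_info, kernel_priv),
		     sizeof(kern));
	printf("cfg=%u priv=%p\n", dump.user_cfg, dump.kernel_priv);
	return 0;
}

The dumped copy keeps the configuration but shows a zeroed pointer, which is exactly the effect the usersize additions in the xt_* modules below are after.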
*/ - if (skb->nfct != NULL) + if (skb->_nfct != 0) return XT_CONTINUE; /* special case the untracked ct : we want the percpu object */ if (!ct) ct = nf_ct_untracked_get(); atomic_inc(&ct->ct_general.use); - skb->nfct = &ct->ct_general; - skb->nfctinfo = IP_CT_NEW; + nf_ct_set(skb, ct, IP_CT_NEW); return XT_CONTINUE; } @@ -373,6 +372,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .name = "CT", .family = NFPROTO_UNSPEC, .targetsize = sizeof(struct xt_ct_target_info), + .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v0, .destroy = xt_ct_tg_destroy_v0, .target = xt_ct_target_v0, @@ -384,6 +384,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_UNSPEC, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -395,6 +396,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_UNSPEC, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -407,12 +409,11 @@ static unsigned int notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) { /* Previously seen (loopback)? Ignore. */ - if (skb->nfct != NULL) + if (skb->_nfct != 0) return XT_CONTINUE; - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); + nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); + nf_conntrack_get(skb_nfct(skb)); return XT_CONTINUE; } diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 91a373a3f534..498b54fd04d7 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -162,6 +162,7 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = { .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 1c57ace75ae6..86b0580b2216 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -133,6 +133,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), + .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, @@ -144,6 +145,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), + .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 2dedaa23ab0a..38986a95216c 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -110,6 +110,7 @@ static struct xt_match bpf_mt_reg[] __read_mostly = { .match = bpf_mt, .destroy = bpf_mt_destroy, .matchsize = sizeof(struct xt_bpf_info), + .usersize = offsetof(struct xt_bpf_info, filter), .me = THIS_MODULE, }, { @@ -120,6 +121,7 @@ static struct xt_match bpf_mt_reg[] __read_mostly = { .match = bpf_mt_v1, .destroy = bpf_mt_destroy_v1, .matchsize = sizeof(struct xt_bpf_info_v1), + .usersize = offsetof(struct xt_bpf_info_v1, filter), .me = THIS_MODULE, }, }; diff --git 
a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index a086a914865f..1db1ce59079f 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -122,6 +122,7 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = { .checkentry = cgroup_mt_check_v1, .match = cgroup_mt_v1, .matchsize = sizeof(struct xt_cgroup_info_v1), + .usersize = offsetof(struct xt_cgroup_info_v1, priv), .destroy = cgroup_mt_destroy_v1, .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 2aff2b7c4689..b8fd4ab762ed 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -218,7 +218,7 @@ count_tree(struct net *net, struct rb_root *root, int diff; bool addit; - rbconn = container_of(*rbnode, struct xt_connlimit_rb, node); + rbconn = rb_entry(*rbnode, struct xt_connlimit_rb, node); parent = *rbnode; diff = same_source_net(addr, mask, &rbconn->addr, family); @@ -398,7 +398,7 @@ static void destroy_tree(struct rb_root *r) struct rb_node *node; while ((node = rb_first(r)) != NULL) { - rbconn = container_of(node, struct xt_connlimit_rb, node); + rbconn = rb_entry(node, struct xt_connlimit_rb, node); rb_erase(node, r); @@ -431,6 +431,7 @@ static struct xt_match connlimit_mt_reg __read_mostly = { .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), + .usersize = offsetof(struct xt_connlimit_info, data), .destroy = connlimit_mt_destroy, .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 10063408141d..2a6dfe8b74d3 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -463,23 +463,16 @@ static u32 xt_hashlimit_len_to_chunks(u32 len) /* Precision saver. */ static u64 user2credits(u64 user, int revision) { - if (revision == 1) { - /* If multiplying would overflow... */ - if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1)) - /* Divide first. */ - return div64_u64(user, XT_HASHLIMIT_SCALE) - * HZ * CREDITS_PER_JIFFY_v1; - - return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1, - XT_HASHLIMIT_SCALE); - } else { - if (user > 0xFFFFFFFFFFFFFFFFULL / (HZ*CREDITS_PER_JIFFY)) - return div64_u64(user, XT_HASHLIMIT_SCALE_v2) - * HZ * CREDITS_PER_JIFFY; + u64 scale = (revision == 1) ? + XT_HASHLIMIT_SCALE : XT_HASHLIMIT_SCALE_v2; + u64 cpj = (revision == 1) ? 
+ CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY; - return div64_u64(user * HZ * CREDITS_PER_JIFFY, - XT_HASHLIMIT_SCALE_v2); - } + /* Avoid overflow: divide the constant operands first */ + if (scale >= HZ * cpj) + return div64_u64(user, div64_u64(scale, HZ * cpj)); + + return user * div64_u64(HZ * cpj, scale); } static u32 user2credits_byte(u32 user) @@ -838,6 +831,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = hashlimit_mt_v1, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo), .checkentry = hashlimit_mt_check_v1, .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, @@ -848,6 +842,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo2), + .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, @@ -859,6 +854,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = hashlimit_mt_v1, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo), .checkentry = hashlimit_mt_check_v1, .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, @@ -869,6 +865,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo2), + .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index bef850596558..dab962df1787 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -192,6 +192,8 @@ static struct xt_match limit_mt_reg __read_mostly = { .compatsize = sizeof(struct compat_xt_rateinfo), .compat_from_user = limit_mt_compat_from_user, .compat_to_user = limit_mt_compat_to_user, +#else + .usersize = offsetof(struct xt_rateinfo, prev), #endif .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 57efb703ff18..1ef99151b3ba 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -33,8 +33,7 @@ pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par) else if (xt_family(par) == NFPROTO_IPV4 && ipv4_is_multicast(ip_hdr(skb)->daddr)) type = PACKET_MULTICAST; - else if (xt_family(par) == NFPROTO_IPV6 && - ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + else if (xt_family(par) == NFPROTO_IPV6) type = PACKET_MULTICAST; else type = PACKET_BROADCAST; diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 44c8eb4c9d66..10d61a6eed71 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -73,6 +73,7 @@ static struct xt_match quota_mt_reg __read_mostly = { .checkentry = quota_mt_check, .destroy = quota_mt_destroy, .matchsize = sizeof(struct xt_quota_info), + .usersize = offsetof(struct xt_quota_info, master), .me = THIS_MODULE, }; diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 1db02f6fca54..755d2f6693a2 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -133,6 +133,7 @@ static struct xt_match xt_rateest_mt_reg __read_mostly = { .checkentry = xt_rateest_mt_checkentry, .destroy = xt_rateest_mt_destroy, .matchsize = sizeof(struct xt_rateest_match_info), + .usersize = offsetof(struct xt_rateest_match_info, est1), .me = THIS_MODULE, 
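Aside on the hashlimit user2credits() rewrite above: the target value is user * HZ * CREDITS_PER_JIFFY / SCALE, and overflow is avoided by folding the constant operands first, dividing user by SCALE/(HZ*CPJ) when SCALE is the larger constant and multiplying by (HZ*CPJ)/SCALE otherwise. A sketch of the same divide-first idea with illustrative constants (the real HZ, CREDITS_PER_JIFFY{,_v1} and XT_HASHLIMIT_SCALE{,_v2} values differ per configuration and revision):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define HZ 250	/* illustrative only */

static uint64_t user2credits(uint64_t user, uint64_t cpj, uint64_t scale)
{
	/* Fold the constant factors first so user * HZ * cpj never overflows. */
	if (scale >= HZ * cpj)
		return user / (scale / (HZ * cpj));
	return user * ((HZ * cpj) / scale);
}

int main(void)
{
	/* Divide-first branch: no 64-bit overflow even for a huge user value. */
	printf("%" PRIu64 "\n", user2credits(UINT64_MAX / 4, 16, 1ULL << 32));
	/* Multiply branch: the constant factor has already been pre-divided. */
	printf("%" PRIu64 "\n", user2credits(1000, 1ULL << 20, 10000));
	return 0;
}

As in the kernel version, some precision is traded away by the integer division of the constants, which is the "precision saver" compromise the original comment refers to.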
}; diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 0bc3460319c8..423293ee57c2 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -77,6 +77,7 @@ static struct xt_match xt_string_mt_reg __read_mostly = { .match = string_mt, .destroy = string_mt_destroy, .matchsize = sizeof(struct xt_string_info), + .usersize = offsetof(struct xt_string_info, config), .me = THIS_MODULE, }; diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 28c56b95fb7f..ea7c67050792 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1502,10 +1502,7 @@ static int __init netlbl_init(void) printk(KERN_INFO "NetLabel: Initializing\n"); printk(KERN_INFO "NetLabel: domain hash size = %u\n", (1 << NETLBL_DOMHSH_BITSIZE)); - printk(KERN_INFO "NetLabel: protocols =" - " UNLABELED" - " CIPSOv4" - "\n"); + printk(KERN_INFO "NetLabel: protocols = UNLABELED CIPSOv4 CALIPSO\n"); ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE); if (ret_val != 0) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 161b628ab2b0..7b73c7c161a9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1210,9 +1210,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) skb = nskb; } - if (!pskb_expand_head(skb, 0, -delta, allocation)) - skb->truesize -= delta; - + pskb_expand_head(skb, 0, -delta, + (allocation & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOWARN | __GFP_NORETRY); return skb; } diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 514f7bcf7c63..c82301ce3fff 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -796,9 +796,8 @@ static void ovs_fragment(struct net *net, struct vport *vport, unsigned long orig_dst; struct rt6_info ovs_rt; - if (!v6ops) { + if (!v6ops) goto err; - } prepare_frag(vport, skb, orig_network_offset, ovs_key_mac_proto(key)); @@ -1074,6 +1073,8 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_ZONE: case OVS_KEY_ATTR_CT_MARK: case OVS_KEY_ATTR_CT_LABELS: + case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4: + case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6: err = -EINVAL; break; } @@ -1141,12 +1142,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len) { - /* Every output action needs a separate clone of 'skb', but the common - * case is just a single output action, so that doing a clone and - * then freeing the original skbuff is wasteful. So the following code - * is slightly obscure just to avoid that. - */ - int prev_port = -1; const struct nlattr *a; int rem; @@ -1154,20 +1149,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; - if (unlikely(prev_port != -1)) { - struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); - - if (out_skb) - do_output(dp, out_skb, prev_port, key); + switch (nla_type(a)) { + case OVS_ACTION_ATTR_OUTPUT: { + int port = nla_get_u32(a); + struct sk_buff *clone; + + /* Every output action needs a separate clone + * of 'skb', In case the output action is the + * last action, cloning can be avoided. + */ + if (nla_is_last(a, rem)) { + do_output(dp, skb, port, key); + /* 'skb' has been used for output. 
+ */ + return 0; + } + clone = skb_clone(skb, GFP_ATOMIC); + if (clone) + do_output(dp, clone, port, key); OVS_CB(skb)->cutlen = 0; - prev_port = -1; - } - - switch (nla_type(a)) { - case OVS_ACTION_ATTR_OUTPUT: - prev_port = nla_get_u32(a); break; + } case OVS_ACTION_ATTR_TRUNC: { struct ovs_action_trunc *trunc = nla_data(a); @@ -1257,11 +1260,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } } - if (prev_port != -1) - do_output(dp, skb, prev_port, key); - else - consume_skb(skb); - + consume_skb(skb); return 0; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 6b78bab27755..e0a87776a010 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -65,6 +65,7 @@ struct ovs_conntrack_info { struct nf_conn *ct; u8 commit : 1; u8 nat : 3; /* enum ovs_ct_nat */ + u8 force : 1; u16 family; struct md_mark mark; struct md_labels labels; @@ -73,6 +74,8 @@ struct ovs_conntrack_info { #endif }; +static bool labels_nonzero(const struct ovs_key_ct_labels *labels); + static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); static u16 key_to_nfproto(const struct sw_flow_key *key) @@ -129,21 +132,33 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct) #endif } +/* Guard against conntrack labels max size shrinking below 128 bits. */ +#if NF_CT_LABELS_MAX_SIZE < 16 +#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes +#endif + static void ovs_ct_get_labels(const struct nf_conn *ct, struct ovs_key_ct_labels *labels) { struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; - if (cl) { - size_t len = sizeof(cl->bits); + if (cl) + memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); + else + memset(labels, 0, OVS_CT_LABELS_LEN); +} - if (len > OVS_CT_LABELS_LEN) - len = OVS_CT_LABELS_LEN; - else if (len < OVS_CT_LABELS_LEN) - memset(labels, 0, OVS_CT_LABELS_LEN); - memcpy(labels, cl->bits, len); +static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key, + const struct nf_conntrack_tuple *orig, + u8 icmp_proto) +{ + key->ct_orig_proto = orig->dst.protonum; + if (orig->dst.protonum == icmp_proto) { + key->ct.orig_tp.src = htons(orig->dst.u.icmp.type); + key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code); } else { - memset(labels, 0, OVS_CT_LABELS_LEN); + key->ct.orig_tp.src = orig->src.u.all; + key->ct.orig_tp.dst = orig->dst.u.all; } } @@ -151,13 +166,42 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, const struct nf_conntrack_zone *zone, const struct nf_conn *ct) { - key->ct.state = state; - key->ct.zone = zone->id; + key->ct_state = state; + key->ct_zone = zone->id; key->ct.mark = ovs_ct_get_mark(ct); ovs_ct_get_labels(ct, &key->ct.labels); + + if (ct) { + const struct nf_conntrack_tuple *orig; + + /* Use the master if we have one. */ + if (ct->master) + ct = ct->master; + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + + /* IP version must match with the master connection. */ + if (key->eth.type == htons(ETH_P_IP) && + nf_ct_l3num(ct) == NFPROTO_IPV4) { + key->ipv4.ct_orig.src = orig->src.u3.ip; + key->ipv4.ct_orig.dst = orig->dst.u3.ip; + __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP); + return; + } else if (key->eth.type == htons(ETH_P_IPV6) && + !sw_flow_key_is_nd(key) && + nf_ct_l3num(ct) == NFPROTO_IPV6) { + key->ipv6.ct_orig.src = orig->src.u3.in6; + key->ipv6.ct_orig.dst = orig->dst.u3.in6; + __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP); + return; + } + } + /* Clear 'ct_orig_proto' to mark the non-existence of conntrack + * original direction key fields. 
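Aside on the do_execute_actions() rework above: the prev_port bookkeeping is replaced with a direct rule, namely that every OVS_ACTION_ATTR_OUTPUT clones the skb except when the output is the last action in the list, in which case the original is handed over and the loop returns. A small ownership sketch of that pattern over a plain array of output "actions" (heap buffers stand in for skbs, integers for vports; this simplification ignores non-output actions):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for an skb: a heap buffer that must be freed exactly once. */
static char *clone_buf(const char *buf) { return strdup(buf); }

static void do_output(char *buf, int port)
{
	printf("port %d <- %s\n", port, buf);
	free(buf);			/* output consumes the buffer */
}

static void execute_outputs(char *buf, const int *ports, int n)
{
	for (int i = 0; i < n; i++) {
		if (i == n - 1) {	/* last action: hand over the original */
			do_output(buf, ports[i]);
			return;
		}
		char *clone = clone_buf(buf);
		if (clone)		/* earlier outputs work on clones */
			do_output(clone, ports[i]);
	}
	free(buf);			/* no output action consumed it */
}

int main(void)
{
	int ports[] = { 1, 2, 3 };

	execute_outputs(strdup("packet"), ports, 3);
	return 0;
}

The common single-output case therefore needs no clone at all, which is the optimisation the removed "slightly obscure" comment used to describe.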
+ */ + key->ct_orig_proto = 0; } -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has +/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has * previously sent the packet to conntrack via the ct action. If * 'keep_nat_flags' is true, the existing NAT flags retained, else they are * initialized from the connection status. @@ -184,7 +228,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb, if (ct->master) state |= OVS_CS_F_RELATED; if (keep_nat_flags) { - state |= key->ct.state & OVS_CS_F_NAT_MASK; + state |= key->ct_state & OVS_CS_F_NAT_MASK; } else { if (ct->status & IPS_SRC_NAT) state |= OVS_CS_F_SRC_NAT; @@ -208,44 +252,69 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) ovs_ct_update_key(skb, NULL, key, false, false); } -int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +#define IN6_ADDR_INITIALIZER(ADDR) \ + { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \ + (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] } + +int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { - if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && - nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && - nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), - &key->ct.labels)) + nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels), + &output->ct.labels)) return -EMSGSIZE; + if (swkey->ct_orig_proto) { + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ct_tuple_ipv4 orig = { + output->ipv4.ct_orig.src, + output->ipv4.ct_orig.dst, + output->ct.orig_tp.src, + output->ct.orig_tp.dst, + output->ct_orig_proto, + }; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, + sizeof(orig), &orig)) + return -EMSGSIZE; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ct_tuple_ipv6 orig = { + IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src), + IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst), + output->ct.orig_tp.src, + output->ct.orig_tp.dst, + output->ct_orig_proto, + }; + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, + sizeof(orig), &orig)) + return -EMSGSIZE; + } + } + return 0; } -static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, +static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key, u32 ct_mark, u32 mask) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; u32 new_mark; - /* The connection could be invalid, in which case set_mark is no-op. 
*/ - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return 0; - new_mark = ct_mark | (ct->mark & ~(mask)); if (ct->mark != new_mark) { ct->mark = new_mark; - nf_conntrack_event_cache(IPCT_MARK, ct); + if (nf_ct_is_confirmed(ct)) + nf_conntrack_event_cache(IPCT_MARK, ct); key->ct.mark = new_mark; } @@ -255,34 +324,83 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, #endif } -static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_ct_labels *labels, - const struct ovs_key_ct_labels *mask) +static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct) { - enum ip_conntrack_info ctinfo; struct nf_conn_labels *cl; - struct nf_conn *ct; - int err; - - /* The connection could be invalid, in which case set_label is no-op.*/ - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return 0; cl = nf_ct_labels_find(ct); if (!cl) { nf_ct_labels_ext_add(ct); cl = nf_ct_labels_find(ct); } - if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) + + return cl; +} + +/* Initialize labels for a new, yet to be committed conntrack entry. Note that + * since the new connection is not yet confirmed, and thus no-one else has + * access to it's labels, we simply write them over. + */ +static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + struct nf_conn_labels *cl, *master_cl; + bool have_mask = labels_nonzero(mask); + + /* Inherit master's labels to the related connection? */ + master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL; + + if (!master_cl && !have_mask) + return 0; /* Nothing to do. */ + + cl = ovs_ct_get_conn_labels(ct); + if (!cl) + return -ENOSPC; + + /* Inherit the master's labels, if any. */ + if (master_cl) + *cl = *master_cl; + + if (have_mask) { + u32 *dst = (u32 *)cl->bits; + int i; + + for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) + dst[i] = (dst[i] & ~mask->ct_labels_32[i]) | + (labels->ct_labels_32[i] + & mask->ct_labels_32[i]); + } + + /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the + * IPCT_LABEL bit it set in the event cache. + */ + nf_conntrack_event_cache(IPCT_LABEL, ct); + + memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); + + return 0; +} + +static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + struct nf_conn_labels *cl; + int err; + + cl = ovs_ct_get_conn_labels(ct); + if (!cl) return -ENOSPC; - err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, - OVS_CT_LABELS_LEN / sizeof(u32)); + err = nf_connlabels_replace(ct, labels->ct_labels_32, + mask->ct_labels_32, + OVS_CT_LABELS_LEN_32); if (err) return err; - ovs_ct_get_labels(ct, &key->ct.labels); + memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); + return 0; } @@ -367,7 +485,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, } else if (key->eth.type == htons(ETH_P_IPV6)) { enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - skb_orphan(skb); memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); err = nf_ct_frag6_gather(net, skb, user); if (err) { @@ -421,16 +538,16 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) /* Find an existing connection which this packet belongs to without * re-attributing statistics or modifying the connection state. This allows an - * skb->nfct lost due to an upcall to be recovered during actions execution. 
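Aside on the labels handling above: ovs_ct_init_labels() writes labels directly for a not-yet-confirmed connection, optionally inheriting the master's labels first, while confirmed connections go through the masked nf_connlabels_replace() path; either way the per-word operation is dst = (dst & ~mask) | (value & mask) over the 128-bit label area. A sketch of that masked merge over the four 32-bit words the OVS key uses (function names are mine):

#include <stdint.h>
#include <stdio.h>

#define LABEL_WORDS 4			/* 128-bit conntrack labels as 4 x u32 */

static void labels_apply(uint32_t *dst, const uint32_t *value,
			 const uint32_t *mask)
{
	for (int i = 0; i < LABEL_WORDS; i++)
		dst[i] = (dst[i] & ~mask[i]) | (value[i] & mask[i]);
}

static int labels_nonzero(const uint32_t *mask)
{
	for (int i = 0; i < LABEL_WORDS; i++)
		if (mask[i])
			return 1;
	return 0;
}

int main(void)
{
	uint32_t labels[LABEL_WORDS] = { 0xdeadbeef, 0, 0, 0 }; /* inherited */
	uint32_t value[LABEL_WORDS]  = { 0x00000055, 0, 0, 1 };
	uint32_t mask[LABEL_WORDS]   = { 0x000000ff, 0, 0, 1 };

	if (labels_nonzero(mask))	/* an all-zero mask means nothing to do */
		labels_apply(labels, value, mask);

	printf("%08x %08x %08x %08x\n",
	       labels[0], labels[1], labels[2], labels[3]);
	return 0;
}

Only the masked bits change (0xdeadbeef becomes 0xdeadbe55 here), which is why a related connection can inherit its master's labels and still have the CT action overwrite selected bits in the same commit.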
+ * skb->_nfct lost due to an upcall to be recovered during actions execution. * * Must be called with rcu_read_lock. * - * On success, populates skb->nfct and skb->nfctinfo, and returns the - * connection. Returns NULL if there is no existing entry. + * On success, populates skb->_nfct and returns the connection. Returns NULL + * if there is no existing entry. */ static struct nf_conn * ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, - u8 l3num, struct sk_buff *skb) + u8 l3num, struct sk_buff *skb, bool natted) { struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -453,6 +570,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, return NULL; } + /* Must invert the tuple if skb has been transformed by NAT. */ + if (natted) { + struct nf_conntrack_tuple inverse; + + if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) { + pr_debug("ovs_ct_find_existing: Inversion failed!\n"); + return NULL; + } + tuple = inverse; + } + /* look for tuple match */ h = nf_conntrack_find_get(net, zone, &tuple); if (!h) @@ -460,12 +588,18 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, ct = nf_ct_tuplehash_to_ctrack(h); - skb->nfct = &ct->ct_general; - skb->nfctinfo = ovs_ct_get_info(h); + /* Inverted packet tuple matches the reverse direction conntrack tuple, + * select the other tuplehash to get the right 'ctinfo' bits for this + * packet. + */ + if (natted) + h = &ct->tuplehash[!h->tuple.dst.dir]; + + nf_ct_set(skb, ct, ovs_ct_get_info(h)); return ct; } -/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ +/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ static bool skb_nfct_cached(struct net *net, const struct sw_flow_key *key, const struct ovs_conntrack_info *info, @@ -476,14 +610,19 @@ static bool skb_nfct_cached(struct net *net, ct = nf_ct_get(skb, &ctinfo); /* If no ct, check if we have evidence that an existing conntrack entry - * might be found for this skb. This happens when we lose a skb->nfct + * might be found for this skb. This happens when we lose a skb->_nfct * due to an upcall. If the connection was not confirmed, it is not * cached and needs to be run through conntrack again. */ - if (!ct && key->ct.state & OVS_CS_F_TRACKED && - !(key->ct.state & OVS_CS_F_INVALID) && - key->ct.zone == info->zone.id) - ct = ovs_ct_find_existing(net, &info->zone, info->family, skb); + if (!ct && key->ct_state & OVS_CS_F_TRACKED && + !(key->ct_state & OVS_CS_F_INVALID) && + key->ct_zone == info->zone.id) { + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, + !!(key->ct_state + & OVS_CS_F_NAT_MASK)); + if (ct) + nf_ct_get(skb, &ctinfo); + } if (!ct) return false; if (!net_eq(net, read_pnet(&ct->ct_net))) @@ -497,6 +636,18 @@ static bool skb_nfct_cached(struct net *net, if (help && rcu_access_pointer(help->helper) != info->helper) return false; } + /* Force conntrack entry direction to the current packet? */ + if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + /* Delete the conntrack entry if confirmed, else just release + * the reference. 
+ */ + if (nf_ct_is_confirmed(ct)) + nf_ct_delete(ct, 0, 0); + else + nf_conntrack_put(&ct->ct_general); + nf_ct_set(skb, NULL, 0); + return false; + } return true; } @@ -514,7 +665,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, int hooknum, nh_off, err = NF_ACCEPT; nh_off = skb_network_offset(skb); - skb_pull(skb, nh_off); + skb_pull_rcsum(skb, nh_off); /* See HOOK2MANIP(). */ if (maniptype == NF_NAT_MANIP_SRC) @@ -579,6 +730,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, err = nf_nat_packet(ct, ctinfo, hooknum, skb); push: skb_push(skb, nh_off); + skb_postpush_rcsum(skb, skb->data, nh_off); return err; } @@ -590,7 +742,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key, if (maniptype == NF_NAT_MANIP_SRC) { __be16 src; - key->ct.state |= OVS_CS_F_SRC_NAT; + key->ct_state |= OVS_CS_F_SRC_NAT; if (key->eth.type == htons(ETH_P_IP)) key->ipv4.addr.src = ip_hdr(skb)->saddr; else if (key->eth.type == htons(ETH_P_IPV6)) @@ -612,7 +764,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key, } else { __be16 dst; - key->ct.state |= OVS_CS_F_DST_NAT; + key->ct_state |= OVS_CS_F_DST_NAT; if (key->eth.type == htons(ETH_P_IP)) key->ipv4.addr.dst = ip_hdr(skb)->daddr; else if (key->eth.type == htons(ETH_P_IPV6)) @@ -699,7 +851,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if * not done already. Update key with new CT state after passing the packet * through conntrack. - * Note that if the packet is deemed invalid by conntrack, skb->nfct will be + * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be * set to NULL and 0 will be returned. */ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, @@ -721,11 +873,10 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, /* Associate skb with specified zone. */ if (tmpl) { - if (skb->nfct) - nf_conntrack_put(skb->nfct); + if (skb_nfct(skb)) + nf_conntrack_put(skb_nfct(skb)); nf_conntrack_get(&tmpl->ct_general); - skb->nfct = &tmpl->ct_general; - skb->nfctinfo = IP_CT_NEW; + nf_ct_set(skb, tmpl, IP_CT_NEW); } err = nf_conntrack_in(net, info->family, @@ -737,7 +888,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * NAT after the nf_conntrack_in() call. We can actually clear * the whole state, as it will be re-initialized below. */ - key->ct.state = 0; + key->ct_state = 0; /* Update the key, but keep the NAT flags. */ ovs_ct_update_key(skb, info, key, true, true); @@ -753,9 +904,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * * NAT will be done only if the CT action has NAT, and only * once per packet (per zone), as guarded by the NAT bits in - * the key->ct.state. + * the key->ct_state. 
*/ - if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) && + if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) && (nf_ct_is_confirmed(ct) || info->commit) && ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { return -EINVAL; @@ -819,7 +970,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, if (err) return err; - ct = (struct nf_conn *)skb->nfct; + ct = (struct nf_conn *)skb_nfct(skb); if (ct) nf_ct_deliver_cached_events(ct); } @@ -831,8 +982,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels) { size_t i; - for (i = 0; i < sizeof(*labels); i++) - if (labels->ct_labels[i]) + for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) + if (labels->ct_labels_32[i]) return true; return false; @@ -843,24 +994,36 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; int err; err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; + /* The connection could be invalid, in which case this is a no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + /* Apply changes before confirming the connection so that the initial * conntrack NEW netlink event carries the values given in the CT * action. */ if (info->mark.mask) { - err = ovs_ct_set_mark(skb, key, info->mark.value, + err = ovs_ct_set_mark(ct, key, info->mark.value, info->mark.mask); if (err) return err; } - if (labels_nonzero(&info->labels.mask)) { - err = ovs_ct_set_labels(skb, key, &info->labels.value, + if (!nf_ct_is_confirmed(ct)) { + err = ovs_ct_init_labels(ct, key, &info->labels.value, + &info->labels.mask); + if (err) + return err; + } else if (labels_nonzero(&info->labels.mask)) { + err = ovs_ct_set_labels(ct, key, &info->labels.value, &info->labels.mask); if (err) return err; @@ -886,7 +1049,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, /* The conntrack module expects to be working at L3. */ nh_ofs = skb_network_offset(skb); - skb_pull(skb, nh_ofs); + skb_pull_rcsum(skb, nh_ofs); if (key->ip.frag != OVS_FRAG_TYPE_NONE) { err = handle_fragments(net, key, info->zone.id, skb); @@ -900,6 +1063,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, err = ovs_ct_lookup(net, key, info, skb); skb_push(skb, nh_ofs); + skb_postpush_rcsum(skb, skb->data, nh_ofs); if (err) kfree_skb(skb); return err; @@ -1061,6 +1225,7 @@ static int parse_nat(const struct nlattr *attr, static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, + [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), .maxlen = sizeof(u16) }, [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), @@ -1100,6 +1265,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, } switch (type) { + case OVS_CT_ATTR_FORCE_COMMIT: + info->force = true; + /* fall through. */ case OVS_CT_ATTR_COMMIT: info->commit = true; break; @@ -1326,7 +1494,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, if (!start) return -EMSGSIZE; - if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) + if (ct_info->commit && nla_put_flag(skb, ct_info->force + ? 
OVS_CT_ATTR_FORCE_COMMIT + : OVS_CT_ATTR_COMMIT)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 8f6230bd6183..bc7efd1867ab 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); -int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); +int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb); void ovs_ct_free_action(const struct nlattr *a); #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ @@ -75,13 +76,18 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, static inline void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { - key->ct.state = 0; - key->ct.zone = 0; + key->ct_state = 0; + key->ct_zone = 0; key->ct.mark = 0; memset(&key->ct.labels, 0, sizeof(key->ct.labels)); + /* Clear 'ct_orig_proto' to mark the non-existence of original + * direction key fields. + */ + key->ct_orig_proto = 0; } -static inline int ovs_ct_put_key(const struct sw_flow_key *key, +static inline int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { return 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 2d4c4d3911c0..9c62b6325f7a 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -606,7 +606,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) rcu_assign_pointer(flow->sf_acts, acts); packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; - packet->protocol = flow->key.eth.type; rcu_read_lock(); dp = get_dp_rcu(net, ovs_header->dp_ifindex); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 08aa926cd5cf..9d4bb8eb63f2 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -312,7 +312,8 @@ static bool icmp6hdr_ok(struct sk_buff *skb) * Returns 0 if it encounters a non-vlan or incomplete packet. * Returns 1 after successfully parsing vlan tag. */ -static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) +static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, + bool untag_vlan) { struct vlan_head *vh = (struct vlan_head *)skb->data; @@ -330,7 +331,20 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); key_vh->tpid = vh->tpid; - __skb_pull(skb, sizeof(struct vlan_head)); + if (unlikely(untag_vlan)) { + int offset = skb->data - skb_mac_header(skb); + u16 tci; + int err; + + __skb_push(skb, offset); + err = __skb_vlan_pop(skb, &tci); + __skb_pull(skb, offset); + if (err) + return err; + __vlan_hwaccel_put_tag(skb, key_vh->tpid, tci); + } else { + __skb_pull(skb, sizeof(struct vlan_head)); + } return 1; } @@ -351,13 +365,13 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) key->eth.vlan.tpid = skb->vlan_proto; } else { /* Parse outer vlan tag in the non-accelerated case. */ - res = parse_vlan_tag(skb, &key->eth.vlan); + res = parse_vlan_tag(skb, &key->eth.vlan, true); if (res <= 0) return res; } /* Parse inner vlan tag. 
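Aside on the parse_vlan_tag()/parse_vlan() change above: the outer tag is now popped into the skb's hwaccel field (untag_vlan true for the outer header, false for the inner one), so after parsing, the outer TCI lives in skb metadata while the inner tag is read in place. As a reference for the on-wire layout being walked, here is a standalone sketch that pulls two stacked 802.1ad/802.1Q headers out of a raw frame; it is not the kernel helper, just the usual Ethernet header arithmetic.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct vlan_hdr {			/* mirrors the on-wire vlan_head layout */
	uint16_t tpid;			/* 0x8100 / 0x88a8, network order */
	uint16_t tci;			/* PCP/DEI/VID, network order */
};

static int is_vlan_tpid(uint16_t tpid_host)
{
	return tpid_host == 0x8100 || tpid_host == 0x88a8;
}

int main(void)
{
	/* dst MAC, src MAC, outer 802.1ad tag, inner 802.1Q tag, inner type */
	uint8_t frame[] = {
		0xff,0xff,0xff,0xff,0xff,0xff, 0x02,0x00,0x00,0x00,0x00,0x01,
		0x88,0xa8, 0x00,0x64,	/* outer: TPID 0x88a8, VID 100 */
		0x81,0x00, 0x00,0xc8,	/* inner: TPID 0x8100, VID 200 */
		0x08,0x00,		/* encapsulated EtherType: IPv4 */
	};
	size_t off = 12;		/* skip the two MAC addresses */
	uint16_t etype;

	for (int level = 0; level < 2; level++) {
		struct vlan_hdr vh;

		memcpy(&vh, frame + off, sizeof(vh));
		if (!is_vlan_tpid(ntohs(vh.tpid)))
			break;
		printf("tag %d: tpid=%04x vid=%u\n", level,
		       ntohs(vh.tpid), ntohs(vh.tci) & 0x0fff);
		off += sizeof(vh);
	}
	memcpy(&etype, frame + off, sizeof(etype));
	printf("ethertype=%04x\n", ntohs(etype));
	return 0;
}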
*/ - res = parse_vlan_tag(skb, &key->eth.cvlan); + res = parse_vlan_tag(skb, &key->eth.cvlan, false); if (res <= 0) return res; @@ -751,7 +765,7 @@ static int key_extract_mac_proto(struct sk_buff *skb) int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { - int res; + int res, err; /* Extract metadata from packet. */ if (tun_info) { @@ -778,7 +792,6 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; - ovs_ct_fill_key(skb, key); key->ovs_flow_hash = 0; res = key_extract_mac_proto(skb); if (res < 0) @@ -786,43 +799,54 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->mac_proto = res; key->recirc_id = 0; - return key_extract(skb, key); + err = key_extract(skb, key); + if (!err) + ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */ + return err; } int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log) { + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; int err; + err = parse_flow_nlattrs(attr, a, &attrs, log); + if (err) + return -EINVAL; + /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(net, attr, key, log); + err = ovs_nla_get_flow_metadata(net, a, attrs, key, log); if (err) return err; - if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) { - /* key_extract assumes that skb->protocol is set-up for - * layer 3 packets which is the case for other callers, - * in particular packets recieved from the network stack. - * Here the correct value can be set from the metadata - * extracted above. - */ - skb->protocol = key->eth.type; - } else { - struct ethhdr *eth; + /* key_extract assumes that skb->protocol is set-up for + * layer 3 packets which is the case for other callers, + * in particular packets received from the network stack. + * Here the correct value can be set from the metadata + * extracted above. + * For L2 packet key eth type would be zero. skb protocol + * would be set to correct value later during key-extact. + */ - skb_reset_mac_header(skb); - eth = eth_hdr(skb); + skb->protocol = key->eth.type; + err = key_extract(skb, key); + if (err) + return err; - /* Normally, setting the skb 'protocol' field would be - * handled by a call to eth_type_trans(), but it assumes - * there's a sending device, which we may not have. - */ - if (eth_proto_is_802_3(eth->h_proto)) - skb->protocol = eth->h_proto; - else - skb->protocol = htons(ETH_P_802_2); - } + /* Check that we have conntrack original direction tuple metadata only + * for packets for which it makes sense. Otherwise the key may be + * corrupted due to overlapping key fields. + */ + if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) && + key->eth.type != htons(ETH_P_IP)) + return -EINVAL; + if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) && + (key->eth.type != htons(ETH_P_IPV6) || + sw_flow_key_is_nd(key))) + return -EINVAL; - return key_extract(skb, key); + return 0; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index f61cae7f9030..a9bc1c875965 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2017 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -85,6 +85,11 @@ struct sw_flow_key { struct vlan_head cvlan; __be16 type; /* Ethernet frame type. */ } eth; + /* Filling a hole of two bytes. */ + u8 ct_state; + u8 ct_orig_proto; /* CT original direction tuple IP + * protocol. + */ union { struct { __be32 top_lse; /* top label stack entry */ @@ -96,6 +101,7 @@ struct sw_flow_key { u8 frag; /* One of OVS_FRAG_TYPE_*. */ } ip; }; + u16 ct_zone; /* Conntrack zone. */ struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. */ @@ -107,10 +113,16 @@ struct sw_flow_key { __be32 src; /* IP source address. */ __be32 dst; /* IP destination address. */ } addr; - struct { - u8 sha[ETH_ALEN]; /* ARP source hardware address. */ - u8 tha[ETH_ALEN]; /* ARP target hardware address. */ - } arp; + union { + struct { + __be32 src; + __be32 dst; + } ct_orig; /* Conntrack original direction fields. */ + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. */ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; + }; } ipv4; struct { struct { @@ -118,23 +130,40 @@ struct sw_flow_key { struct in6_addr dst; /* IPv6 destination address. */ } addr; __be32 label; /* IPv6 flow label. */ - struct { - struct in6_addr target; /* ND target address. */ - u8 sll[ETH_ALEN]; /* ND source link layer address. */ - u8 tll[ETH_ALEN]; /* ND target link layer address. */ - } nd; + union { + struct { + struct in6_addr src; + struct in6_addr dst; + } ct_orig; /* Conntrack original direction fields. */ + struct { + struct in6_addr target; /* ND target address. */ + u8 sll[ETH_ALEN]; /* ND source link layer address. */ + u8 tll[ETH_ALEN]; /* ND target link layer address. */ + } nd; + }; } ipv6; }; struct { - /* Connection tracking fields. */ - u16 zone; + /* Connection tracking fields not packed above. */ + struct { + __be16 src; /* CT orig tuple tp src port. */ + __be16 dst; /* CT orig tuple tp dst port. */ + } orig_tp; u32 mark; - u8 state; struct ovs_key_ct_labels labels; } ct; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ +static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key) +{ + return key->eth.type == htons(ETH_P_IPV6) && + key->ip.proto == NEXTHDR_ICMP && + key->tp.dst == 0 && + (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)); +} + struct sw_flow_key_range { unsigned short int start; unsigned short int end; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c87d359b9b37..6f5fa50f716d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -129,7 +129,9 @@ static bool match_validate(const struct sw_flow_match *match, /* The following mask attributes allowed only if they * pass the validation tests. 
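The sw_flow_key changes above place the conntrack original-direction addresses in unions with the ARP (IPv4) and ND (IPv6) fields; that is why sw_flow_key_is_nd() exists and why the validation code here and in ovs_flow_key_extract_userspace() refuses an original-tuple match on ND packets: the two interpretations share the same storage. A toy illustration of that kind of overlap (field names and sizes are illustrative, not the exact kernel layout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for the IPv6 part of the flow key. */
struct demo_key_ipv6 {
	union {
		struct {
			uint8_t src[16];	/* CT original direction source */
			uint8_t dst[16];	/* CT original direction destination */
		} ct_orig;
		struct {
			uint8_t target[16];	/* ND target address */
			uint8_t sll[6];		/* ND source link-layer address */
			uint8_t tll[6];		/* ND target link-layer address */
		} nd;
	};
};

int main(void)
{
	struct demo_key_ipv6 key;

	memset(&key, 0, sizeof(key));
	memset(key.nd.target, 0xaa, sizeof(key.nd.target));

	/* Writing the ND target clobbers ct_orig.src: same underlying bytes. */
	printf("ct_orig.src[0]=0x%02x (key size %zu)\n",
	       key.ct_orig.src[0], sizeof(key));
	return 0;
}

Allowing both matches at once would let one attribute silently corrupt the other's key bytes, hence the mask_allowed exclusion in match_validate() below.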
*/ mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) | (1 << OVS_KEY_ATTR_TCP) | (1 << OVS_KEY_ATTR_TCP_FLAGS) | (1 << OVS_KEY_ATTR_UDP) @@ -161,8 +163,10 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->eth.type == htons(ETH_P_IP)) { key_expected |= 1 << OVS_KEY_ATTR_IPV4; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + if (match->mask && match->mask->key.eth.type == htons(0xffff)) { mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4; + } if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { if (match->key->ip.proto == IPPROTO_UDP) { @@ -196,8 +200,10 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->eth.type == htons(ETH_P_IPV6)) { key_expected |= 1 << OVS_KEY_ATTR_IPV6; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + if (match->mask && match->mask->key.eth.type == htons(0xffff)) { mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6; + } if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { if (match->key->ip.proto == IPPROTO_UDP) { @@ -230,6 +236,12 @@ static bool match_validate(const struct sw_flow_match *match, htons(NDISC_NEIGHBOUR_SOLICITATION) || match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { key_expected |= 1 << OVS_KEY_ATTR_ND; + /* Original direction conntrack tuple + * uses the same space as the ND fields + * in the key, so both are not allowed + * at the same time. + */ + mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); if (match->mask && (match->mask->key.tp.src == htons(0xff))) mask_allowed |= 1 << OVS_KEY_ATTR_ND; } @@ -282,7 +294,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. 
*/ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -295,6 +307,7 @@ size_t ovs_key_attr_size(void) + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -355,6 +368,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, + [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = { + .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, + [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { + .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, }; static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) @@ -430,9 +447,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr, return __parse_flow_nlattrs(attr, a, attrsp, log, true); } -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp, - bool log) +int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], + u64 *attrsp, bool log) { return __parse_flow_nlattrs(attr, a, attrsp, log, false); } @@ -1056,14 +1072,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return -EINVAL; } - SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); + SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); } if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); - SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); + SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); } if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && @@ -1082,6 +1098,34 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, sizeof(*cl), is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); } + if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) { + const struct ovs_key_ct_tuple_ipv4 *ct; + + ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]); + + SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); + SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4); + } + if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) { + const struct ovs_key_ct_tuple_ipv6 *ct; + + ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]); + + SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src, + sizeof(match->key->ipv6.ct_orig.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst, + sizeof(match->key->ipv6.ct_orig.dst), + is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); + SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); + } /* For layer 3 packets the Ethernet type is 
provided * and treated as metadata but no MAC addresses are provided. @@ -1493,9 +1537,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) /** * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. - * @key: Receives extracted in_port, priority, tun_key and skb_mark. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. + * @net: Network namespace. + * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack + * metadata. + * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink + * attributes. + * @attrs: Bit mask for the netlink attributes included in @a. * @log: Boolean to allow kernel error logging. Normally true, but when * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. @@ -1504,25 +1551,26 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) * take the same form accepted by flow_from_nlattrs(), but only enough of it to * get the metadata, that is, the parts of the flow key that cannot be * extracted from the packet itself. + * + * This must be called before the packet key fields are filled in 'key'. */ -int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, - struct sw_flow_key *key, - bool log) +int ovs_nla_get_flow_metadata(struct net *net, + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], + u64 attrs, struct sw_flow_key *key, bool log) { - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; struct sw_flow_match match; - u64 attrs = 0; - int err; - - err = parse_flow_nlattrs(attr, a, &attrs, log); - if (err) - return -EINVAL; memset(&match, 0, sizeof(match)); match.key = key; + key->ct_state = 0; + key->ct_zone = 0; + key->ct_orig_proto = 0; memset(&key->ct, 0, sizeof(key->ct)); + memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig)); + memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig)); + key->phy.in_port = DP_MAX_PORTS; return metadata_from_nlattrs(net, &match, &attrs, a, false, log); @@ -1584,7 +1632,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; - if (ovs_ct_put_key(output, skb)) + if (ovs_ct_put_key(swkey, output, skb)) goto nla_put_failure; if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) { diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 45f9769e5aac..929c665ac3aa 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, - struct sw_flow_key *, bool log); +int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], + u64 *attrsp, bool log); +int ovs_nla_get_flow_metadata(struct net *net, + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], + u64 attrs, struct sw_flow_key *key, bool log); int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index d5d6caecd072..89193a634da4 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -97,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev) free_netdev(dev); } -static struct 
rtnl_link_stats64 * +static void internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) { int i; @@ -125,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->tx_bytes += local_stats.tx_bytes; stats->tx_packets += local_stats.tx_packets; } - - return stats; } static void internal_set_rx_headroom(struct net_device *dev, int new_hr) @@ -151,6 +149,8 @@ static void do_setup(struct net_device *netdev) { ether_setup(netdev); + netdev->max_mtu = ETH_MAX_MTU; + netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b9e1a13b4ba3..a0dbe7ca8f72 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: + h.h3->tp_status = status; + flush_dcache_page(pgv_to_page(&h.h3->tp_status)); + break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame) flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return h.h2->tp_status; case TPACKET_V3: + flush_dcache_page(pgv_to_page(&h.h3->tp_status)); + return h.h3->tp_status; default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -476,6 +481,9 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: + h.h3->tp_sec = ts.tv_sec; + h.h3->tp_nsec = ts.tv_nsec; + break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -1497,6 +1505,8 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po) f->arr[f->num_members] = sk; smp_wmb(); f->num_members++; + if (f->num_members == 1) + dev_add_pack(&f->prot_hook); spin_unlock(&f->lock); } @@ -1513,6 +1523,8 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) BUG_ON(i >= f->num_members); f->arr[i] = f->arr[f->num_members - 1]; f->num_members--; + if (f->num_members == 0) + __dev_remove_pack(&f->prot_hook); spin_unlock(&f->lock); } @@ -1619,6 +1631,7 @@ static void fanout_release_data(struct packet_fanout *f) static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { + struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f, *match; u8 type = type_flags & 0xff; @@ -1641,23 +1654,28 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) return -EINVAL; } + mutex_lock(&fanout_mutex); + + err = -EINVAL; if (!po->running) - return -EINVAL; + goto out; + err = -EALREADY; if (po->fanout) - return -EALREADY; + goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { - po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); - if (!po->rollover) - return -ENOMEM; - atomic_long_set(&po->rollover->num, 0); - atomic_long_set(&po->rollover->num_huge, 0); - atomic_long_set(&po->rollover->num_failed, 0); + err = -ENOMEM; + rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); + if (!rollover) + goto out; + atomic_long_set(&rollover->num, 0); + atomic_long_set(&rollover->num_huge, 0); + atomic_long_set(&rollover->num_failed, 0); + po->rollover = rollover; } - mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && @@ -1687,7 +1705,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.func = packet_rcv_fanout; 
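The af_packet hunks above and below also switch the mmap transmit path over to TPACKET_V3: tpacket_parse_header() learns to read tp_len/tp_mac/tp_net from tpacket3_hdr (rejecting frames with a non-zero tp_next_offset), and packet_set_ring() stops refusing PACKET_TX_RING for TPACKET_V3 as long as the req3 block-timeout, private-area and feature-request fields are left at zero. A hypothetical userspace sketch of configuring such a ring follows; the block and frame sizes are arbitrary illustration values, and interface binding, frame filling and the final send() are omitted:

#include <linux/if_packet.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <string.h>

/* Configure a TPACKET_V3 transmit ring on an already-open AF_PACKET socket.
 * Returns the mapped ring, or MAP_FAILED on error.
 */
static void *setup_v3_tx_ring(int fd, struct tpacket_req3 *req)
{
	int ver = TPACKET_V3;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)))
		return MAP_FAILED;

	memset(req, 0, sizeof(*req));
	req->tp_block_size = 1 << 16;
	req->tp_block_nr   = 4;
	req->tp_frame_size = 1 << 11;
	req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size) *
			     req->tp_block_nr;
	/* tp_retire_blk_tov, tp_sizeof_priv and tp_feature_req_word stay 0;
	 * the kernel now rejects anything else for a TX ring.
	 */
	if (setsockopt(fd, SOL_PACKET, PACKET_TX_RING, req, sizeof(*req)))
		return MAP_FAILED;

	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}

Each transmit slot still begins with a struct tpacket3_hdr whose tp_len the application fills and whose tp_status it flips to TP_STATUS_SEND_REQUEST before calling send().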
match->prot_hook.af_packet_priv = match; match->prot_hook.id_match = match_fanout_group; - dev_add_pack(&match->prot_hook); list_add(&match->list, &fanout_list); } err = -EINVAL; @@ -1704,36 +1721,40 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } } out: - mutex_unlock(&fanout_mutex); - if (err) { - kfree(po->rollover); + if (err && rollover) { + kfree(rollover); po->rollover = NULL; } + mutex_unlock(&fanout_mutex); return err; } -static void fanout_release(struct sock *sk) +/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes + * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. + * It is the responsibility of the caller to call fanout_release_data() and + * free the returned packet_fanout (after synchronize_net()) + */ +static struct packet_fanout *fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; + mutex_lock(&fanout_mutex); f = po->fanout; - if (!f) - return; + if (f) { + po->fanout = NULL; - mutex_lock(&fanout_mutex); - po->fanout = NULL; + if (atomic_dec_and_test(&f->sk_ref)) + list_del(&f->list); + else + f = NULL; - if (atomic_dec_and_test(&f->sk_ref)) { - list_del(&f->list); - dev_remove_pack(&f->prot_hook); - fanout_release_data(f); - kfree(f); + if (po->rollover) + kfree_rcu(po->rollover, rcu); } mutex_unlock(&fanout_mutex); - if (po->rollover) - kfree_rcu(po->rollover, rcu); + return f; } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, @@ -1976,7 +1997,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, return -EINVAL; *len -= sizeof(vnet_hdr); - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le())) + if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); @@ -2237,7 +2258,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (po->has_vnet_hdr) { if (virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), - vio_le())) { + vio_le(), true)) { spin_lock(&sk->sk_receive_queue.lock); goto drop_n_account; } @@ -2497,6 +2518,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, ph.raw = frame; switch (po->tp_version) { + case TPACKET_V3: + if (ph.h3->tp_next_offset != 0) { + pr_warn_once("variable sized slot not supported"); + return -EINVAL; + } + tp_len = ph.h3->tp_len; + break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; @@ -2516,6 +2544,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { + case TPACKET_V3: + off = ph.h3->tp_net; + break; case TPACKET_V2: off = ph.h2->tp_net; break; @@ -2525,6 +2556,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, } } else { switch (po->tp_version) { + case TPACKET_V3: + off = ph.h3->tp_mac; + break; case TPACKET_V2: off = ph.h2->tp_mac; break; @@ -2755,7 +2789,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); - int hlen, tlen; + int hlen, tlen, linear; int extra_len = 0; /* @@ -2816,8 +2850,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; - skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, - __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len), + linear = __virtio16_to_cpu(vio_le(), 
vnet_hdr.hdr_len); + linear = max(linear, min_t(int, len, dev->hard_header_len)); + skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; @@ -2906,6 +2941,7 @@ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; + struct packet_fanout *f; struct net *net; union tpacket_req_u req_u; @@ -2945,9 +2981,14 @@ static int packet_release(struct socket *sock) packet_set_ring(sk, &req_u, 1, 1); } - fanout_release(sk); + f = fanout_release(sk); synchronize_net(); + + if (f) { + fanout_release_data(f); + kfree(f); + } /* * Now the socket is dead. No more input will appear. */ @@ -3062,7 +3103,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[15]; + char name[sizeof(uaddr->sa_data) + 1]; /* * Check legality @@ -3070,7 +3111,11 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, if (addr_len != sizeof(struct sockaddr)) return -EINVAL; - strlcpy(name, uaddr->sa_data, sizeof(name)); + /* uaddr->sa_data comes from the userspace, it's not guaranteed to be + * zero-terminated. + */ + memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); + name[sizeof(uaddr->sa_data)] = 0; return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } @@ -3899,7 +3944,6 @@ static int packet_notifier(struct notifier_block *this, } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); - fanout_release(sk); po->ifindex = -1; if (po->prot_hook.dev) dev_put(po->prot_hook.dev); @@ -4113,11 +4157,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, struct tpacket_req *req = &req_u->req; lock_sock(sk); - /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ - if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { - net_warn_ratelimited("Tx-ring is not supported.\n"); - goto out; - } rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; @@ -4177,11 +4216,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; switch (po->tp_version) { case TPACKET_V3: - /* Transmit path is not supported. We checked - * it above but just being paranoid - */ - if (!tx_ring) + /* Block transmit is not supported yet */ + if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); + } else { + struct tpacket_req3 *req3 = &req_u->req3; + + if (req3->tp_retire_blk_tov || + req3->tp_sizeof_priv || + req3->tp_feature_req_word) { + err = -EINVAL; + goto out; + } + } break; default: break; diff --git a/net/packet/diag.c b/net/packet/diag.c index 0ed68f0238bf..7ef1c881ae74 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -73,8 +73,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type, { struct packet_diag_ring pdr; - if (!ring->pg_vec || ((ver > TPACKET_V2) && - (nl_type == PACKET_DIAG_TX_RING))) + if (!ring->pg_vec) return 0; pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT; diff --git a/net/psample/Kconfig b/net/psample/Kconfig new file mode 100644 index 000000000000..d850246a6059 --- /dev/null +++ b/net/psample/Kconfig @@ -0,0 +1,15 @@ +# +# psample packet sampling configuration +# + +menuconfig PSAMPLE + depends on NET + tristate "Packet-sampling netlink channel" + default n + help + Say Y here to add support for packet-sampling netlink channel + This netlink channel allows transferring packets alongside some + metadata to userspace. 
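In-kernel producers feed this channel through the small API added below in net/psample/psample.c (psample_group_get(), psample_sample_packet(), psample_group_put()). A minimal, hypothetical consumer, not part of this series and using an arbitrary group number and truncation length for illustration, might look like:

#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/psample.h>

static struct psample_group *grp;

/* Bind to (or create) sampling group 1 in this network namespace. */
static int sampler_init(struct net *net)
{
	grp = psample_group_get(net, 1);
	return grp ? 0 : -ENOMEM;
}

/* Report a sampled packet: copy at most 128 bytes of payload; the original
 * length still reaches listeners via PSAMPLE_ATTR_ORIGSIZE.
 */
static void sampler_report(struct sk_buff *skb, int in_ifindex, u32 rate)
{
	psample_sample_packet(grp, skb, 128, in_ifindex, 0, rate);
}

static void sampler_exit(void)
{
	psample_group_put(grp);	/* group is freed once its refcount drops to zero */
}

Userspace then listens on the PSAMPLE_NL_MCGRP_SAMPLE multicast group of the PSAMPLE_GENL_NAME generic netlink family to receive the sampled data and metadata.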
+ + To compile this support as a module, choose M here: the module will + be called psample. diff --git a/net/psample/Makefile b/net/psample/Makefile new file mode 100644 index 000000000000..609b0a79c9f3 --- /dev/null +++ b/net/psample/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the psample netlink channel +# + +obj-$(CONFIG_PSAMPLE) += psample.o diff --git a/net/psample/psample.c b/net/psample/psample.c new file mode 100644 index 000000000000..8aa58a918783 --- /dev/null +++ b/net/psample/psample.c @@ -0,0 +1,301 @@ +/* + * net/psample/psample.c - Netlink channel for packet sampling + * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/module.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/netlink.h> +#include <net/genetlink.h> +#include <net/psample.h> +#include <linux/spinlock.h> + +#define PSAMPLE_MAX_PACKET_SIZE 0xffff + +static LIST_HEAD(psample_groups_list); +static DEFINE_SPINLOCK(psample_groups_lock); + +/* multicast groups */ +enum psample_nl_multicast_groups { + PSAMPLE_NL_MCGRP_CONFIG, + PSAMPLE_NL_MCGRP_SAMPLE, +}; + +static const struct genl_multicast_group psample_nl_mcgrps[] = { + [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, + [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME }, +}; + +static struct genl_family psample_nl_family __ro_after_init; + +static int psample_group_nl_fill(struct sk_buff *msg, + struct psample_group *group, + enum psample_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int ret; + + hdr = genlmsg_put(msg, portid, seq, &psample_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); + if (ret < 0) + goto error; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_REFCOUNT, group->refcount); + if (ret < 0) + goto error; + + ret = nla_put_u32(msg, PSAMPLE_ATTR_GROUP_SEQ, group->seq); + if (ret < 0) + goto error; + + genlmsg_end(msg, hdr); + return 0; + +error: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct psample_group *group; + int start = cb->args[0]; + int idx = 0; + int err; + + spin_lock(&psample_groups_lock); + list_for_each_entry(group, &psample_groups_list, list) { + if (!net_eq(group->net, sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + err = psample_group_nl_fill(msg, group, PSAMPLE_CMD_NEW_GROUP, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); + if (err) + break; + idx++; + } + + spin_unlock(&psample_groups_lock); + cb->args[0] = idx; + return msg->len; +} + +static const struct genl_ops psample_nl_ops[] = { + { + .cmd = PSAMPLE_CMD_GET_GROUP, + .dumpit = psample_nl_cmd_get_group_dumpit, + /* can be retrieved by unprivileged users */ + } +}; + +static struct genl_family psample_nl_family __ro_after_init = { + .name = PSAMPLE_GENL_NAME, + .version = PSAMPLE_GENL_VERSION, + .maxattr = PSAMPLE_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .mcgrps = psample_nl_mcgrps, + .ops = psample_nl_ops, + .n_ops = ARRAY_SIZE(psample_nl_ops), + .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps), +}; + +static void psample_group_notify(struct psample_group 
*group, + enum psample_command cmd) +{ + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return; + + err = psample_group_nl_fill(msg, group, cmd, 0, 0, NLM_F_MULTI); + if (!err) + genlmsg_multicast_netns(&psample_nl_family, group->net, msg, 0, + PSAMPLE_NL_MCGRP_CONFIG, GFP_ATOMIC); + else + nlmsg_free(msg); +} + +static struct psample_group *psample_group_create(struct net *net, + u32 group_num) +{ + struct psample_group *group; + + group = kzalloc(sizeof(*group), GFP_ATOMIC); + if (!group) + return NULL; + + group->net = net; + group->group_num = group_num; + list_add_tail(&group->list, &psample_groups_list); + + psample_group_notify(group, PSAMPLE_CMD_NEW_GROUP); + return group; +} + +static void psample_group_destroy(struct psample_group *group) +{ + psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP); + list_del(&group->list); + kfree(group); +} + +static struct psample_group * +psample_group_lookup(struct net *net, u32 group_num) +{ + struct psample_group *group; + + list_for_each_entry(group, &psample_groups_list, list) + if ((group->group_num == group_num) && (group->net == net)) + return group; + return NULL; +} + +struct psample_group *psample_group_get(struct net *net, u32 group_num) +{ + struct psample_group *group; + + spin_lock(&psample_groups_lock); + + group = psample_group_lookup(net, group_num); + if (!group) { + group = psample_group_create(net, group_num); + if (!group) + goto out; + } + group->refcount++; + +out: + spin_unlock(&psample_groups_lock); + return group; +} +EXPORT_SYMBOL_GPL(psample_group_get); + +void psample_group_put(struct psample_group *group) +{ + spin_lock(&psample_groups_lock); + + if (--group->refcount == 0) + psample_group_destroy(group); + + spin_unlock(&psample_groups_lock); +} +EXPORT_SYMBOL_GPL(psample_group_put); + +void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, + u32 trunc_size, int in_ifindex, int out_ifindex, + u32 sample_rate) +{ + struct sk_buff *nl_skb; + int data_len; + int meta_len; + void *data; + int ret; + + meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + + (out_ifindex ? 
nla_total_size(sizeof(u16)) : 0) + + nla_total_size(sizeof(u32)) + /* sample_rate */ + nla_total_size(sizeof(u32)) + /* orig_size */ + nla_total_size(sizeof(u32)) + /* group_num */ + nla_total_size(sizeof(u32)); /* seq */ + + data_len = min(skb->len, trunc_size); + if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) + data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN + - NLA_ALIGNTO; + + nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC); + if (unlikely(!nl_skb)) + return; + + data = genlmsg_put(nl_skb, 0, 0, &psample_nl_family, 0, + PSAMPLE_CMD_SAMPLE); + if (unlikely(!data)) + goto error; + + if (in_ifindex) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_IIFINDEX, in_ifindex); + if (unlikely(ret < 0)) + goto error; + } + + if (out_ifindex) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OIFINDEX, out_ifindex); + if (unlikely(ret < 0)) + goto error; + } + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_RATE, sample_rate); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_ORIGSIZE, skb->len); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_SAMPLE_GROUP, group->group_num); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u32(nl_skb, PSAMPLE_ATTR_GROUP_SEQ, group->seq++); + if (unlikely(ret < 0)) + goto error; + + if (data_len) { + int nla_len = nla_total_size(data_len); + struct nlattr *nla; + + nla = (struct nlattr *)skb_put(nl_skb, nla_len); + nla->nla_type = PSAMPLE_ATTR_DATA; + nla->nla_len = nla_attr_size(data_len); + + if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) + goto error; + } + + genlmsg_end(nl_skb, data); + genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, + PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); + + return; +error: + pr_err_ratelimited("Could not create psample log message\n"); + nlmsg_free(nl_skb); +} +EXPORT_SYMBOL_GPL(psample_sample_packet); + +static int __init psample_module_init(void) +{ + return genl_register_family(&psample_nl_family); +} + +static void __exit psample_module_exit(void) +{ + genl_unregister_family(&psample_nl_family); +} + +module_init(psample_module_init); +module_exit(psample_module_exit); + +MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>"); +MODULE_DESCRIPTION("netlink channel for packet sampling"); +MODULE_LICENSE("GPL v2"); diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index c985ecbe9bd6..ae5ac175b2be 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -252,7 +252,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, const int pkt_len = 20; struct qrtr_hdr *hdr; struct sk_buff *skb; - u32 *buf; + __le32 *buf; skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); if (!skb) @@ -269,7 +269,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, hdr->dst_node_id = cpu_to_le32(dst_node); hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); - buf = (u32 *)skb_put(skb, pkt_len); + buf = (__le32 *)skb_put(skb, pkt_len); memset(buf, 0, pkt_len); buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); buf[1] = cpu_to_le32(src_node); diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 2ac1e6194be3..b405f77d664c 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -298,6 +298,33 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, return 0; } +static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, + int optlen) +{ + struct rds_rx_trace_so trace; + int i; + + if (optlen != sizeof(struct rds_rx_trace_so)) + return -EFAULT; + + if (copy_from_user(&trace, optval, sizeof(trace))) + return -EFAULT; + + if 
(trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) + return -EFAULT; + + rs->rs_rx_traces = trace.rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { + rs->rs_rx_traces = 0; + return -EFAULT; + } + rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; + } + + return 0; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -338,6 +365,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, ret = rds_enable_recvtstamp(sock->sk, optval, optlen); release_sock(sock->sk); break; + case SO_RDS_MSG_RXPATH_LATENCY: + ret = rds_recv_track_latency(rs, optval, optlen); + break; default: ret = -ENOPROTOOPT; } @@ -484,6 +514,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_cong_list); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; + rs->rs_rx_traces = 0; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); diff --git a/net/rds/bind.c b/net/rds/bind.c index 095f6ce583fe..3a915bedb76c 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); - printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, " - "load rds_tcp or rds_rdma?\n"); + pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", + __func__, &sin->sin_addr.s_addr); goto out; } diff --git a/net/rds/connection.c b/net/rds/connection.c index fe9d31c0b22d..0e04dcceb1d4 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -545,11 +545,11 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); -void rds_walk_conn_path_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens, - int (*visitor)(struct rds_conn_path *, void *), - size_t item_len) +static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_conn_path *, void *), + size_t item_len) { u64 buffer[(item_len + 7) / 8]; struct hlist_head *head; diff --git a/net/rds/ib.c b/net/rds/ib.c index 5680d90b0b77..0f557b243311 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -45,8 +45,8 @@ #include "ib.h" #include "ib_mr.h" -unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; -unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; +static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; +static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; module_param(rds_ib_mr_1m_pool_size, int, 0444); @@ -111,6 +111,8 @@ static void rds_ib_dev_free(struct work_struct *work) kfree(i_ipaddr); } + kfree(rds_ibdev->vector_load); + kfree(rds_ibdev); } @@ -159,6 +161,14 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; + rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors, + GFP_KERNEL); + if (!rds_ibdev->vector_load) { + pr_err("RDS/IB: %s failed to allocate vector memory\n", + __func__); + goto put_dev; + } + rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(rds_ibdev->pd)) { diff --git 
a/net/rds/ib.h b/net/rds/ib.h index 45ac8e8e58f4..ec550626e221 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -14,9 +14,10 @@ #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 -#define RDS_IB_DEFAULT_FR_WR 512 +#define RDS_IB_DEFAULT_FR_WR 256 +#define RDS_IB_DEFAULT_FR_INV_WR 256 -#define RDS_IB_DEFAULT_RETRY_COUNT 2 +#define RDS_IB_DEFAULT_RETRY_COUNT 1 #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ @@ -125,6 +126,7 @@ struct rds_ib_connection { /* To control the number of wrs from fastreg */ atomic_t i_fastreg_wrs; + atomic_t i_fastunreg_wrs; /* interrupt handling */ struct tasklet_struct i_send_tasklet; @@ -134,7 +136,7 @@ struct rds_ib_connection { struct rds_ib_work_ring i_send_ring; struct rm_data_op *i_data_op; struct rds_header *i_send_hdrs; - u64 i_send_hdrs_dma; + dma_addr_t i_send_hdrs_dma; struct rds_ib_send_work *i_sends; atomic_t i_signaled_sends; @@ -144,11 +146,12 @@ struct rds_ib_connection { struct rds_ib_incoming *i_ibinc; u32 i_recv_data_rem; struct rds_header *i_recv_hdrs; - u64 i_recv_hdrs_dma; + dma_addr_t i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; u64 i_ack_recv; /* last ACK received */ struct rds_ib_refill_cache i_cache_incs; struct rds_ib_refill_cache i_cache_frags; + atomic_t i_cache_allocs; /* sending acks */ unsigned long i_ack_flags; @@ -161,7 +164,7 @@ struct rds_ib_connection { struct rds_header *i_ack; struct ib_send_wr i_ack_wr; struct ib_sge i_ack_sge; - u64 i_ack_dma; + dma_addr_t i_ack_dma; unsigned long i_ack_queued; /* Flow control related information @@ -179,6 +182,14 @@ struct rds_ib_connection { /* Batched completions */ unsigned int i_unsignaled_wrs; + + /* Endpoint role in connection */ + bool i_active_side; + atomic_t i_cq_quiesce; + + /* Send/Recv vectors */ + int i_scq_vector; + int i_rcq_vector; }; /* This assumes that atomic_t is at least 32 bits */ @@ -221,9 +232,10 @@ struct rds_ib_device { spinlock_t spinlock; /* protect the above */ atomic_t refcount; struct work_struct free_work; + int *vector_load; }; -#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device) +#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent) #define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) /* bits for i_ack_flags */ @@ -249,6 +261,8 @@ struct rds_ib_statistics { uint64_t s_ib_rx_refill_from_cq; uint64_t s_ib_rx_refill_from_thread; uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_total_frags; + uint64_t s_ib_rx_total_incs; uint64_t s_ib_rx_credit_updates; uint64_t s_ib_ack_sent; uint64_t s_ib_ack_send_failure; @@ -271,6 +285,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_1m_reused; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; + uint64_t s_ib_recv_added_to_cache; + uint64_t s_ib_recv_removed_from_cache; }; extern struct workqueue_struct *rds_ib_wq; @@ -401,6 +417,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) +#define rds_ib_stats_add(member, count) \ + rds_stats_add_which(rds_ib_stats, member, count) unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 5b2ab95afa07..ce3775abc6e7 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -113,24 +113,26 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } if (conn->c_version < RDS_PROTOCOL(3, 1)) { - printk(KERN_NOTICE 
"RDS/IB: Connection to %pI4 version %u.%u failed," - " no longer supported\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version)); + pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); rds_conn_destroy(conn); return; } else { - printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", + ic->i_active_side ? "Active" : "Passive", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); } - /* - * Init rings and fill recv. this needs to wait until protocol negotiation - * is complete, since ring layout is different from 3.0 to 3.1. + atomic_set(&ic->i_cq_quiesce, 0); + + /* Init rings and fill recv. this needs to wait until protocol + * negotiation is complete, since ring layout is different + * from 3.1 to 4.1. */ rds_ib_send_init_ring(ic); rds_ib_recv_init_ring(ic); @@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); + /* if cq has been already reaped, ignore incoming cq event */ + if (atomic_read(&ic->i_cq_quiesce)) + return; + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); poll_scq(ic, ic->i_send_cq, ic->i_send_wc); @@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); + /* if cq has been already reaped, ignore incoming cq event */ + if (atomic_read(&ic->i_cq_quiesce)) + return; + memset(&state, 0, sizeof(state)); poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); @@ -358,6 +368,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context) tasklet_schedule(&ic->i_send_tasklet); } +static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev) +{ + int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1]; + int index = rds_ibdev->dev->num_comp_vectors - 1; + int i; + + for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) { + if (rds_ibdev->vector_load[i] < min) { + index = i; + min = rds_ibdev->vector_load[i]; + } + } + + rds_ibdev->vector_load[index]++; + return index; +} + +static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) +{ + rds_ibdev->vector_load[index]--; +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -383,7 +415,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) * completion queue and send queue. This extra space is used for FRMR * registration and invalidation work requests */ - fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); + fr_queue_space = rds_ibdev->use_fastreg ? 
+ (RDS_IB_DEFAULT_FR_WR + 1) + + (RDS_IB_DEFAULT_FR_INV_WR + 1) + : 0; /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); @@ -396,25 +431,30 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; + ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev); cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1; - + cq_attr.comp_vector = ic->i_scq_vector; ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_send_cq)) { ret = PTR_ERR(ic->i_send_cq); ic->i_send_cq = NULL; + ibdev_put_vector(rds_ibdev, ic->i_scq_vector); rdsdebug("ib_create_cq send failed: %d\n", ret); goto out; } + ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev); cq_attr.cqe = ic->i_recv_ring.w_nr; + cq_attr.comp_vector = ic->i_rcq_vector; ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_recv_cq)) { ret = PTR_ERR(ic->i_recv_cq); ic->i_recv_cq = NULL; + ibdev_put_vector(rds_ibdev, ic->i_rcq_vector); rdsdebug("ib_create_cq recv failed: %d\n", ret); goto out; } @@ -445,6 +485,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.send_cq = ic->i_send_cq; attr.recv_cq = ic->i_recv_cq; atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); + atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR); /* * XXX this can fail if max_*_wr is too large? Are we supposed @@ -682,6 +723,7 @@ out: if (ic->i_cm_id == cm_id) ret = 0; } + ic->i_active_side = true; return ret; } @@ -767,17 +809,27 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0) && - (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); + (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) && + (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR)); tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); + atomic_set(&ic->i_cq_quiesce, 1); + /* first destroy the ib state that generates callbacks */ if (ic->i_cm_id->qp) rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) + if (ic->i_send_cq) { + if (ic->rds_ibdev) + ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector); ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) + } + + if (ic->i_recv_cq) { + if (ic->rds_ibdev) + ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector); ib_destroy_cq(ic->i_recv_cq); + } /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) @@ -855,6 +907,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_sends = NULL; vfree(ic->i_recvs); ic->i_recvs = NULL; + ic->i_active_side = false; } int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index d921adc62765..48332a6ed738 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) struct rds_ib_frmr *frmr = &ibmr->u.frmr; struct ib_send_wr *failed_wr; struct ib_reg_wr reg_wr; - int ret; + int ret, off = 0; while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { atomic_inc(&ibmr->ic->i_fastreg_wrs); cpu_relax(); } - ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE); + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, + &off, PAGE_SIZE); if (unlikely(ret != ibmr->sg_len)) return ret < 0 ? 
ret : -EINVAL; @@ -240,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) if (frmr->fr_state != FRMR_IS_INUSE) goto out; - while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { - atomic_inc(&ibmr->ic->i_fastreg_wrs); + while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastunreg_wrs); cpu_relax(); } @@ -260,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) if (unlikely(ret)) { frmr->fr_state = FRMR_IS_STALE; frmr->fr_inv = false; - atomic_inc(&ibmr->ic->i_fastreg_wrs); + atomic_inc(&ibmr->ic->i_fastunreg_wrs); pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); goto out; } @@ -288,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (frmr->fr_inv) { frmr->fr_state = FRMR_IS_FREE; frmr->fr_inv = false; + atomic_inc(&ic->i_fastreg_wrs); + } else { + atomic_inc(&ic->i_fastunreg_wrs); } - - atomic_inc(&ic->i_fastreg_wrs); } void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 1c754f4acbe5..5d6e98a79a5e 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -45,7 +45,6 @@ struct rds_ib_fmr { struct ib_fmr *fmr; - u64 *dma; }; enum rds_ib_fr_state { @@ -108,8 +107,6 @@ struct rds_ib_mr_pool { }; extern struct workqueue_struct *rds_ib_mr_wq; -extern unsigned int rds_ib_mr_1m_pool_size; -extern unsigned int rds_ib_mr_8k_pool_size; extern bool prefer_frmr; struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 606a11f681d2..e10624aa6959 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic, rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); + atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs); + rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE); } /* Recycle inc after freeing attached frags */ @@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i atomic_dec(&rds_ib_allocation); return NULL; } + rds_ib_stats_inc(s_ib_rx_total_incs); } INIT_LIST_HEAD(&ibinc->ii_frags); rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); @@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); if (cache_item) { frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs); + rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE); } else { frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); if (!frag) @@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic kmem_cache_free(rds_ib_frag_slab, frag); return NULL; } + rds_ib_stats_inc(s_ib_rx_total_frags); } INIT_LIST_HEAD(&frag->f_item); @@ -905,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, ic->i_ibinc = ibinc; hdr = &ibinc->ii_inc.i_hdr; + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); memcpy(hdr, ihdr, sizeof(*hdr)); ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, ic->i_recv_data_rem, hdr->h_flags); @@ -980,8 +990,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained 
during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, + rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); } diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 84d90c97332f..6ab39dbcca01 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm, complete(rm, notify_status); } -static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, - struct rm_data_op *op, - int wc_status) -{ - if (op->op_nents) - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - DMA_TO_DEVICE); -} - static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, struct rm_rdma_op *op, int wc_status) @@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_atomic_fadd); } +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + struct rds_message *rm = container_of(op, struct rds_message, data); + + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); + + if (rm->rdma.op_active && rm->data.op_notify) + rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status); +} + /* * Unmap the resources associated with a struct send_work. * @@ -300,8 +305,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, wc->status, + rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); } } @@ -765,7 +770,6 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); if (work_alloc != 1) { - rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_stats_inc(s_ib_tx_ring_full); ret = -ENOMEM; goto out; diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 7e78dca1f252..9252ad126335 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -55,6 +55,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rx_refill_from_cq", "ib_rx_refill_from_thread", "ib_rx_alloc_limit", + "ib_rx_total_frags", + "ib_rx_total_incs", "ib_rx_credit_updates", "ib_ack_sent", "ib_ack_send_failure", diff --git a/net/rds/rdma.c b/net/rds/rdma.c index ea961144084f..f06fac4886b0 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -40,7 +40,6 @@ /* * XXX * - build with sparse - * - should we limit the size of a mr region? let transport return failure? * - should we detect duplicate keys on a socket? hmm. * - an rdma is an mlock, apply rlimit? 
*/ @@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } + /* Restrict the size of mr irrespective of underlying transport + * To account for unaligned mr regions, subtract one from nr_pages + */ + if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { + ret = -EMSGSIZE; + goto out; + } + rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); @@ -415,7 +422,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { - printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", + r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } @@ -626,6 +634,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; + + /* Enable rmda notification on data operation for composite + * rds messages and make sure notification is enabled only + * for the data operation which follows it so that application + * gets notified only after full message gets delivered. + */ + if (rm->data.op_sg) { + rm->rdma.op_notify = 0; + rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + } } /* The cookie contains the R_Key of the remote memory region, and diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index d5f311767157..fc59821f0a27 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -206,18 +206,13 @@ static int rds_rdma_init(void) { int ret; - ret = rds_rdma_listen_init(); + ret = rds_ib_init(); if (ret) goto out; - ret = rds_ib_init(); + ret = rds_rdma_listen_init(); if (ret) - goto err_ib_init; - - goto out; - -err_ib_init: - rds_rdma_listen_stop(); + rds_ib_exit(); out: return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index ebbf909b87ec..07fff73dd4f3 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...) 
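For the __rds_rdma_map() size check added above, the new RDS_MAX_MSG_SIZE limit defined just below works out as follows (assuming 4 KiB pages, i.e. PAGE_SHIFT == 12):

/* RDS_MAX_MSG_SIZE >> PAGE_SHIFT == (1 << 20) >> 12 == 256 pages,
 * so get_mr() now accepts at most 257 pages per region
 * ((nr_pages - 1) <= 256): a full 1 MiB plus one extra page to cover
 * a region that does not start on a page boundary.
 */

The same constant also caps rds_sendmsg() further down, where payload_len and the accumulated RDMA byte count are each checked against RDS_MAX_MSG_SIZE.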
#define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) +/* Used to limit both RDMA and non-RDMA RDS message to 1MB */ +#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20)) + #define RDS_CONG_MAP_BYTES (65536 / 8) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) @@ -250,6 +253,11 @@ struct rds_ext_header_rdma_dest { #define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ +#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) +#define RDS_MSG_RX_HDR 0 +#define RDS_MSG_RX_START 1 +#define RDS_MSG_RX_END 2 +#define RDS_MSG_RX_CMSG 3 struct rds_incoming { atomic_t i_refcount; @@ -262,6 +270,7 @@ struct rds_incoming { rds_rdma_cookie_t i_rdma_cookie; struct timeval i_rx_tstamp; + u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; struct rds_mr { @@ -419,6 +428,7 @@ struct rds_message { } rdma; struct rm_data_op { unsigned int op_active:1; + unsigned int op_notify:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; @@ -571,6 +581,10 @@ struct rds_sock { unsigned char rs_recverr, rs_cong_monitor; u32 rs_hash_initval; + + /* Socket receive path trace points*/ + u8 rs_rx_traces; + u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) @@ -630,6 +644,9 @@ struct rds_statistics { uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; + uint64_t s_recv_bytes_added_to_socket; + uint64_t s_recv_bytes_removed_from_socket; + }; /* af_rds.c */ diff --git a/net/rds/recv.c b/net/rds/recv.c index 9d0666e5fe35..8b7e7b7f2c2d 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -43,6 +43,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr) { + int i; + atomic_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; @@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_rdma_cookie = 0; inc->i_rx_tstamp.tv_sec = 0; inc->i_rx_tstamp.tv_usec = 0; + + for (i = 0; i < RDS_RX_MAX_TRACES; i++) + inc->i_rx_lat_trace[i] = 0; } EXPORT_SYMBOL_GPL(rds_inc_init); @@ -94,6 +99,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, return; rs->rs_rcv_bytes += delta; + if (delta > 0) + rds_stats_add(s_recv_bytes_added_to_socket, delta); + else + rds_stats_add(s_recv_bytes_removed_from_socket, -delta); now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " @@ -369,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (sock_flag(sk, SOCK_RCVTSTAMP)) do_gettimeofday(&inc->i_rx_tstamp); rds_inc_addref(inc); + inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); } else { @@ -530,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); if (ret) - return ret; + goto out; } if ((inc->i_rx_tstamp.tv_sec != 0) && @@ -539,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, sizeof(struct timeval), &inc->i_rx_tstamp); if (ret) - return ret; + goto out; } - return 0; + if (rs->rs_rx_traces) { + struct rds_cmsg_rx_trace t; + int i, j; + + inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); + t.rx_traces = rs->rs_rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + j = 
rs->rs_rx_trace[i]; + t.rx_trace_pos[i] = j; + t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] - + inc->i_rx_lat_trace[j]; + } + + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY, + sizeof(t), &t); + if (ret) + goto out; + } + +out: + return ret; } int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/rds/send.c b/net/rds/send.c index 77c8c6e613ad..5cc64039caf7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -476,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; + unsigned int notify = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); + notify = rm->rdma.op_notify | rm->data.op_notify; ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_active && notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); @@ -945,6 +947,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_rdma_map(rs, rm, cmsg); if (!ret) *allocated_mr = 1; + else if (ret == -ENODEV) + /* Accommodate the get_mr() case which can fail + * if connection isn't established yet. + */ + ret = -EAGAIN; break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: @@ -987,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) return hash; } +static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) +{ + struct rds_rdma_args *args; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { + args = CMSG_DATA(cmsg); + *rdma_bytes += args->remote_vec.bytes; + } + } + return 0; +} + int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; @@ -1001,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; + size_t total_payload_len = payload_len, rdma_payload_len = 0; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1033,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) } release_sock(sk); + ret = rds_rdma_bytes(msg, &rdma_payload_len); + if (ret) + goto out; + + total_payload_len += rdma_payload_len; + if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { + ret = -EMSGSIZE; + goto out; + } + if (payload_len > rds_sk_sndbuf(rs)) { ret = -EMSGSIZE; goto out; @@ -1082,8 +1120,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Parse any control messages the user may have included. 
*/ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); - if (ret) + if (ret) { + /* Trigger connection so that its ready for the next retry */ + if (ret == -EAGAIN) + rds_conn_connect_if_down(conn); goto out; + } if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", @@ -1169,7 +1211,7 @@ out: * or * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED */ -int +static int rds_send_probe(struct rds_conn_path *cp, __be16 sport, __be16 dport, u8 h_flags) { @@ -1238,7 +1280,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport) return rds_send_probe(cp, 0, dport, 0); } -void +static void rds_send_ping(struct rds_connection *conn) { unsigned long flags; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 57bb52361e0f..5438f6725092 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -641,12 +641,12 @@ static int rds_tcp_init(void) ret = register_netdevice_notifier(&rds_tcp_dev_notifier); if (ret) { pr_warn("could not register rds_tcp_dev_notifier\n"); - goto out; + goto out_slab; } ret = register_pernet_subsys(&rds_tcp_net_ops); if (ret) - goto out_slab; + goto out_notifier; ret = rds_tcp_recv_init(); if (ret) @@ -664,9 +664,10 @@ out_recv: rds_tcp_recv_exit(); out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); -out_slab: +out_notifier: if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) pr_warn("could not unregister rds_tcp_dev_notifier\n"); +out_slab: kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index f74bab3ecdca..67d0929c7d3d 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -79,6 +79,7 @@ bail: * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side * by moving them to CONNECTING in this function. */ +static struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index ad4892e97f91..e006ef8e6d40 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, rdsdebug("alloced tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, cp->cp_conn->c_faddr); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); + /* * XXX * we might be able to use the __ variants when * we've already serialized at a higher level. @@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, /* could be 0 for a 0 len message */ tc->t_tinc_data_rem = be32_to_cpu(tinc->ti_inc.i_hdr.h_len); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); } } diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig index 868f1ad0415a..060600b03fad 100644 --- a/net/rfkill/Kconfig +++ b/net/rfkill/Kconfig @@ -23,17 +23,6 @@ config RFKILL_INPUT depends on INPUT = y || RFKILL = INPUT default y if !EXPERT -config RFKILL_REGULATOR - tristate "Generic rfkill regulator driver" - depends on RFKILL || !RFKILL - depends on REGULATOR - help - This options enable controlling radio transmitters connected to - voltage regulator using the regulator framework. - - To compile this driver as a module, choose M here: the module will - be called rfkill-regulator. 
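Putting the new receive-path trace points together: a socket opts in with SO_RDS_MSG_RXPATH_LATENCY (handled by rds_recv_track_latency() above) and then receives an RDS_CMSG_RXPATH_LATENCY control message with per-stage deltas on every recvmsg(), filled in by rds_cmsg_recv(). A hypothetical userspace sketch, assuming the matching uapi definitions (struct rds_rx_trace_so, struct rds_cmsg_rx_trace and the socket option/cmsg constants) are exported through <linux/rds.h>:

#include <linux/rds.h>
#include <sys/socket.h>
#include <string.h>

/* Request one trace delta per received message; position 0 is the first of
 * the kernel's RX trace points and must not exceed
 * RDS_MSG_RX_DGRAM_TRACE_MAX.
 */
static int enable_rx_latency_trace(int fd)
{
	struct rds_rx_trace_so trace;

	memset(&trace, 0, sizeof(trace));
	trace.rx_traces = 1;
	trace.rx_trace_pos[0] = 0;

	return setsockopt(fd, SOL_RDS, SO_RDS_MSG_RXPATH_LATENCY,
			  &trace, sizeof(trace));
}

The rx_trace[] values delivered in the cmsg are differences between consecutive i_rx_lat_trace[] timestamps, which the transports stamp with local_clock() at header receipt, reassembly start, delivery to the socket and cmsg construction.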
- config RFKILL_GPIO tristate "GPIO RFKILL driver" depends on RFKILL diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile index 311768783f4a..87a80aded0b3 100644 --- a/net/rfkill/Makefile +++ b/net/rfkill/Makefile @@ -5,5 +5,4 @@ rfkill-y += core.o rfkill-$(CONFIG_RFKILL_INPUT) += input.o obj-$(CONFIG_RFKILL) += rfkill.o -obj-$(CONFIG_RFKILL_REGULATOR) += rfkill-regulator.o obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 884027f62783..2064c3a35ef8 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -176,6 +176,50 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill) { led_trigger_unregister(&rfkill->led_trigger); } + +static struct led_trigger rfkill_any_led_trigger; +static struct work_struct rfkill_any_work; + +static void rfkill_any_led_trigger_worker(struct work_struct *work) +{ + enum led_brightness brightness = LED_OFF; + struct rfkill *rfkill; + + mutex_lock(&rfkill_global_mutex); + list_for_each_entry(rfkill, &rfkill_list, node) { + if (!(rfkill->state & RFKILL_BLOCK_ANY)) { + brightness = LED_FULL; + break; + } + } + mutex_unlock(&rfkill_global_mutex); + + led_trigger_event(&rfkill_any_led_trigger, brightness); +} + +static void rfkill_any_led_trigger_event(void) +{ + schedule_work(&rfkill_any_work); +} + +static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev) +{ + rfkill_any_led_trigger_event(); +} + +static int rfkill_any_led_trigger_register(void) +{ + INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker); + rfkill_any_led_trigger.name = "rfkill-any"; + rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate; + return led_trigger_register(&rfkill_any_led_trigger); +} + +static void rfkill_any_led_trigger_unregister(void) +{ + led_trigger_unregister(&rfkill_any_led_trigger); + cancel_work_sync(&rfkill_any_work); +} #else static void rfkill_led_trigger_event(struct rfkill *rfkill) { @@ -189,6 +233,19 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill) static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) { } + +static void rfkill_any_led_trigger_event(void) +{ +} + +static int rfkill_any_led_trigger_register(void) +{ + return 0; +} + +static void rfkill_any_led_trigger_unregister(void) +{ +} #endif /* CONFIG_RFKILL_LEDS */ static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill, @@ -297,6 +354,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); + rfkill_any_led_trigger_event(); if (prev != curr) rfkill_event(rfkill); @@ -477,11 +535,9 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); + rfkill_any_led_trigger_event(); - if (!rfkill->registered) - return ret; - - if (prev != blocked) + if (rfkill->registered && prev != blocked) schedule_work(&rfkill->uevent_work); return ret; @@ -523,6 +579,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); + rfkill_any_led_trigger_event(); return blocked; } @@ -572,6 +629,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); + rfkill_any_led_trigger_event(); } } EXPORT_SYMBOL(rfkill_set_states); @@ -988,6 +1046,7 @@ int __must_check rfkill_register(struct rfkill *rfkill) #endif } + rfkill_any_led_trigger_event(); 
rfkill_send_events(rfkill, RFKILL_OP_ADD); mutex_unlock(&rfkill_global_mutex); @@ -1020,6 +1079,7 @@ void rfkill_unregister(struct rfkill *rfkill) mutex_lock(&rfkill_global_mutex); rfkill_send_events(rfkill, RFKILL_OP_DEL); list_del_init(&rfkill->node); + rfkill_any_led_trigger_event(); mutex_unlock(&rfkill_global_mutex); rfkill_led_trigger_unregister(rfkill); @@ -1266,24 +1326,33 @@ static int __init rfkill_init(void) error = class_register(&rfkill_class); if (error) - goto out; + goto error_class; error = misc_register(&rfkill_miscdev); - if (error) { - class_unregister(&rfkill_class); - goto out; - } + if (error) + goto error_misc; + + error = rfkill_any_led_trigger_register(); + if (error) + goto error_led_trigger; #ifdef CONFIG_RFKILL_INPUT error = rfkill_handler_init(); - if (error) { - misc_deregister(&rfkill_miscdev); - class_unregister(&rfkill_class); - goto out; - } + if (error) + goto error_input; #endif - out: + return 0; + +#ifdef CONFIG_RFKILL_INPUT +error_input: + rfkill_any_led_trigger_unregister(); +#endif +error_led_trigger: + misc_deregister(&rfkill_miscdev); +error_misc: + class_unregister(&rfkill_class); +error_class: return error; } subsys_initcall(rfkill_init); @@ -1293,6 +1362,7 @@ static void __exit rfkill_exit(void) #ifdef CONFIG_RFKILL_INPUT rfkill_handler_exit(); #endif + rfkill_any_led_trigger_unregister(); misc_deregister(&rfkill_miscdev); class_unregister(&rfkill_class); } diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c deleted file mode 100644 index 50cd26a48e87..000000000000 --- a/net/rfkill/rfkill-regulator.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * rfkill-regulator.c - Regulator consumer driver for rfkill - * - * Copyright (C) 2009 Guiming Zhuo <gmzhuo@gmail.com> - * Copyright (C) 2011 Antonio Ospite <ospite@studenti.unina.it> - * - * Implementation inspired by leds-regulator driver. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include <linux/module.h> -#include <linux/err.h> -#include <linux/slab.h> -#include <linux/platform_device.h> -#include <linux/regulator/consumer.h> -#include <linux/rfkill.h> -#include <linux/rfkill-regulator.h> - -struct rfkill_regulator_data { - struct rfkill *rf_kill; - bool reg_enabled; - - struct regulator *vcc; -}; - -static int rfkill_regulator_set_block(void *data, bool blocked) -{ - struct rfkill_regulator_data *rfkill_data = data; - int ret = 0; - - pr_debug("%s: blocked: %d\n", __func__, blocked); - - if (blocked) { - if (rfkill_data->reg_enabled) { - regulator_disable(rfkill_data->vcc); - rfkill_data->reg_enabled = false; - } - } else { - if (!rfkill_data->reg_enabled) { - ret = regulator_enable(rfkill_data->vcc); - if (!ret) - rfkill_data->reg_enabled = true; - } - } - - pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__, - regulator_is_enabled(rfkill_data->vcc)); - - return ret; -} - -static struct rfkill_ops rfkill_regulator_ops = { - .set_block = rfkill_regulator_set_block, -}; - -static int rfkill_regulator_probe(struct platform_device *pdev) -{ - struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data; - struct rfkill_regulator_data *rfkill_data; - struct regulator *vcc; - struct rfkill *rf_kill; - int ret = 0; - - if (pdata == NULL) { - dev_err(&pdev->dev, "no platform data\n"); - return -ENODEV; - } - - if (pdata->name == NULL || pdata->type == 0) { - dev_err(&pdev->dev, "invalid name or type in platform data\n"); - return -EINVAL; - } - - vcc = regulator_get_exclusive(&pdev->dev, "vrfkill"); - if (IS_ERR(vcc)) { - dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name); - ret = PTR_ERR(vcc); - goto out; - } - - rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL); - if (rfkill_data == NULL) { - ret = -ENOMEM; - goto err_data_alloc; - } - - rf_kill = rfkill_alloc(pdata->name, &pdev->dev, - pdata->type, - &rfkill_regulator_ops, rfkill_data); - if (rf_kill == NULL) { - ret = -ENOMEM; - goto err_rfkill_alloc; - } - - if (regulator_is_enabled(vcc)) { - dev_dbg(&pdev->dev, "Regulator already enabled\n"); - rfkill_data->reg_enabled = true; - } - rfkill_data->vcc = vcc; - rfkill_data->rf_kill = rf_kill; - - ret = rfkill_register(rf_kill); - if (ret) { - dev_err(&pdev->dev, "Cannot register rfkill device\n"); - goto err_rfkill_register; - } - - platform_set_drvdata(pdev, rfkill_data); - dev_info(&pdev->dev, "%s initialized\n", pdata->name); - - return 0; - -err_rfkill_register: - rfkill_destroy(rf_kill); -err_rfkill_alloc: - kfree(rfkill_data); -err_data_alloc: - regulator_put(vcc); -out: - return ret; -} - -static int rfkill_regulator_remove(struct platform_device *pdev) -{ - struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev); - struct rfkill *rf_kill = rfkill_data->rf_kill; - - rfkill_unregister(rf_kill); - rfkill_destroy(rf_kill); - regulator_put(rfkill_data->vcc); - kfree(rfkill_data); - - return 0; -} - -static struct platform_driver rfkill_regulator_driver = { - .probe = rfkill_regulator_probe, - .remove = rfkill_regulator_remove, - .driver = { - .name = "rfkill-regulator", - }, -}; - -module_platform_driver(rfkill_regulator_driver); - -MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>"); -MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>"); -MODULE_DESCRIPTION("Regulator consumer driver for rfkill"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:rfkill-regulator"); diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 8fc6ea347182..b9da4d6b914f 100644 --- a/net/rxrpc/Makefile +++ 
b/net/rxrpc/Makefile @@ -2,7 +2,9 @@ # Makefile for Linux kernel RxRPC # -af-rxrpc-y := \ +obj-$(CONFIG_AF_RXRPC) += rxrpc.o + +rxrpc-y := \ af_rxrpc.o \ call_accept.o \ call_event.o \ @@ -26,8 +28,6 @@ af-rxrpc-y := \ skbuff.o \ utils.o -af-rxrpc-$(CONFIG_PROC_FS) += proc.o -af-rxrpc-$(CONFIG_RXKAD) += rxkad.o -af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o - -obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o +rxrpc-$(CONFIG_PROC_FS) += proc.o +rxrpc-$(CONFIG_RXKAD) += rxkad.o +rxrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 5f63f6dcaabb..7fb59c3f1542 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -224,6 +224,14 @@ static int rxrpc_listen(struct socket *sock, int backlog) else sk->sk_max_ack_backlog = old; break; + case RXRPC_SERVER_LISTENING: + if (backlog == 0) { + rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED; + sk->sk_max_ack_backlog = 0; + rxrpc_discard_prealloc(rx); + ret = 0; + break; + } default: ret = -EBUSY; break; @@ -282,10 +290,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); + /* The socket has been unlocked. */ if (!IS_ERR(call)) call->notify_rx = notify_rx; - release_sock(&rx->sk); + mutex_unlock(&call->user_mutex); _leave(" = %p", call); return call; } @@ -302,7 +311,10 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call); void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); + + mutex_lock(&call->user_mutex); rxrpc_release_call(rxrpc_sk(sock->sk), call); + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put_kernel); } EXPORT_SYMBOL(rxrpc_kernel_end_call); @@ -442,14 +454,16 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: ret = rxrpc_do_sendmsg(rx, m, len); - break; + /* The socket has been unlocked */ + goto out; default: ret = -EINVAL; - break; + goto error_unlock; } error_unlock: release_sock(&rx->sk); +out: _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f60e35576526..26a7b1db1361 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -60,6 +60,7 @@ enum { RXRPC_CLIENT_BOUND, /* client local address bound */ RXRPC_SERVER_BOUND, /* server local address bound */ RXRPC_SERVER_LISTENING, /* server listening for connections */ + RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */ RXRPC_CLOSE, /* socket is being closed */ }; @@ -466,6 +467,7 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ + struct mutex user_mutex; /* User access mutex */ ktime_t ack_at; /* When deferred ACK needs to happen */ ktime_t resend_at; /* When next resend needs to happen */ ktime_t ping_at; /* When next to send a ping */ @@ -593,200 +595,6 @@ struct rxrpc_ack_summary { u8 cumulative_acks; }; -enum rxrpc_skb_trace { - rxrpc_skb_rx_cleaned, - rxrpc_skb_rx_freed, - rxrpc_skb_rx_got, - rxrpc_skb_rx_lost, - rxrpc_skb_rx_received, - rxrpc_skb_rx_rotated, - rxrpc_skb_rx_purged, - rxrpc_skb_rx_seen, - rxrpc_skb_tx_cleaned, - rxrpc_skb_tx_freed, - rxrpc_skb_tx_got, - rxrpc_skb_tx_new, - rxrpc_skb_tx_rotated, - rxrpc_skb_tx_seen, - rxrpc_skb__nr_trace -}; - -extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7]; - 
-enum rxrpc_conn_trace { - rxrpc_conn_new_client, - rxrpc_conn_new_service, - rxrpc_conn_queued, - rxrpc_conn_seen, - rxrpc_conn_got, - rxrpc_conn_put_client, - rxrpc_conn_put_service, - rxrpc_conn__nr_trace -}; - -extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4]; - -enum rxrpc_client_trace { - rxrpc_client_activate_chans, - rxrpc_client_alloc, - rxrpc_client_chan_activate, - rxrpc_client_chan_disconnect, - rxrpc_client_chan_pass, - rxrpc_client_chan_unstarted, - rxrpc_client_cleanup, - rxrpc_client_count, - rxrpc_client_discard, - rxrpc_client_duplicate, - rxrpc_client_exposed, - rxrpc_client_replace, - rxrpc_client_to_active, - rxrpc_client_to_culled, - rxrpc_client_to_idle, - rxrpc_client_to_inactive, - rxrpc_client_to_waiting, - rxrpc_client_uncount, - rxrpc_client__nr_trace -}; - -extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7]; -extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5]; - -enum rxrpc_call_trace { - rxrpc_call_new_client, - rxrpc_call_new_service, - rxrpc_call_queued, - rxrpc_call_queued_ref, - rxrpc_call_seen, - rxrpc_call_connected, - rxrpc_call_release, - rxrpc_call_got, - rxrpc_call_got_userid, - rxrpc_call_got_kernel, - rxrpc_call_put, - rxrpc_call_put_userid, - rxrpc_call_put_kernel, - rxrpc_call_put_noqueue, - rxrpc_call_error, - rxrpc_call__nr_trace -}; - -extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; - -enum rxrpc_transmit_trace { - rxrpc_transmit_wait, - rxrpc_transmit_queue, - rxrpc_transmit_queue_last, - rxrpc_transmit_rotate, - rxrpc_transmit_rotate_last, - rxrpc_transmit_await_reply, - rxrpc_transmit_end, - rxrpc_transmit__nr_trace -}; - -extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; - -enum rxrpc_receive_trace { - rxrpc_receive_incoming, - rxrpc_receive_queue, - rxrpc_receive_queue_last, - rxrpc_receive_front, - rxrpc_receive_rotate, - rxrpc_receive_end, - rxrpc_receive__nr_trace -}; - -extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; - -enum rxrpc_recvmsg_trace { - rxrpc_recvmsg_enter, - rxrpc_recvmsg_wait, - rxrpc_recvmsg_dequeue, - rxrpc_recvmsg_hole, - rxrpc_recvmsg_next, - rxrpc_recvmsg_cont, - rxrpc_recvmsg_full, - rxrpc_recvmsg_data_return, - rxrpc_recvmsg_terminal, - rxrpc_recvmsg_to_be_accepted, - rxrpc_recvmsg_return, - rxrpc_recvmsg__nr_trace -}; - -extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; - -enum rxrpc_rtt_tx_trace { - rxrpc_rtt_tx_ping, - rxrpc_rtt_tx_data, - rxrpc_rtt_tx__nr_trace -}; - -extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; - -enum rxrpc_rtt_rx_trace { - rxrpc_rtt_rx_ping_response, - rxrpc_rtt_rx_requested_ack, - rxrpc_rtt_rx__nr_trace -}; - -extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; - -enum rxrpc_timer_trace { - rxrpc_timer_begin, - rxrpc_timer_init_for_reply, - rxrpc_timer_init_for_send_reply, - rxrpc_timer_expired, - rxrpc_timer_set_for_ack, - rxrpc_timer_set_for_ping, - rxrpc_timer_set_for_resend, - rxrpc_timer_set_for_send, - rxrpc_timer__nr_trace -}; - -extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; - -enum rxrpc_propose_ack_trace { - rxrpc_propose_ack_client_tx_end, - rxrpc_propose_ack_input_data, - rxrpc_propose_ack_ping_for_lost_ack, - rxrpc_propose_ack_ping_for_lost_reply, - rxrpc_propose_ack_ping_for_params, - rxrpc_propose_ack_processing_op, - rxrpc_propose_ack_respond_to_ack, - rxrpc_propose_ack_respond_to_ping, - rxrpc_propose_ack_retry_tx, - rxrpc_propose_ack_rotate_rx, - rxrpc_propose_ack_terminal_ack, - 
rxrpc_propose_ack__nr_trace -}; - -enum rxrpc_propose_ack_outcome { - rxrpc_propose_ack_use, - rxrpc_propose_ack_update, - rxrpc_propose_ack_subsume, - rxrpc_propose_ack__nr_outcomes -}; - -extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; -extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; - -enum rxrpc_congest_change { - rxrpc_cong_begin_retransmission, - rxrpc_cong_cleared_nacks, - rxrpc_cong_new_low_nack, - rxrpc_cong_no_change, - rxrpc_cong_progress, - rxrpc_cong_retransmit_again, - rxrpc_cong_rtt_window_end, - rxrpc_cong_saw_nack, - rxrpc_congest__nr_change -}; - -extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10]; -extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9]; - -extern const char *const rxrpc_pkts[]; -extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; - #include <trace/events/rxrpc.h> /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 832d854c2d5c..0ed181f53f32 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -323,6 +323,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * * If we want to report an error, we mark the skb with the packet type and * abort code and return NULL. + * + * The call is returned with the user access mutex held. */ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_connection *conn, @@ -349,7 +351,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, found_service: spin_lock(&rx->incoming_lock); - if (rx->sk.sk_state == RXRPC_CLOSE) { + if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || + rx->sk.sk_state == RXRPC_CLOSE) { trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; @@ -370,6 +373,18 @@ found_service: trace_rxrpc_receive(call, rxrpc_receive_incoming, sp->hdr.serial, sp->hdr.seq); + /* Lock the call to prevent rxrpc_kernel_send/recv_data() and + * sendmsg()/recvmsg() inconveniently stealing the mutex once the + * notification is generated. + * + * The BUG should never happen because the kernel should be well + * behaved enough not to access the call before the first notification + * event and userspace is prevented from doing so until the state is + * appropriate. + */ + if (!mutex_trylock(&call->user_mutex)) + BUG(); + /* Make the call live. */ rxrpc_incoming_call(rx, call, skb); conn = call->conn; @@ -428,10 +443,12 @@ out: /* * handle acceptance of a call by userspace * - assign the user call ID to the call at the front of the queue + * - called with the socket locked. */ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, unsigned long user_call_ID, rxrpc_notify_rx_t notify_rx) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; struct rb_node *parent, **pp; @@ -445,6 +462,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, if (list_empty(&rx->to_be_accepted)) { write_unlock(&rx->call_lock); + release_sock(&rx->sk); kleave(" = -ENODATA [empty]"); return ERR_PTR(-ENODATA); } @@ -469,10 +487,39 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, */ call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); + write_unlock(&rx->call_lock); + + /* We need to gain the mutex from the interrupt handler without + * upsetting lockdep, so we have to release it there and take it here. 
+ * We are, however, still holding the socket lock, so other accepts + * must wait for us and no one can add the user ID behind our backs. + */ + if (mutex_lock_interruptible(&call->user_mutex) < 0) { + release_sock(&rx->sk); + kleave(" = -ERESTARTSYS"); + return ERR_PTR(-ERESTARTSYS); + } + + write_lock(&rx->call_lock); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); rxrpc_see_call(call); + /* Find the user ID insertion point. */ + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + call = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + BUG(); + } + write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: @@ -498,6 +545,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); rxrpc_notify_socket(call); rxrpc_service_prealloc(rx, GFP_KERNEL); + release_sock(&rx->sk); _leave(" = %p{%d}", call, call->debug_id); return call; @@ -514,6 +562,7 @@ id_in_use: write_unlock(&rx->call_lock); out: rxrpc_service_prealloc(rx, GFP_KERNEL); + release_sock(&rx->sk); _leave(" = %d", ret); return ERR_PTR(ret); } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 1ed18d8c9c9f..d79cd36987a9 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -43,24 +43,6 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_NETWORK_ERROR] = "NetError", }; -const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { - [rxrpc_call_new_client] = "NWc", - [rxrpc_call_new_service] = "NWs", - [rxrpc_call_queued] = "QUE", - [rxrpc_call_queued_ref] = "QUR", - [rxrpc_call_connected] = "CON", - [rxrpc_call_release] = "RLS", - [rxrpc_call_seen] = "SEE", - [rxrpc_call_got] = "GOT", - [rxrpc_call_got_userid] = "Gus", - [rxrpc_call_got_kernel] = "Gke", - [rxrpc_call_put] = "PUT", - [rxrpc_call_put_userid] = "Pus", - [rxrpc_call_put_kernel] = "Pke", - [rxrpc_call_put_noqueue] = "PNQ", - [rxrpc_call_error] = "*E*", -}; - struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); @@ -133,6 +115,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) if (!call->rxtx_annotations) goto nomem_2; + mutex_init(&call->user_mutex); setup_timer(&call->timer, rxrpc_call_timer_expired, (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); @@ -212,14 +195,16 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) } /* - * set up a call for the given data - * - called in process context with IRQs enabled + * Set up a call for the given parameters. + * - Called with the socket lock held, which it must release. + * - If it returns a call, the call's lock will need releasing by the caller. 
*/ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, unsigned long user_call_ID, gfp_t gfp) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call, *xcall; struct rb_node *parent, **pp; @@ -230,6 +215,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, call = rxrpc_alloc_client_call(srx, gfp); if (IS_ERR(call)) { + release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call)); return call; } @@ -237,6 +223,11 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), here, (const void *)user_call_ID); + /* We need to protect a partially set up call against the user as we + * will be acting outside the socket lock. + */ + mutex_lock(&call->user_mutex); + /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); @@ -268,6 +259,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, list_add_tail(&call->link, &rxrpc_calls); write_unlock(&rxrpc_call_lock); + /* From this point on, the call is protected by its own lock. */ + release_sock(&rx->sk); + /* Set up or get a connection record and set the protocol parameters, * including channel number and call ID. */ @@ -297,6 +291,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, */ error_dup_user_ID: write_unlock(&rx->call_lock); + release_sock(&rx->sk); ret = -EEXIST; error: @@ -305,6 +300,7 @@ error: trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), here, ERR_PTR(ret)); rxrpc_release_call(rx, call); + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ERR_PTR(ret); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 6cbcdcc29853..40a1ef2adeb4 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -105,14 +105,6 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *); static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, rxrpc_discard_expired_client_conns); -const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = { - [RXRPC_CONN_CLIENT_INACTIVE] = "Inac", - [RXRPC_CONN_CLIENT_WAITING] = "Wait", - [RXRPC_CONN_CLIENT_ACTIVE] = "Actv", - [RXRPC_CONN_CLIENT_CULLED] = "Cull", - [RXRPC_CONN_CLIENT_IDLE] = "Idle", -}; - /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index e1e83af47866..b0ecb770fdce 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -173,6 +173,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, /* Save the result of the call so that we can repeat it if necessary * through the channel, whilst disposing of the actual call record. 
*/ + trace_rxrpc_disconnect_call(call); chan->last_service_id = call->service_id; if (call->abort_code) { chan->last_abort = call->abort_code; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1d87b5453ef7..9f4cfa25af7c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -481,6 +481,7 @@ next_subpacket: return rxrpc_proto_abort("LSA", call, seq); } + trace_rxrpc_rx_data(call, seq, serial, flags, annotation); if (before_eq(seq, hard_ack)) { ack = RXRPC_ACK_DUPLICATE; ack_serial = serial; @@ -765,16 +766,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks); - - _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - sp->hdr.serial, - ntohs(buf.ack.maxSkew), - first_soft_ack, - ntohl(buf.ack.previousPacket), - acked_serial, - rxrpc_ack_names[summary.ack_reason], - buf.ack.nAcks); + trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial, + first_soft_ack, ntohl(buf.ack.previousPacket), + summary.ack_reason, nr_acks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) rxrpc_input_ping_response(call, skb->tstamp, acked_serial, @@ -931,7 +925,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, break; default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); break; } @@ -961,6 +954,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, break; } + trace_rxrpc_improper_term(call); __rxrpc_disconnect_call(conn, call); rxrpc_notify_socket(call); } @@ -1200,6 +1194,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto reject_packet; } rxrpc_send_ping(call, skb, skew); + mutex_unlock(&call->user_mutex); } rxrpc_input_call_packet(call, skb, skew); diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 18c737a61d80..0a4e28477ad9 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -1065,7 +1065,7 @@ static long rxrpc_read(const struct key *key, switch (token->security_index) { case RXRPC_SECURITY_RXKAD: - toksize += 8 * 4; /* viceid, kvno, key*2, begin, + toksize += 9 * 4; /* viceid, kvno, key*2 + len, begin, * end, primary, tktlen */ toksize += RND(token->kad->ticket_len); break; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 6dee55fad2d3..1a2d4b112064 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -77,12 +77,6 @@ unsigned int rxrpc_rx_jumbo_max = 4; */ unsigned int rxrpc_resend_timeout = 4 * 1000; -const char *const rxrpc_pkts[] = { - "?00", - "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", - "?09", "?10", "?11", "?12", "VERSION", "?14", "?15" -}; - const s8 rxrpc_ack_priority[] = { [0] = 0, [RXRPC_ACK_DELAY] = 1, @@ -94,148 +88,3 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_NOSPACE] = 7, [RXRPC_ACK_PING_RESPONSE] = 8, }; - -const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = { - "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", - "IDL", "-?-" -}; - -const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { - [rxrpc_skb_rx_cleaned] = "Rx CLN", - [rxrpc_skb_rx_freed] = "Rx FRE", - [rxrpc_skb_rx_got] = "Rx GOT", - [rxrpc_skb_rx_lost] = "Rx *L*", - [rxrpc_skb_rx_received] = "Rx RCV", - [rxrpc_skb_rx_purged] = "Rx PUR", - [rxrpc_skb_rx_rotated] = "Rx ROT", - [rxrpc_skb_rx_seen] = "Rx SEE", - [rxrpc_skb_tx_cleaned] = "Tx CLN", - [rxrpc_skb_tx_freed] = "Tx FRE", - [rxrpc_skb_tx_got] = "Tx GOT", - [rxrpc_skb_tx_new] = "Tx NEW", - [rxrpc_skb_tx_rotated] = "Tx ROT", - [rxrpc_skb_tx_seen] = "Tx SEE", 
-}; - -const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { - [rxrpc_conn_new_client] = "NWc", - [rxrpc_conn_new_service] = "NWs", - [rxrpc_conn_queued] = "QUE", - [rxrpc_conn_seen] = "SEE", - [rxrpc_conn_got] = "GOT", - [rxrpc_conn_put_client] = "PTc", - [rxrpc_conn_put_service] = "PTs", -}; - -const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { - [rxrpc_client_activate_chans] = "Activa", - [rxrpc_client_alloc] = "Alloc ", - [rxrpc_client_chan_activate] = "ChActv", - [rxrpc_client_chan_disconnect] = "ChDisc", - [rxrpc_client_chan_pass] = "ChPass", - [rxrpc_client_chan_unstarted] = "ChUnst", - [rxrpc_client_cleanup] = "Clean ", - [rxrpc_client_count] = "Count ", - [rxrpc_client_discard] = "Discar", - [rxrpc_client_duplicate] = "Duplic", - [rxrpc_client_exposed] = "Expose", - [rxrpc_client_replace] = "Replac", - [rxrpc_client_to_active] = "->Actv", - [rxrpc_client_to_culled] = "->Cull", - [rxrpc_client_to_idle] = "->Idle", - [rxrpc_client_to_inactive] = "->Inac", - [rxrpc_client_to_waiting] = "->Wait", - [rxrpc_client_uncount] = "Uncoun", -}; - -const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { - [rxrpc_transmit_wait] = "WAI", - [rxrpc_transmit_queue] = "QUE", - [rxrpc_transmit_queue_last] = "QLS", - [rxrpc_transmit_rotate] = "ROT", - [rxrpc_transmit_rotate_last] = "RLS", - [rxrpc_transmit_await_reply] = "AWR", - [rxrpc_transmit_end] = "END", -}; - -const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { - [rxrpc_receive_incoming] = "INC", - [rxrpc_receive_queue] = "QUE", - [rxrpc_receive_queue_last] = "QLS", - [rxrpc_receive_front] = "FRN", - [rxrpc_receive_rotate] = "ROT", - [rxrpc_receive_end] = "END", -}; - -const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { - [rxrpc_recvmsg_enter] = "ENTR", - [rxrpc_recvmsg_wait] = "WAIT", - [rxrpc_recvmsg_dequeue] = "DEQU", - [rxrpc_recvmsg_hole] = "HOLE", - [rxrpc_recvmsg_next] = "NEXT", - [rxrpc_recvmsg_cont] = "CONT", - [rxrpc_recvmsg_full] = "FULL", - [rxrpc_recvmsg_data_return] = "DATA", - [rxrpc_recvmsg_terminal] = "TERM", - [rxrpc_recvmsg_to_be_accepted] = "TBAC", - [rxrpc_recvmsg_return] = "RETN", -}; - -const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { - [rxrpc_rtt_tx_ping] = "PING", - [rxrpc_rtt_tx_data] = "DATA", -}; - -const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { - [rxrpc_rtt_rx_ping_response] = "PONG", - [rxrpc_rtt_rx_requested_ack] = "RACK", -}; - -const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { - [rxrpc_timer_begin] = "Begin ", - [rxrpc_timer_expired] = "*EXPR*", - [rxrpc_timer_init_for_reply] = "IniRpl", - [rxrpc_timer_init_for_send_reply] = "SndRpl", - [rxrpc_timer_set_for_ack] = "SetAck", - [rxrpc_timer_set_for_ping] = "SetPng", - [rxrpc_timer_set_for_send] = "SetTx ", - [rxrpc_timer_set_for_resend] = "SetRTx", -}; - -const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { - [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", - [rxrpc_propose_ack_input_data] = "DataIn ", - [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck", - [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", - [rxrpc_propose_ack_ping_for_params] = "Params ", - [rxrpc_propose_ack_processing_op] = "ProcOp ", - [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", - [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", - [rxrpc_propose_ack_retry_tx] = "RetryTx", - [rxrpc_propose_ack_rotate_rx] = "RxAck ", - [rxrpc_propose_ack_terminal_ack] = "ClTerm ", -}; - -const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { - [rxrpc_propose_ack_use] = "", - 
[rxrpc_propose_ack_update] = " Update", - [rxrpc_propose_ack_subsume] = " Subsume", -}; - -const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = { - [RXRPC_CALL_SLOW_START] = "SlowStart", - [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid", - [RXRPC_CALL_PACKET_LOSS] = "PktLoss ", - [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ", -}; - -const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = { - [rxrpc_cong_begin_retransmission] = " Retrans", - [rxrpc_cong_cleared_nacks] = " Cleared", - [rxrpc_cong_new_low_nack] = " NewLowN", - [rxrpc_cong_no_change] = "", - [rxrpc_cong_progress] = " Progres", - [rxrpc_cong_retransmit_again] = " ReTxAgn", - [rxrpc_cong_rtt_window_end] = " RttWinE", - [rxrpc_cong_saw_nack] = " SawNack", -}; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 65cd980767fa..b9bcfbfb095c 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,6 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_sock *rx; struct rxrpc_peer *peer; struct rxrpc_call *call; + rxrpc_seq_t tx_hard_ack, rx_hard_ack; char lbuff[50], rbuff[50]; if (v == &rxrpc_calls) { @@ -82,9 +83,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) else strcpy(rbuff, "no_connection"); + tx_hard_ack = READ_ONCE(call->tx_hard_ack); + rx_hard_ack = READ_ONCE(call->rx_hard_ack); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" - " %-8.8s %08x %lx\n", + " %-8.8s %08x %lx %08x %02x %08x %02x\n", lbuff, rbuff, call->service_id, @@ -94,7 +97,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) atomic_read(&call->usage), rxrpc_call_states[call->state], call->abort_code, - call->user_call_ID); + call->user_call_ID, + tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, + rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack); return 0; } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index c29362d50a92..22447dbcc380 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -320,8 +320,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, /* Barriers against rxrpc_input_data(). */ hard_ack = call->rx_hard_ack; - top = smp_load_acquire(&call->rx_top); - for (seq = hard_ack + 1; before_eq(seq, top); seq++) { + seq = hard_ack + 1; + while (top = smp_load_acquire(&call->rx_top), + before_eq(seq, top) + ) { ix = seq & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; if (!skb) { @@ -394,6 +396,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ret = 1; goto out; } + + seq++; } out: @@ -483,6 +487,20 @@ try_again: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); + /* We're going to drop the socket lock, so we need to lock the call + * against interference by sendmsg. 
+ */ + if (!mutex_trylock(&call->user_mutex)) { + ret = -EWOULDBLOCK; + if (flags & MSG_DONTWAIT) + goto error_requeue_call; + ret = -ERESTARTSYS; + if (mutex_lock_interruptible(&call->user_mutex) < 0) + goto error_requeue_call; + } + + release_sock(&rx->sk); + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); @@ -498,7 +516,7 @@ try_again: &call->user_call_ID); } if (ret < 0) - goto error; + goto error_unlock_call; } if (msg->msg_name) { @@ -529,12 +547,12 @@ try_again: } if (ret < 0) - goto error; + goto error_unlock_call; if (call->state == RXRPC_CALL_COMPLETE) { ret = rxrpc_recvmsg_term(call, msg); if (ret < 0) - goto error; + goto error_unlock_call; if (!(flags & MSG_PEEK)) rxrpc_release_call(rx, call); msg->msg_flags |= MSG_EOR; @@ -547,8 +565,21 @@ try_again: msg->msg_flags &= ~MSG_MORE; ret = copied; -error: +error_unlock_call: + mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); + return ret; + +error_requeue_call: + if (!(flags & MSG_PEEK)) { + write_lock_bh(&rx->recvmsg_lock); + list_add(&call->recvmsg_link, &rx->recvmsg_q); + write_unlock_bh(&rx->recvmsg_lock); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0); + } else { + rxrpc_put_call(call, rxrpc_call_put); + } error_no_call: release_sock(&rx->sk); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); @@ -605,7 +636,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, iov.iov_len = size - *_offset; iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); switch (call->state) { case RXRPC_CALL_CLIENT_RECV_REPLY: @@ -644,7 +675,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, read_phase_complete: ret = 1; out: - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); return ret; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index b214a4d4a641..31c1538c1a8d 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -59,9 +59,12 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, } trace_rxrpc_transmit(call, rxrpc_transmit_wait); - release_sock(&rx->sk); + mutex_unlock(&call->user_mutex); *timeo = schedule_timeout(*timeo); - lock_sock(&rx->sk); + if (mutex_lock_interruptible(&call->user_mutex) < 0) { + ret = sock_intr_errno(*timeo); + break; + } } remove_wait_queue(&call->waitq, &myself); @@ -171,7 +174,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, /* * send data through a socket * - must be called in process context - * - caller holds the socket locked + * - The caller holds the call user access mutex, but not the socket lock. */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -376,7 +379,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, if (!CMSG_OK(msg, cmsg)) return -EINVAL; - len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + len = cmsg->cmsg_len - sizeof(struct cmsghdr); _debug("CMSG %d, %d, %d", cmsg->cmsg_level, cmsg->cmsg_type, len); @@ -437,10 +440,13 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, /* * Create a new client call for sendmsg(). + * - Called with the socket lock held, which it must release. + * - If it returns a call, the call's lock will need releasing by the caller. 
*/ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, unsigned long user_call_ID, bool exclusive) + __releases(&rx->sk.sk_lock.slock) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -450,8 +456,10 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, _enter(""); - if (!msg->msg_name) + if (!msg->msg_name) { + release_sock(&rx->sk); return ERR_PTR(-EDESTADDRREQ); + } key = rx->key; if (key && !rx->key->payload.data[0]) @@ -464,6 +472,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.exclusive = rx->exclusive | exclusive; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); + /* The socket is now unlocked */ _leave(" = %p\n", call); return call; @@ -475,6 +484,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) + __releases(&rx->sk.sk_lock.slock) { enum rxrpc_command cmd; struct rxrpc_call *call; @@ -488,12 +498,14 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, &exclusive); if (ret < 0) - return ret; + goto error_release_sock; if (cmd == RXRPC_CMD_ACCEPT) { + ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) - return -EINVAL; + goto error_release_sock; call = rxrpc_accept_call(rx, user_call_ID, NULL); + /* The socket is now unlocked. */ if (IS_ERR(call)) return PTR_ERR(call); rxrpc_put_call(call, rxrpc_call_put); @@ -502,12 +514,29 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) call = rxrpc_find_call_by_user_ID(rx, user_call_ID); if (!call) { + ret = -EBADSLT; if (cmd != RXRPC_CMD_SEND_DATA) - return -EBADSLT; + goto error_release_sock; + ret = -EBUSY; + if (call->state == RXRPC_CALL_UNINITIALISED || + call->state == RXRPC_CALL_CLIENT_AWAIT_CONN || + call->state == RXRPC_CALL_SERVER_PREALLOC || + call->state == RXRPC_CALL_SERVER_SECURING || + call->state == RXRPC_CALL_SERVER_ACCEPTING) + goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, exclusive); + /* The socket is now unlocked... */ if (IS_ERR(call)) return PTR_ERR(call); + /* ... and we have the call lock. 
*/ + } else { + ret = mutex_lock_interruptible(&call->user_mutex); + release_sock(&rx->sk); + if (ret < 0) { + ret = -ERESTARTSYS; + goto error_put; + } } _debug("CALL %d USR %lx ST %d on CONN %p", @@ -535,9 +564,15 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_data(rx, call, msg, len); } + mutex_unlock(&call->user_mutex); +error_put: rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; } /** @@ -562,7 +597,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); @@ -577,7 +612,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); } - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } @@ -598,12 +633,12 @@ void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, { _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); - lock_sock(sock->sk); + mutex_lock(&call->user_mutex); if (rxrpc_abort_call(why, call, 0, abort_code, error)) rxrpc_send_abort_packet(call); - release_sock(sock->sk); + mutex_unlock(&call->user_mutex); _leave(""); } diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 87956a768d1b..403790cce7d2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -650,6 +650,18 @@ config NET_ACT_MIRRED To compile this code as a module, choose M here: the module will be called act_mirred. +config NET_ACT_SAMPLE + tristate "Traffic Sampling" + depends on NET_CLS_ACT + select PSAMPLE + ---help--- + Say Y here to allow packet sampling tc action. The packet sample + action consists of statistically choosing packets and sampling + them using the psample module. + + To compile this code as a module, choose M here: the + module will be called act_sample. + config NET_ACT_IPT tristate "IPtables targets" depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES @@ -707,6 +719,7 @@ config NET_ACT_SKBEDIT config NET_ACT_CSUM tristate "Checksum Updating" depends on NET_CLS_ACT && INET + select LIBCRC32C ---help--- Say Y here to update some common checksum after some direct packet alterations. 
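The "select LIBCRC32C" added to NET_ACT_CSUM above is what makes the new SCTP support work: SCTP does not carry a 16-bit Internet checksum, its checksum field is a CRC32c over the whole packet, so act_csum needs the kernel crc32c library behind sctp_compute_cksum(). A minimal sketch of that recompute step follows; the helper name is invented for illustration and it assumes the SCTP header is linear and the skb writable, whereas the authoritative version is the tcf_csum_sctp() hunk in the act_csum.c diff below, which goes through tcf_csum_skb_nextlayer() to guarantee both.

#include <linux/skbuff.h>
#include <linux/sctp.h>
#include <net/sctp/checksum.h>

/* Illustrative sketch only: refresh the SCTP CRC32c after the packet was
 * edited. ihl is the length of the preceding IP header, so the SCTP header
 * starts at the network header + ihl. Assumes that region is linear and
 * writable.
 */
static int sketch_refresh_sctp_crc32c(struct sk_buff *skb, unsigned int ihl)
{
	struct sctphdr *sctph;

	sctph = (struct sctphdr *)(skb_network_header(skb) + ihl);
	sctph->checksum = sctp_compute_cksum(skb, skb_network_offset(skb) + ihl);
	skb->ip_summed = CHECKSUM_NONE;	/* checksum is now carried in the packet itself */

	return 1;	/* mirrors act_csum's "1 on success" convention */
}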
@@ -763,6 +776,7 @@ config NET_ACT_SKBMOD config NET_ACT_IFE tristate "Inter-FE action based on IETF ForCES InterFE LFB" depends on NET_CLS_ACT + select NET_IFE ---help--- Say Y here to allow for sourcing and terminating metadata For details refer to netdev01 paper: diff --git a/net/sched/Makefile b/net/sched/Makefile index 4bdda3634e0b..7b915d226de7 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += act_police.o obj-$(CONFIG_NET_ACT_GACT) += act_gact.o obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o +obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2095c83ce773..b70aa57319ea 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -24,6 +24,7 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> +#include <net/pkt_cls.h> #include <net/act_api.h> #include <net/netlink.h> @@ -33,6 +34,12 @@ static void free_tcf(struct rcu_head *head) free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); + + if (p->act_cookie) { + kfree(p->act_cookie->data); + kfree(p->act_cookie); + } + kfree(p); } @@ -426,11 +433,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, { int ret = -1, i; - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - ret = TC_ACT_OK; - goto exec_done; - } + if (skb_skip_tc_classify(skb)) + return TC_ACT_OK; + for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; @@ -439,9 +444,8 @@ repeat: if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) - goto exec_done; + break; } -exec_done: return ret; } EXPORT_SYMBOL(tcf_action_exec); @@ -478,6 +482,12 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; + if (a->act_cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len, + a->act_cookie->data)) + goto nla_put_failure; + } + nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; @@ -519,6 +529,22 @@ errout: return err; } +static int nla_memdup_cookie(struct tc_action *a, struct nlattr **tb) +{ + a->act_cookie = kzalloc(sizeof(*a->act_cookie), GFP_KERNEL); + if (!a->act_cookie) + return -ENOMEM; + + a->act_cookie->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL); + if (!a->act_cookie->data) { + kfree(a->act_cookie); + return -ENOMEM; + } + a->act_cookie->len = nla_len(tb[TCA_ACT_COOKIE]); + + return 0; +} + struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind) @@ -578,6 +604,22 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, if (err < 0) goto err_mod; + if (tb[TCA_ACT_COOKIE]) { + int cklen = nla_len(tb[TCA_ACT_COOKIE]); + + if (cklen > TC_COOKIE_MAX_SIZE) { + err = -EINVAL; + tcf_hash_release(a, bind); + goto err_mod; + } + + if (nla_memdup_cookie(a, tb) < 0) { + err = -ENOMEM; + tcf_hash_release(a, bind); + goto err_mod; + } + } + /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then * ACT_P_CREATED is not returned (a zero is). 
@@ -817,10 +859,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, goto out_module_put; err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); - if (err < 0) + if (err <= 0) goto out_module_put; - if (err == 0) - goto noflush_out; nla_nest_end(skb, nest); @@ -837,7 +877,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, out_module_put: module_put(ops->owner); err_out: -noflush_out: kfree_skb(skb); return err; } @@ -900,8 +939,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } act->order = i; - if (event == RTM_GETACTION) - act->tcfa_refcnt++; list_add_tail(&act->list, &actions); } @@ -914,7 +951,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; } err: - tcf_action_destroy(&actions, 0); + if (event != RTM_GETACTION) + tcf_action_destroy(&actions, 0); return ret; } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 1c60317f0121..520baa41cba3 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -123,12 +123,11 @@ static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; - nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST, - sizeof(prog->filter->digest)); + nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag)); if (nla == NULL) return -EMSGSIZE; - memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + memcpy(nla_data(nla), prog->filter->tag, nla_len(nla)); return 0; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index a0edd80a44db..e978ccd4402c 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -30,6 +30,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/ip6_checksum.h> +#include <net/sctp/checksum.h> #include <net/act_api.h> @@ -322,6 +323,25 @@ ignore_obscure_skb: return 1; } +static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) +{ + struct sctphdr *sctph; + + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) + return 1; + + sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph)); + if (!sctph) + return 0; + + sctph->checksum = sctp_compute_cksum(skb, + skb_network_offset(skb) + ihl); + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) { const struct iphdr *iph; @@ -365,6 +385,11 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) ntohs(iph->tot_len), 1)) goto fail; break; + case IPPROTO_SCTP: + if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) && + !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len))) + goto fail; + break; } if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { @@ -481,6 +506,11 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) pl + sizeof(*ip6h), 1)) goto fail; goto done; + case IPPROTO_SCTP: + if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) && + !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h))) + goto fail; + goto done; default: goto ignore_skb; } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 80b848d3f096..71e7ff22f7c9 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -32,6 +32,7 @@ #include <uapi/linux/tc_act/tc_ife.h> #include <net/tc_act/tc_ife.h> #include <linux/etherdevice.h> +#include <net/ife.h> #define IFE_TAB_MASK 15 @@ -46,23 +47,6 @@ static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { [TCA_IFE_TYPE] = { .type = NLA_U16}, }; -/* Caller takes care of presenting data in network order -*/ -int ife_tlv_meta_encode(void *skbdata, u16 attrtype, 
u16 dlen, const void *dval) -{ - u32 *tlv = (u32 *)(skbdata); - u16 totlen = nla_total_size(dlen); /*alignment + hdr */ - char *dptr = (char *)tlv + NLA_HDRLEN; - u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN); - - *tlv = htonl(htlv); - memset(dptr, 0, totlen - NLA_HDRLEN); - memcpy(dptr, dval, dlen); - - return totlen; -} -EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); - int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) { u16 edata = 0; @@ -637,69 +621,59 @@ int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, return 0; } -struct ifeheadr { - __be16 metalen; - u8 tlv_data[]; -}; - -struct meta_tlvhdr { - __be16 type; - __be16 len; -}; - static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; - struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; - int ifehdrln = (int)ifehdr->metalen; - struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data); + u8 *ifehdr_end; + u8 *tlv_data; + u16 metalen; spin_lock(&ife->tcf_lock); bstats_update(&ife->tcf_bstats, skb); tcf_lastuse_update(&ife->tcf_tm); spin_unlock(&ife->tcf_lock); - ifehdrln = ntohs(ifehdrln); - if (unlikely(!pskb_may_pull(skb, ifehdrln))) { + if (skb_at_tc_ingress(skb)) + skb_push(skb, skb->dev->hard_header_len); + + tlv_data = ife_decode(skb, &metalen); + if (unlikely(!tlv_data)) { spin_lock(&ife->tcf_lock); ife->tcf_qstats.drops++; spin_unlock(&ife->tcf_lock); return TC_ACT_SHOT; } - skb_set_mac_header(skb, ifehdrln); - __skb_pull(skb, ifehdrln); - skb->protocol = eth_type_trans(skb, skb->dev); - ifehdrln -= IFE_METAHDRLEN; - - while (ifehdrln > 0) { - u8 *tlvdata = (u8 *)tlv; - u16 mtype = tlv->type; - u16 mlen = tlv->len; - u16 alen; + ifehdr_end = tlv_data + metalen; + for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) { + u8 *curr_data; + u16 mtype; + u16 dlen; - mtype = ntohs(mtype); - mlen = ntohs(mlen); - alen = NLA_ALIGN(mlen); + curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL); - if (find_decode_metaid(skb, ife, mtype, (mlen - NLA_HDRLEN), - (void *)(tlvdata + NLA_HDRLEN))) { + if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata * but dont have an ops for it */ - pr_info_ratelimited("Unknown metaid %d alnlen %d\n", - mtype, mlen); + pr_info_ratelimited("Unknown metaid %d dlen %d\n", + mtype, dlen); ife->tcf_qstats.overlimits++; } + } - tlvdata += alen; - ifehdrln -= alen; - tlv = (struct meta_tlvhdr *)tlvdata; + if (WARN_ON(tlv_data != ifehdr_end)) { + spin_lock(&ife->tcf_lock); + ife->tcf_qstats.drops++; + spin_unlock(&ife->tcf_lock); + return TC_ACT_SHOT; } + skb->protocol = eth_type_trans(skb, skb->dev); skb_reset_network_header(skb); + return action; } @@ -727,7 +701,6 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ethhdr *oethh; /* outer ether header */ - struct ethhdr *iethh; /* inner eth header */ struct tcf_meta_info *e; /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA @@ -735,13 +708,13 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, */ u16 metalen = ife_get_sz(skb, ife); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; - unsigned int skboff = skb->dev->hard_header_len; - u32 at = G_TC_AT(skb->tc_verd); + unsigned int skboff = 0; int new_len = skb->len + hdrm; bool exceed_mtu = false; - int err; + void 
*ife_meta; + int err = 0; - if (at & AT_EGRESS) { + if (!skb_at_tc_ingress(skb)) { if (new_len > skb->dev->mtu) exceed_mtu = true; } @@ -766,27 +739,10 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - err = skb_cow_head(skb, hdrm); - if (unlikely(err)) { - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); - return TC_ACT_SHOT; - } - - if (!(at & AT_EGRESS)) + if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); - iethh = (struct ethhdr *)skb->data; - __skb_push(skb, hdrm); - memcpy(skb->data, iethh, skb->mac_len); - skb_reset_mac_header(skb); - oethh = eth_hdr(skb); - - /*total metadata length */ - metalen += IFE_METAHDRLEN; - metalen = htons(metalen); - memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN); - skboff += IFE_METAHDRLEN; + ife_meta = ife_encode(skb, metalen); /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by @@ -794,7 +750,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, */ list_for_each_entry(e, &ife->metalist, metalist) { if (e->ops->encode) { - err = e->ops->encode(skb, (void *)(skb->data + skboff), + err = e->ops->encode(skb, (void *)(ife_meta + skboff), e); } if (err < 0) { @@ -805,18 +761,15 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, } skboff += err; } + oethh = (struct ethhdr *)skb->data; if (!is_zero_ether_addr(ife->eth_src)) ether_addr_copy(oethh->h_source, ife->eth_src); - else - ether_addr_copy(oethh->h_source, iethh->h_source); if (!is_zero_ether_addr(ife->eth_dst)) ether_addr_copy(oethh->h_dest, ife->eth_dst); - else - ether_addr_copy(oethh->h_dest, iethh->h_dest); oethh->h_proto = htons(ife->eth_type); - if (!(at & AT_EGRESS)) + if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); spin_unlock(&ife->tcf_lock); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 2d9fa6e0a1b4..af49c7dca860 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -28,8 +28,6 @@ #include <linux/tc_act/tc_mirred.h> #include <net/tc_act/tc_mirred.h> -#include <linux/if_arp.h> - #define MIRRED_TAB_MASK 7 static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); @@ -39,15 +37,15 @@ static bool tcf_mirred_is_act_redirect(int action) return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; } -static u32 tcf_mirred_act_direction(int action) +static bool tcf_mirred_act_wants_ingress(int action) { switch (action) { case TCA_EGRESS_REDIR: case TCA_EGRESS_MIRROR: - return AT_EGRESS; + return false; case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: - return AT_INGRESS; + return true; default: BUG(); } @@ -170,7 +168,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, int retval, err = 0; int m_eaction; int mac_len; - u32 at; tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -191,7 +188,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, goto out; } - at = G_TC_AT(skb->tc_verd); skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out; @@ -200,8 +196,9 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, * and devices expect a mac header on xmit, then mac push/pull is * needed. 
*/ - if (at != tcf_mirred_act_direction(m_eaction) && m_mac_header_xmit) { - if (at & AT_EGRESS) { + if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) && + m_mac_header_xmit) { + if (!skb_at_tc_ingress(skb)) { /* caught at egress, act ingress: pull mac */ mac_len = skb_network_header(skb) - skb_mac_header(skb); skb_pull_rcsum(skb2, mac_len); @@ -212,12 +209,14 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } /* mirror is always swallowed */ - if (tcf_mirred_is_act_redirect(m_eaction)) - skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); + if (tcf_mirred_is_act_redirect(m_eaction)) { + skb2->tc_redirected = 1; + skb2->tc_from_ingress = skb2->tc_at_ingress; + } skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; - if (tcf_mirred_act_direction(m_eaction) & AT_EGRESS) + if (!tcf_mirred_act_wants_ingress(m_eaction)) err = dev_queue_xmit(skb2); else err = netif_receive_skb(skb2); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b27c4daec88f..c1310472f620 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -22,6 +22,7 @@ #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> #include <net/tc_act/tc_pedit.h> +#include <uapi/linux/tc_act/tc_pedit.h> #define PEDIT_TAB_MASK 15 @@ -30,18 +31,117 @@ static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, + [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED }, }; +static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = { + [TCA_PEDIT_KEY_EX_HTYPE] = { .type = NLA_U16 }, + [TCA_PEDIT_KEY_EX_CMD] = { .type = NLA_U16 }, +}; + +static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, + u8 n) +{ + struct tcf_pedit_key_ex *keys_ex; + struct tcf_pedit_key_ex *k; + const struct nlattr *ka; + int err = -EINVAL; + int rem; + + if (!nla || !n) + return NULL; + + keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); + if (!keys_ex) + return ERR_PTR(-ENOMEM); + + k = keys_ex; + + nla_for_each_nested(ka, nla, rem) { + struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1]; + + if (!n) { + err = -EINVAL; + goto err_out; + } + n--; + + if (nla_type(ka) != TCA_PEDIT_KEY_EX) { + err = -EINVAL; + goto err_out; + } + + err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka, + pedit_key_ex_policy); + if (err) + goto err_out; + + if (!tb[TCA_PEDIT_KEY_EX_HTYPE] || + !tb[TCA_PEDIT_KEY_EX_CMD]) { + err = -EINVAL; + goto err_out; + } + + k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); + k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]); + + if (k->htype > TCA_PEDIT_HDR_TYPE_MAX || + k->cmd > TCA_PEDIT_CMD_MAX) { + err = -EINVAL; + goto err_out; + } + + k++; + } + + if (n) + goto err_out; + + return keys_ex; + +err_out: + kfree(keys_ex); + return ERR_PTR(err); +} + +static int tcf_pedit_key_ex_dump(struct sk_buff *skb, + struct tcf_pedit_key_ex *keys_ex, int n) +{ + struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX); + + for (; n > 0; n--) { + struct nlattr *key_start; + + key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX); + + if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) || + nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) { + nlmsg_trim(skb, keys_start); + return -EINVAL; + } + + nla_nest_end(skb, key_start); + + keys_ex++; + } + + nla_nest_end(skb, keys_start); + + return 0; +} + static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = 
net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; + struct nlattr *pattr; struct tc_pedit *parm; int ret = 0, err; struct tcf_pedit *p; struct tc_pedit_key *keys = NULL; + struct tcf_pedit_key_ex *keys_ex; int ksize; if (nla == NULL) @@ -51,13 +151,21 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (err < 0) return err; - if (tb[TCA_PEDIT_PARMS] == NULL) + pattr = tb[TCA_PEDIT_PARMS]; + if (!pattr) + pattr = tb[TCA_PEDIT_PARMS_EX]; + if (!pattr) return -EINVAL; - parm = nla_data(tb[TCA_PEDIT_PARMS]); + + parm = nla_data(pattr); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) + if (nla_len(pattr) < sizeof(*parm) + ksize) return -EINVAL; + keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); + if (IS_ERR(keys_ex)) + return PTR_ERR(keys_ex); + if (!tcf_hash_check(tn, parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; @@ -69,6 +177,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { tcf_hash_cleanup(*a, est); + kfree(keys_ex); return -ENOMEM; } ret = ACT_P_CREATED; @@ -81,8 +190,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); - if (keys == NULL) + if (!keys) { + kfree(keys_ex); return -ENOMEM; + } } } @@ -95,6 +206,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, p->tcfp_nkeys = parm->nkeys; } memcpy(p->tcfp_keys, parm->keys, ksize); + + kfree(p->tcfp_keys_ex); + p->tcfp_keys_ex = keys_ex; + spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) tcf_hash_insert(tn, *a); @@ -106,6 +221,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind) struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; kfree(keys); + kfree(p->tcfp_keys_ex); } static bool offset_valid(struct sk_buff *skb, int offset) @@ -119,38 +235,88 @@ static bool offset_valid(struct sk_buff *skb, int offset) return true; } +static int pedit_skb_hdr_offset(struct sk_buff *skb, + enum pedit_header_type htype, int *hoffset) +{ + int ret = -EINVAL; + + switch (htype) { + case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: + if (skb_mac_header_was_set(skb)) { + *hoffset = skb_mac_offset(skb); + ret = 0; + } + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK: + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: + *hoffset = skb_network_offset(skb); + ret = 0; + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: + case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: + if (skb_transport_header_was_set(skb)) { + *hoffset = skb_transport_offset(skb); + ret = 0; + } + break; + default: + ret = -EINVAL; + break; + }; + + return ret; +} + static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = to_pedit(a); int i; - unsigned int off; if (skb_unclone(skb, GFP_ATOMIC)) return p->tcf_action; - off = skb_network_offset(skb); - spin_lock(&p->tcf_lock); tcf_lastuse_update(&p->tcf_tm); if (p->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = p->tcfp_keys; + struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; + enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { u32 *ptr, _data; int offset = tkey->off; + int hoffset; + u32 val; + int rc; + + if (tkey_ex) { + htype = tkey_ex->htype; + cmd = tkey_ex->cmd; + + tkey_ex++; + } + + rc = 
pedit_skb_hdr_offset(skb, htype, &hoffset); + if (rc) { + pr_info("tc filter pedit bad header type specified (0x%x)\n", + htype); + goto bad; + } if (tkey->offmask) { char *d, _d; - if (!offset_valid(skb, off + tkey->at)) { + if (!offset_valid(skb, hoffset + tkey->at)) { pr_info("tc filter pedit 'at' offset %d out of bounds\n", - off + tkey->at); + hoffset + tkey->at); goto bad; } - d = skb_header_pointer(skb, off + tkey->at, 1, + d = skb_header_pointer(skb, hoffset + tkey->at, 1, &_d); if (!d) goto bad; @@ -163,19 +329,32 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, goto bad; } - if (!offset_valid(skb, off + offset)) { + if (!offset_valid(skb, hoffset + offset)) { pr_info("tc filter pedit offset %d out of bounds\n", - offset); + hoffset + offset); goto bad; } - ptr = skb_header_pointer(skb, off + offset, 4, &_data); + ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data); if (!ptr) goto bad; /* just do it, baby */ - *ptr = ((*ptr & tkey->mask) ^ tkey->val); + switch (cmd) { + case TCA_PEDIT_KEY_EX_CMD_SET: + val = tkey->val; + break; + case TCA_PEDIT_KEY_EX_CMD_ADD: + val = (*ptr + tkey->val) & ~tkey->mask; + break; + default: + pr_info("tc filter pedit bad command (%d)\n", + cmd); + goto bad; + } + + *ptr = ((*ptr & tkey->mask) ^ val); if (ptr == &_data) - skb_store_bits(skb, off + offset, ptr, 4); + skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; @@ -215,8 +394,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->refcnt = p->tcf_refcnt - ref; opt->bindcnt = p->tcf_bindcnt - bind; - if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) - goto nla_put_failure; + if (p->tcfp_keys_ex) { + tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); + + if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) + goto nla_put_failure; + } else { + if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) + goto nla_put_failure; + } tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c new file mode 100644 index 000000000000..0b8217b4763f --- /dev/null +++ b/net/sched/act_sample.c @@ -0,0 +1,276 @@ +/* + * net/sched/act_sample.c - Packet sampling tc action + * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/gfp.h> +#include <net/net_namespace.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <linux/tc_act/tc_sample.h> +#include <net/tc_act/tc_sample.h> +#include <net/psample.h> + +#include <linux/if_arp.h> + +#define SAMPLE_TAB_MASK 7 +static unsigned int sample_net_id; +static struct tc_action_ops act_sample_ops; + +static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { + [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) }, + [TCA_SAMPLE_RATE] = { .type = NLA_U32 }, + [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 }, + [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 }, +}; + +static int tcf_sample_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, int ovr, + int bind) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + struct nlattr *tb[TCA_SAMPLE_MAX + 1]; + struct psample_group *psample_group; + struct tc_sample *parm; + struct tcf_sample *s; + bool exists = false; + int ret; + + if (!nla) + return -EINVAL; + ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy); + if (ret < 0) + return ret; + if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] || + !tb[TCA_SAMPLE_PSAMPLE_GROUP]) + return -EINVAL; + + parm = nla_data(tb[TCA_SAMPLE_PARMS]); + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_sample_ops, bind, false); + if (ret) + return ret; + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + s = to_sample(*a); + + s->tcf_action = parm->action; + s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); + s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); + psample_group = psample_group_get(net, s->psample_group_num); + if (!psample_group) { + if (ret == ACT_P_CREATED) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + RCU_INIT_POINTER(s->psample_group, psample_group); + + if (tb[TCA_SAMPLE_TRUNC_SIZE]) { + s->truncate = true; + s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); + } + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + return ret; +} + +static void tcf_sample_cleanup_rcu(struct rcu_head *rcu) +{ + struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu); + struct psample_group *psample_group; + + psample_group = rcu_dereference_protected(s->psample_group, 1); + RCU_INIT_POINTER(s->psample_group, NULL); + psample_group_put(psample_group); +} + +static void tcf_sample_cleanup(struct tc_action *a, int bind) +{ + struct tcf_sample *s = to_sample(a); + + call_rcu(&s->rcu, tcf_sample_cleanup_rcu); +} + +static bool tcf_sample_dev_ok_push(struct net_device *dev) +{ + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + return false; + default: + return true; + } +} + +static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_sample *s = to_sample(a); + struct psample_group *psample_group; + int retval; + int size; + int iif; + int oif; + + tcf_lastuse_update(&s->tcf_tm); + bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); + retval = READ_ONCE(s->tcf_action); + + rcu_read_lock(); + psample_group = 
rcu_dereference(s->psample_group); + + /* randomly sample packets according to rate */ + if (psample_group && (prandom_u32() % s->rate == 0)) { + if (!skb_at_tc_ingress(skb)) { + iif = skb->skb_iif; + oif = skb->dev->ifindex; + } else { + iif = skb->dev->ifindex; + oif = 0; + } + + /* on ingress, the mac header gets popped, so push it back */ + if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) + skb_push(skb, skb->mac_len); + + size = s->truncate ? s->trunc_size : skb->len; + psample_sample_packet(psample_group, skb, size, iif, oif, + s->rate); + + if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) + skb_pull(skb, skb->mac_len); + } + + rcu_read_unlock(); + return retval; +} + +static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_sample *s = to_sample(a); + struct tc_sample opt = { + .index = s->tcf_index, + .action = s->tcf_action, + .refcnt = s->tcf_refcnt - ref, + .bindcnt = s->tcf_bindcnt - bind, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + tcf_tm_dump(&t, &s->tcf_tm); + if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate)) + goto nla_put_failure; + + if (s->truncate) + if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_sample_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_sample_ops = { + .kind = "sample", + .type = TCA_ACT_SAMPLE, + .owner = THIS_MODULE, + .act = tcf_sample_act, + .dump = tcf_sample_dump, + .init = tcf_sample_init, + .cleanup = tcf_sample_cleanup, + .walk = tcf_sample_walker, + .lookup = tcf_sample_search, + .size = sizeof(struct tcf_sample), +}; + +static __net_init int sample_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK); +} + +static void __net_exit sample_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations sample_net_ops = { + .init = sample_init_net, + .exit = sample_exit_net, + .id = &sample_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init sample_init_module(void) +{ + return tcf_register_action(&act_sample_ops, &sample_net_ops); +} + +static void __exit sample_cleanup_module(void) +{ + tcf_unregister_action(&act_sample_ops, &sample_net_ops); +} + +module_init(sample_init_module); +module_exit(sample_cleanup_module); + +MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>"); +MODULE_DESCRIPTION("Packet sampling action"); +MODULE_LICENSE("GPL v2"); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 3fbba79a4ef0..732f7cae459d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -19,6 +19,7 @@ 
#include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> +#include <linux/err.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> @@ -38,14 +39,14 @@ static DEFINE_RWLOCK(cls_mod_lock); /* Find classifier type by string name */ -static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) +static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind) { const struct tcf_proto_ops *t, *res = NULL; if (kind) { read_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { - if (nla_strcmp(kind, t->kind) == 0) { + if (strcmp(kind, t->kind) == 0) { if (try_module_get(t->owner)) res = t; break; @@ -127,6 +128,77 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) return first; } +static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, + u32 prio, u32 parent, struct Qdisc *q) +{ + struct tcf_proto *tp; + int err; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOBUFS); + + err = -ENOENT; + tp->ops = tcf_proto_lookup_ops(kind); + if (!tp->ops) { +#ifdef CONFIG_MODULES + rtnl_unlock(); + request_module("cls_%s", kind); + rtnl_lock(); + tp->ops = tcf_proto_lookup_ops(kind); + /* We dropped the RTNL semaphore in order to perform + * the module load. So, even if we succeeded in loading + * the module we have to replay the request. We indicate + * this using -EAGAIN. + */ + if (tp->ops) { + module_put(tp->ops->owner); + err = -EAGAIN; + } else { + err = -ENOENT; + } + goto errout; +#endif + } + tp->classify = tp->ops->classify; + tp->protocol = protocol; + tp->prio = prio; + tp->classid = parent; + tp->q = q; + + err = tp->ops->init(tp); + if (err) { + module_put(tp->ops->owner); + goto errout; + } + return tp; + +errout: + kfree(tp); + return ERR_PTR(err); +} + +static bool tcf_proto_destroy(struct tcf_proto *tp, bool force) +{ + if (tp->ops->destroy(tp, force)) { + module_put(tp->ops->owner); + kfree_rcu(tp, rcu); + return true; + } + return false; +} + +void tcf_destroy_chain(struct tcf_proto __rcu **fl) +{ + struct tcf_proto *tp; + + while ((tp = rtnl_dereference(*fl)) != NULL) { + RCU_INIT_POINTER(*fl, tp->next); + tcf_proto_destroy(tp, true); + } +} +EXPORT_SYMBOL(tcf_destroy_chain); + /* Add/change/delete/get a filter node */ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) @@ -142,19 +214,21 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) struct Qdisc *q; struct tcf_proto __rcu **back; struct tcf_proto __rcu **chain; + struct tcf_proto *next; struct tcf_proto *tp; - const struct tcf_proto_ops *tp_ops; const struct Qdisc_class_ops *cops; unsigned long cl; unsigned long fh; int err; - int tp_created = 0; + int tp_created; if ((n->nlmsg_type != RTM_GETTFILTER) && !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; replay: + tp_created = 0; + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); if (err < 0) return err; @@ -220,9 +294,10 @@ replay: /* And the last stroke */ chain = cops->tcf_chain(q, cl); - err = -EINVAL; - if (chain == NULL) + if (chain == NULL) { + err = -EINVAL; goto errout; + } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); tcf_destroy_chain(chain); @@ -237,10 +312,13 @@ replay: if (tp->prio >= prio) { if (tp->prio == prio) { if (!nprio || - (tp->protocol != protocol && protocol)) + (tp->protocol != protocol && protocol)) { + err = -EINVAL; goto errout; - } else + } + } else { tp = NULL; + } break; } } @@ -248,109 +326,69 @@ 
replay: if (tp == NULL) { /* Proto-tcf does not exist, create new one */ - if (tca[TCA_KIND] == NULL || !protocol) + if (tca[TCA_KIND] == NULL || !protocol) { + err = -EINVAL; goto errout; + } - err = -ENOENT; if (n->nlmsg_type != RTM_NEWTFILTER || - !(n->nlmsg_flags & NLM_F_CREATE)) + !(n->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; goto errout; + } + if (!nprio) + nprio = TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); - /* Create new proto tcf */ - - err = -ENOBUFS; - tp = kzalloc(sizeof(*tp), GFP_KERNEL); - if (tp == NULL) - goto errout; - err = -ENOENT; - tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]); - if (tp_ops == NULL) { -#ifdef CONFIG_MODULES - struct nlattr *kind = tca[TCA_KIND]; - char name[IFNAMSIZ]; - - if (kind != NULL && - nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { - rtnl_unlock(); - request_module("cls_%s", name); - rtnl_lock(); - tp_ops = tcf_proto_lookup_ops(kind); - /* We dropped the RTNL semaphore in order to - * perform the module load. So, even if we - * succeeded in loading the module we have to - * replay the request. We indicate this using - * -EAGAIN. - */ - if (tp_ops != NULL) { - module_put(tp_ops->owner); - err = -EAGAIN; - } - } -#endif - kfree(tp); - goto errout; - } - tp->ops = tp_ops; - tp->protocol = protocol; - tp->prio = nprio ? : - TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); - tp->q = q; - tp->classify = tp_ops->classify; - tp->classid = parent; - - err = tp_ops->init(tp); - if (err != 0) { - module_put(tp_ops->owner); - kfree(tp); + tp = tcf_proto_create(nla_data(tca[TCA_KIND]), + protocol, nprio, parent, q); + if (IS_ERR(tp)) { + err = PTR_ERR(tp); goto errout; } - tp_created = 1; - - } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) + } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { + err = -EINVAL; goto errout; + } fh = tp->ops->get(tp, t->tcm_handle); if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - struct tcf_proto *next = rtnl_dereference(tp->next); - + next = rtnl_dereference(tp->next); RCU_INIT_POINTER(*back, next); - tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER, false); - tcf_destroy(tp, true); + tcf_proto_destroy(tp, true); err = 0; goto errout; } - err = -ENOENT; if (n->nlmsg_type != RTM_NEWTFILTER || - !(n->nlmsg_flags & NLM_F_CREATE)) + !(n->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; goto errout; + } } else { switch (n->nlmsg_type) { case RTM_NEWTFILTER: - err = -EEXIST; if (n->nlmsg_flags & NLM_F_EXCL) { if (tp_created) - tcf_destroy(tp, true); + tcf_proto_destroy(tp, true); + err = -EEXIST; goto errout; } break; case RTM_DELTFILTER: err = tp->ops->delete(tp, fh); - if (err == 0) { - struct tcf_proto *next = rtnl_dereference(tp->next); - - tfilter_notify(net, skb, n, tp, - t->tcm_handle, - RTM_DELTFILTER, false); - if (tcf_destroy(tp, false)) - RCU_INIT_POINTER(*back, next); - } + if (err) + goto errout; + next = rtnl_dereference(tp->next); + tfilter_notify(net, skb, n, tp, t->tcm_handle, + RTM_DELTFILTER, false); + if (tcf_proto_destroy(tp, false)) + RCU_INIT_POINTER(*back, next); goto errout; case RTM_GETTFILTER: err = tfilter_notify(net, skb, n, tp, fh, @@ -372,7 +410,7 @@ replay: tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); } else { if (tp_created) - tcf_destroy(tp, true); + tcf_proto_destroy(tp, true); } errout: diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index adc776048d1a..80f688436dd7 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -148,6 +148,7 @@ static int cls_bpf_offload_cmd(struct 
tcf_proto *tp, struct cls_bpf_prog *prog, struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_bpf_offload bpf_offload = {}; struct tc_to_netdev offload; + int err; offload.type = TC_SETUP_CLSBPF; offload.cls_bpf = &bpf_offload; @@ -159,8 +160,13 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, bpf_offload.exts_integrated = prog->exts_integrated; bpf_offload.gen_flags = prog->gen_flags; - return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, - tp->protocol, &offload); + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + + if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE)) + prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; + + return err; } static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, @@ -511,6 +517,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, return ret; } + if (!tc_in_hw(prog->gen_flags)) + prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; + if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); @@ -555,11 +564,11 @@ static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; - nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest)); + nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag)); if (nla == NULL) return -EMSGSIZE; - memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + memcpy(nla_data(nla), prog->filter->tag, nla_len(nla)); return 0; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 6575aba87630..3d6b9286c203 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -129,7 +129,7 @@ static u32 flow_get_mark(const struct sk_buff *skb) static u32 flow_get_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) - return addr_fold(skb->nfct); + return addr_fold(skb_nfct(skb)); #else return 0; #endif diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 333f8e268431..9d0c99d2e9fb 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -40,6 +40,7 @@ struct fl_flow_key { }; struct flow_dissector_key_ports tp; struct flow_dissector_key_icmp icmp; + struct flow_dissector_key_arp arp; struct flow_dissector_key_keyid enc_key_id; union { struct flow_dissector_key_ipv4_addrs enc_ipv4; @@ -133,6 +134,14 @@ static void fl_clear_masked_range(struct fl_flow_key *key, memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); } +static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head, + struct fl_flow_key *mkey) +{ + return rhashtable_lookup_fast(&head->ht, + fl_key_get_start(mkey, &head->mask), + head->ht_params); +} + static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -153,10 +162,14 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, switch (ip_tunnel_info_af(info)) { case AF_INET: + skb_key.enc_control.addr_type = + FLOW_DISSECTOR_KEY_IPV4_ADDRS; skb_key.enc_ipv4.src = key->u.ipv4.src; skb_key.enc_ipv4.dst = key->u.ipv4.dst; break; case AF_INET6: + skb_key.enc_control.addr_type = + FLOW_DISSECTOR_KEY_IPV6_ADDRS; skb_key.enc_ipv6.src = key->u.ipv6.src; skb_key.enc_ipv6.dst = key->u.ipv6.dst; break; @@ -176,9 +189,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); - f = rhashtable_lookup_fast(&head->ht, - fl_key_get_start(&skb_mkey, &head->mask), - head->ht_params); + f = fl_lookup(head, 
&skb_mkey); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); @@ -218,6 +229,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) return; offload.command = TC_CLSFLOWER_DESTROY; + offload.prio = tp->prio; offload.cookie = (unsigned long)f; tc->type = TC_SETUP_CLSFLOWER; @@ -249,6 +261,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, } offload.command = TC_CLSFLOWER_REPLACE; + offload.prio = tp->prio; offload.cookie = (unsigned long)f; offload.dissector = dissector; offload.mask = mask; @@ -260,6 +273,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc); + if (!err) + f->flags |= TCA_CLS_FLAGS_IN_HW; if (tc_skip_sw(f->flags)) return err; @@ -276,6 +291,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) return; offload.command = TC_CLSFLOWER_STATS; + offload.prio = tp->prio; offload.cookie = (unsigned long)f; offload.exts = &f->exts; @@ -397,6 +413,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_SIP] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_SIP_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_TIP] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_TIP_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_OP] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_OP_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_SHA] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN }, }; static void fl_set_key_val(struct nlattr **tb, @@ -564,10 +590,27 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, sizeof(key->icmp.type)); - fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, + fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code, - TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + TCA_FLOWER_KEY_ICMPV6_CODE_MASK, sizeof(key->icmp.code)); + } else if (key->basic.n_proto == htons(ETH_P_ARP) || + key->basic.n_proto == htons(ETH_P_RARP)) { + fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP, + &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK, + sizeof(key->arp.sip)); + fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP, + &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK, + sizeof(key->arp.tip)); + fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP, + &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK, + sizeof(key->arp.op)); + fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA, + mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK, + sizeof(key->arp.sha)); + fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, + mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, + sizeof(key->arp.tha)); } if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || @@ -685,6 +728,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ARP, arp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); @@ -792,23 +837,31 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_head *head 
= rtnl_dereference(tp->root); struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; struct cls_fl_filter *fnew; - struct nlattr *tb[TCA_FLOWER_MAX + 1]; + struct nlattr **tb; struct fl_flow_mask mask = {}; int err; if (!tca[TCA_OPTIONS]) return -EINVAL; + tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); + if (!tb) + return -ENOBUFS; + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); if (err < 0) - return err; + goto errout_tb; - if (fold && handle && fold->handle != handle) - return -EINVAL; + if (fold && handle && fold->handle != handle) { + err = -EINVAL; + goto errout_tb; + } fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); - if (!fnew) - return -ENOBUFS; + if (!fnew) { + err = -ENOBUFS; + goto errout_tb; + } err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); if (err < 0) @@ -841,6 +894,11 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout; if (!tc_skip_sw(fnew->flags)) { + if (!fold && fl_lookup(head, &fnew->mkey)) { + err = -EEXIST; + goto errout; + } + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) @@ -856,6 +914,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout; } + if (!tc_in_hw(fnew->flags)) + fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + if (fold) { if (!tc_skip_sw(fold->flags)) rhashtable_remove_fast(&head->ht, &fold->ht_node, @@ -874,11 +935,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, list_add_tail_rcu(&fnew->list, &head->filters); } + kfree(tb); return 0; errout: tcf_exts_destroy(&fnew->exts); kfree(fnew); +errout_tb: + kfree(tb); return err; } @@ -1108,6 +1172,27 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, TCA_FLOWER_KEY_ICMPV6_CODE_MASK, sizeof(key->icmp.code)))) goto nla_put_failure; + else if ((key->basic.n_proto == htons(ETH_P_ARP) || + key->basic.n_proto == htons(ETH_P_RARP)) && + (fl_dump_key_val(skb, &key->arp.sip, + TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip, + TCA_FLOWER_KEY_ARP_SIP_MASK, + sizeof(key->arp.sip)) || + fl_dump_key_val(skb, &key->arp.tip, + TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip, + TCA_FLOWER_KEY_ARP_TIP_MASK, + sizeof(key->arp.tip)) || + fl_dump_key_val(skb, &key->arp.op, + TCA_FLOWER_KEY_ARP_OP, &mask->arp.op, + TCA_FLOWER_KEY_ARP_OP_MASK, + sizeof(key->arp.op)) || + fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA, + mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK, + sizeof(key->arp.sha)) || + fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, + mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, + sizeof(key->arp.tha)))) + goto nla_put_failure; if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, @@ -1149,7 +1234,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; - nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); + if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) + goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts)) goto nla_put_failure; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index f935429bd5ef..224eb2c14346 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -16,16 +16,11 @@ #include <net/sch_generic.h> #include <net/pkt_cls.h> -struct cls_mall_filter { +struct cls_mall_head { struct tcf_exts exts; struct tcf_result res; u32 handle; - struct rcu_head rcu; u32 flags; -}; - -struct cls_mall_head { - struct cls_mall_filter 
*filter; struct rcu_head rcu; }; @@ -33,56 +28,52 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_mall_head *head = rcu_dereference_bh(tp->root); - struct cls_mall_filter *f = head->filter; - if (tc_skip_sw(f->flags)) + if (tc_skip_sw(head->flags)) return -1; - return tcf_exts_exec(skb, &f->exts, res); + return tcf_exts_exec(skb, &head->exts, res); } static int mall_init(struct tcf_proto *tp) { - struct cls_mall_head *head; - - head = kzalloc(sizeof(*head), GFP_KERNEL); - if (!head) - return -ENOBUFS; - - rcu_assign_pointer(tp->root, head); - return 0; } -static void mall_destroy_filter(struct rcu_head *head) +static void mall_destroy_rcu(struct rcu_head *rcu) { - struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu); + struct cls_mall_head *head = container_of(rcu, struct cls_mall_head, + rcu); - tcf_exts_destroy(&f->exts); - - kfree(f); + tcf_exts_destroy(&head->exts); + kfree(head); } static int mall_replace_hw_filter(struct tcf_proto *tp, - struct cls_mall_filter *f, + struct cls_mall_head *head, unsigned long cookie) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_to_netdev offload; struct tc_cls_matchall_offload mall_offload = {0}; + int err; offload.type = TC_SETUP_MATCHALL; offload.cls_mall = &mall_offload; offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; - offload.cls_mall->exts = &f->exts; + offload.cls_mall->exts = &head->exts; offload.cls_mall->cookie = cookie; - return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, - &offload); + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &offload); + if (!err) + head->flags |= TCA_CLS_FLAGS_IN_HW; + + return err; } static void mall_destroy_hw_filter(struct tcf_proto *tp, - struct cls_mall_filter *f, + struct cls_mall_head *head, unsigned long cookie) { struct net_device *dev = tp->q->dev_queue->dev; @@ -103,29 +94,20 @@ static bool mall_destroy(struct tcf_proto *tp, bool force) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct net_device *dev = tp->q->dev_queue->dev; - struct cls_mall_filter *f = head->filter; - if (!force && f) - return false; + if (!head) + return true; - if (f) { - if (tc_should_offload(dev, tp, f->flags)) - mall_destroy_hw_filter(tp, f, (unsigned long) f); + if (tc_should_offload(dev, tp, head->flags)) + mall_destroy_hw_filter(tp, head, (unsigned long) head); - call_rcu(&f->rcu, mall_destroy_filter); - } - kfree_rcu(head, rcu); + call_rcu(&head->rcu, mall_destroy_rcu); return true; } static unsigned long mall_get(struct tcf_proto *tp, u32 handle) { - struct cls_mall_head *head = rtnl_dereference(tp->root); - struct cls_mall_filter *f = head->filter; - - if (f && f->handle == handle) - return (unsigned long) f; - return 0; + return 0UL; } static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { @@ -134,26 +116,31 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { }; static int mall_set_parms(struct net *net, struct tcf_proto *tp, - struct cls_mall_filter *f, + struct cls_mall_head *head, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr) { struct tcf_exts e; int err; - tcf_exts_init(&e, TCA_MATCHALL_ACT, 0); + err = tcf_exts_init(&e, TCA_MATCHALL_ACT, 0); + if (err) + return err; err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) - return err; + goto errout; if (tb[TCA_MATCHALL_CLASSID]) { - f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); - tcf_bind_filter(tp, &f->res, base); + 
head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); + tcf_bind_filter(tp, &head->res, base); } - tcf_exts_change(tp, &f->exts, &e); + tcf_exts_change(tp, &head->exts, &e); return 0; +errout: + tcf_exts_destroy(&e); + return err; } static int mall_change(struct net *net, struct sk_buff *in_skb, @@ -162,21 +149,17 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, unsigned long *arg, bool ovr) { struct cls_mall_head *head = rtnl_dereference(tp->root); - struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg; struct net_device *dev = tp->q->dev_queue->dev; - struct cls_mall_filter *f; struct nlattr *tb[TCA_MATCHALL_MAX + 1]; + struct cls_mall_head *new; u32 flags = 0; int err; if (!tca[TCA_OPTIONS]) return -EINVAL; - if (head->filter) - return -EBUSY; - - if (fold) - return -EINVAL; + if (head) + return -EEXIST; err = nla_parse_nested(tb, TCA_MATCHALL_MAX, tca[TCA_OPTIONS], mall_policy); @@ -189,64 +172,62 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (!f) + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) return -ENOBUFS; - tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0); + err = tcf_exts_init(&new->exts, TCA_MATCHALL_ACT, 0); + if (err) + goto err_exts_init; if (!handle) handle = 1; - f->handle = handle; - f->flags = flags; + new->handle = handle; + new->flags = flags; - err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); + err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr); if (err) - goto errout; + goto err_set_parms; if (tc_should_offload(dev, tp, flags)) { - err = mall_replace_hw_filter(tp, f, (unsigned long) f); + err = mall_replace_hw_filter(tp, new, (unsigned long) new); if (err) { if (tc_skip_sw(flags)) - goto errout; + goto err_replace_hw_filter; else err = 0; } } - *arg = (unsigned long) f; - rcu_assign_pointer(head->filter, f); + if (!tc_in_hw(new->flags)) + new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + *arg = (unsigned long) head; + rcu_assign_pointer(tp->root, new); + if (head) + call_rcu(&head->rcu, mall_destroy_rcu); return 0; -errout: - kfree(f); +err_replace_hw_filter: +err_set_parms: + tcf_exts_destroy(&new->exts); +err_exts_init: + kfree(new); return err; } static int mall_delete(struct tcf_proto *tp, unsigned long arg) { - struct cls_mall_head *head = rtnl_dereference(tp->root); - struct cls_mall_filter *f = (struct cls_mall_filter *) arg; - struct net_device *dev = tp->q->dev_queue->dev; - - if (tc_should_offload(dev, tp, f->flags)) - mall_destroy_hw_filter(tp, f, (unsigned long) f); - - RCU_INIT_POINTER(head->filter, NULL); - tcf_unbind_filter(tp, &f->res); - call_rcu(&f->rcu, mall_destroy_filter); - return 0; + return -EOPNOTSUPP; } static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_mall_head *head = rtnl_dereference(tp->root); - struct cls_mall_filter *f = head->filter; if (arg->count < arg->skip) goto skip; - if (arg->fn(tp, (unsigned long) f, arg) < 0) + if (arg->fn(tp, (unsigned long) head, arg) < 0) arg->stop = 1; skip: arg->count++; @@ -255,28 +236,31 @@ skip: static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct cls_mall_filter *f = (struct cls_mall_filter *) fh; + struct cls_mall_head *head = (struct cls_mall_head *) fh; struct nlattr *nest; - if (!f) + if (!head) return skb->len; - t->tcm_handle = f->handle; + t->tcm_handle = head->handle; nest = nla_nest_start(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; - if 
(f->res.classid && - nla_put_u32(skb, TCA_MATCHALL_CLASSID, f->res.classid)) + if (head->res.classid && + nla_put_u32(skb, TCA_MATCHALL_CLASSID, head->res.classid)) + goto nla_put_failure; + + if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags)) goto nla_put_failure; - if (tcf_exts_dump(skb, &f->exts)) + if (tcf_exts_dump(skb, &head->exts)) goto nla_put_failure; nla_nest_end(skb, nest); - if (tcf_exts_dump_stats(skb, &f->exts) < 0) + if (tcf_exts_dump_stats(skb, &head->exts) < 0) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ae83c3aec308..4dbe0c680fe6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -334,7 +334,6 @@ static int u32_init(struct tcf_proto *tp) if (root_ht == NULL) return -ENOBUFS; - root_ht->divisor = 0; root_ht->refcnt++; root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; root_ht->prio = tp->prio; @@ -524,6 +523,10 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &offload); + + if (!err) + n->flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_skip_sw(flags)) return err; @@ -896,6 +899,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; } + if (!tc_in_hw(new->flags)) + new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_rcu); @@ -1015,6 +1021,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (err) goto errhw; + if (!tc_in_hw(n->flags)) + n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + ins = &ht->ht[TC_U32_HASH(handle)]; for (pins = rtnl_dereference(*ins); pins; ins = &pins->next, pins = rtnl_dereference(*ins)) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d7b93429f0cc..bcf49cd22786 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -440,7 +440,6 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab) EXPORT_SYMBOL(qdisc_put_rtab); static LIST_HEAD(qdisc_stab_list); -static DEFINE_SPINLOCK(qdisc_stab_lock); static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, @@ -474,20 +473,15 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) if (tsize != s->tsize || (!tab && tsize > 0)) return ERR_PTR(-EINVAL); - spin_lock(&qdisc_stab_lock); - list_for_each_entry(stab, &qdisc_stab_list, list) { if (memcmp(&stab->szopts, s, sizeof(*s))) continue; if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16))) continue; stab->refcnt++; - spin_unlock(&qdisc_stab_lock); return stab; } - spin_unlock(&qdisc_stab_lock); - stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); if (!stab) return ERR_PTR(-ENOMEM); @@ -497,9 +491,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) if (tsize > 0) memcpy(stab->data, tab, tsize * sizeof(u16)); - spin_lock(&qdisc_stab_lock); list_add_tail(&stab->list, &qdisc_stab_list); - spin_unlock(&qdisc_stab_lock); return stab; } @@ -514,14 +506,10 @@ void qdisc_put_stab(struct qdisc_size_table *tab) if (!tab) return; - spin_lock(&qdisc_stab_lock); - if (--tab->refcnt == 0) { list_del(&tab->list); call_rcu_bh(&tab->rcu, stab_kfree_rcu); } - - spin_unlock(&qdisc_stab_lock); } EXPORT_SYMBOL(qdisc_put_stab); @@ -1019,6 +1007,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev, return sch; } + /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */ + ops->destroy(sch); err_out3: dev_put(dev); kfree((char *) 
sch - sch->padded); @@ -1861,6 +1851,7 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, { __be16 protocol = tc_skb_protocol(skb); #ifdef CONFIG_NET_CLS_ACT + const int max_reclassify_loop = 4; const struct tcf_proto *old_tp = tp; int limit = 0; @@ -1885,7 +1876,7 @@ reclassify: return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: - if (unlikely(limit++ >= MAX_REC_LOOP)) { + if (unlikely(limit++ >= max_reclassify_loop)) { net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", tp->q->ops->id, tp->prio & 0xffff, ntohs(tp->protocol)); @@ -1899,28 +1890,6 @@ reset: } EXPORT_SYMBOL(tc_classify); -bool tcf_destroy(struct tcf_proto *tp, bool force) -{ - if (tp->ops->destroy(tp, force)) { - module_put(tp->ops->owner); - kfree_rcu(tp, rcu); - return true; - } - - return false; -} - -void tcf_destroy_chain(struct tcf_proto __rcu **fl) -{ - struct tcf_proto *tp; - - while ((tp = rtnl_dereference(*fl)) != NULL) { - RCU_INIT_POINTER(*fl, tp->next); - tcf_destroy(tp, true); - } -} -EXPORT_SYMBOL(tcf_destroy_chain); - #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 481e4f12aeb4..2209c2ddacbf 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -15,6 +15,7 @@ #include <linux/file.h> /* for fput */ #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> /* * The ATM queuing discipline provides a framework for invoking classifiers diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index f1207582cbf3..d6ca18dc04c3 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -19,6 +19,7 @@ #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> /* Class-Based Queueing (CBQ) algorithm. 
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 3b6d5bd69101..3b86a97bc67c 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -16,6 +16,7 @@ #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/inet_ecn.h> #include <net/red.h> #include <net/flow_dissector.h> diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 1308bbf460f7..802ac7c2e5e8 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -13,6 +13,7 @@ #include <linux/rtnetlink.h> #include <linux/bitops.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <asm/byteorder.h> diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a5ea0e9b6be4..9f3a884d1590 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -23,6 +23,7 @@ #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/codel.h> #include <net/codel_impl.h> #include <net/codel_qdisc.h> @@ -57,7 +58,6 @@ struct fq_codel_sched_data { struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ u32 *backlogs; /* backlog table [flows_cnt] */ u32 flows_cnt; /* number of flows */ - u32 perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_batch_size; u32 memory_limit; @@ -75,9 +75,7 @@ struct fq_codel_sched_data { static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, struct sk_buff *skb) { - u32 hash = skb_get_hash_perturb(skb, q->perturbation); - - return reciprocal_scale(hash, q->flows_cnt); + return reciprocal_scale(skb_get_hash(skb), q->flows_cnt); } static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -482,7 +480,6 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) q->memory_limit = 32 << 20; /* 32 MBytes */ q->drop_batch_size = 64; q->quantum = psched_mtu(qdisc_dev(sch)); - q->perturbation = prandom_u32(); INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); codel_params_init(&q->cparams); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6eb9c8e88519..b052b27a984e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = weight_p; + int quota = dev_tx_weight; int packets; while (qdisc_restart(q, &packets)) { diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index e3d0458af17b..2fae8b5f1b80 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -627,7 +627,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * sizeof(u32)); if (!q->hhf_arrays[i]) { - hhf_destroy(sch); + /* Note: hhf_destroy() will be called + * by our caller. + */ return -ENOMEM; } } @@ -638,7 +640,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / BITS_PER_BYTE); if (!q->hhf_valid_bits[i]) { - hhf_destroy(sch); + /* Note: hhf_destroy() will be called + * by our caller. + */ return -ENOMEM; } } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 760f39e7caee..4cd5fb134bc9 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -40,6 +40,7 @@ #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> /* HTB algorithm. 
Author: devik@cdi.cz diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 8fe6999b642a..3bab5f66c392 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -16,6 +16,7 @@ #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 2bc8d7f8df16..20b7f1646f69 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -52,7 +52,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) /* pre-allocate qdiscs, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); - if (priv->qdiscs == NULL) + if (!priv->qdiscs) return -ENOMEM; for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { @@ -60,18 +60,14 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); - if (qdisc == NULL) - goto err; + if (!qdisc) + return -ENOMEM; priv->qdiscs[ntx] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; return 0; - -err: - mq_destroy(sch); - return -ENOMEM; } static void mq_attach(struct Qdisc *sch) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b5c502c78143..922683418e53 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -118,10 +118,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) /* pre-allocate qdisc, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); - if (priv->qdiscs == NULL) { - err = -ENOMEM; - goto err; - } + if (!priv->qdiscs) + return -ENOMEM; for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); @@ -129,10 +127,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) get_default_qdisc_ops(dev, i), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1))); - if (qdisc == NULL) { - err = -ENOMEM; - goto err; - } + if (!qdisc) + return -ENOMEM; + priv->qdiscs[i] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } @@ -148,7 +145,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) priv->hw_owned = 1; err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); if (err) - goto err; + return err; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -162,10 +159,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) sch->flags |= TCQ_F_MQROOT; return 0; - -err: - mqprio_destroy(sch); - return err; } static void mqprio_attach(struct Qdisc *sch) diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 9ffbb025b37e..e7839a0d0eaa 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -25,7 +25,7 @@ #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> - +#include <net/pkt_cls.h> struct multiq_sched_data { u16 bands; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bcfadfdea8e0..c8bb62a1e744 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -626,7 +626,7 @@ deliver: * If it's at ingress let's pretend the delay is * from the network (tstamp will be updated). 
*/ - if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) + if (skb->tc_redirected && skb->tc_from_ingress) skb->tstamp = 0; #endif diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 8f575899adfa..d4d7db267b6e 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -20,7 +20,7 @@ #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> - +#include <net/pkt_cls.h> struct prio_sched_data { int bands; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 20a350bd1b1d..fe6963d21519 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -25,6 +25,7 @@ #include <linux/jhash.h> #include <net/ip.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/inet_ecn.h> /* diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 7f195ed4d568..42e8c8615e65 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -23,6 +23,7 @@ #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/red.h> @@ -742,9 +743,10 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); if (!q->ht || !q->slots) { - sfq_destroy(sch); + /* Note: sfq_destroy() will be called by our caller */ return -ENOMEM; } + for (i = 0; i < q->divisor; i++) q->ht[i] = SFQ_EMPTY_SLOT; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index b0196366d58d..9fe6b427afed 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -401,8 +401,8 @@ static int teql_master_close(struct net_device *dev) return 0; } -static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) +static void teql_master_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct teql_master *m = netdev_priv(dev); @@ -410,7 +410,6 @@ static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, stats->tx_bytes = m->tx_bytes; stats->tx_errors = m->tx_errors; stats->tx_dropped = m->tx_dropped; - return stats; } static int teql_master_mtu(struct net_device *dev, int new_mtu) diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 6c4f7496cec6..70f1b570bab9 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -11,7 +11,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ - output.o input.o debug.o ssnmap.o auth.o \ + output.o input.o debug.o stream.o auth.o \ offload.o sctp_probe-y := probe.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c index d3cc30c25c41..2a6835b4562b 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -207,6 +207,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a * association to the same value as the initial TSN. 
*/ asoc->addip_serial = asoc->c.initial_tsn; + asoc->strreset_outseq = asoc->c.initial_tsn; INIT_LIST_HEAD(&asoc->addip_chunk_list); INIT_LIST_HEAD(&asoc->asconf_ack_list); @@ -269,6 +270,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->active_key_id = ep->active_key_id; asoc->prsctp_enable = ep->prsctp_enable; + asoc->reconf_enable = ep->reconf_enable; + asoc->strreset_enable = ep->strreset_enable; /* Save the hmacs and chunks list into this association */ if (ep->auth_hmacs_list) @@ -358,8 +361,11 @@ void sctp_association_free(struct sctp_association *asoc) sctp_tsnmap_free(&asoc->peer.tsn_map); - /* Free ssnmap storage. */ - sctp_ssnmap_free(asoc->ssnmap); + /* Free stream information. */ + sctp_stream_free(asoc->stream); + + if (asoc->strreset_chunk) + sctp_chunk_free(asoc->strreset_chunk); /* Clean up the bound address list. */ sctp_bind_addr_free(&asoc->base.bind_addr); @@ -519,6 +525,12 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, if (asoc->peer.last_data_from == peer) asoc->peer.last_data_from = transport; + if (asoc->strreset_chunk && + asoc->strreset_chunk->transport == peer) { + asoc->strreset_chunk->transport = transport; + sctp_transport_reset_reconf_timer(transport); + } + /* If we remove the transport an INIT was last sent to, set it to * NULL. Combined with the update of the retran path above, this * will cause the next INIT to be sent to the next available @@ -820,8 +832,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, if (transport->state != SCTP_UNCONFIRMED) transport->state = SCTP_INACTIVE; else { - dst_release(transport->dst); - transport->dst = NULL; + sctp_transport_dst_release(transport); ulp_notify = false; } @@ -1137,7 +1148,7 @@ void sctp_assoc_update(struct sctp_association *asoc, /* Reinitialize SSN for both local streams * and peer's streams. */ - sctp_ssnmap_clear(asoc->ssnmap); + sctp_stream_clear(asoc->stream); /* Flush the ULP reassembly and ordered queue. * Any data there will now be stale and will @@ -1162,10 +1173,9 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - if (!asoc->ssnmap) { - /* Move the ssnmap. 
*/ - asoc->ssnmap = new->ssnmap; - new->ssnmap = NULL; + if (!asoc->stream) { + asoc->stream = new->stream; + new->stream = NULL; } if (!asoc->assoc_id) { diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 615f0ddd41df..e3621cb4827f 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct iov_iter *from) { - int max, whole, i, offset, over, err; - int len, first_len; - int max_data; + size_t len, first_len, max_data, remaining; + size_t msg_len = iov_iter_count(from); + struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; - struct list_head *pos, *temp; - size_t msg_len = iov_iter_count(from); - __u8 frag; + int err; msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) @@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || !SCTP_PR_POLICY(sinfo->sinfo_flags))) msg->expires_at = jiffies + - msecs_to_jiffies(sinfo->sinfo_timetolive); + msecs_to_jiffies(sinfo->sinfo_timetolive); /* This is the biggest possible DATA chunk that can fit into * the packet @@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); max_data = SCTP_TRUNC4(max_data); - max = asoc->frag_point; /* If the the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. @@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, hmac_desc->hmac_len); } - /* Now, check if we need to reduce our max */ - if (max > max_data) - max = max_data; + /* Check what's our max considering the above */ + max_data = min_t(size_t, max_data, asoc->frag_point); - whole = 0; - first_len = max; + /* Set first_len and then account for possible bundles on first frag */ + first_len = max_data; /* Check to see if we have a pending SACK and try to let it be bundled * with this message. Do this if we don't have any data queued already. @@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && - msg_len > max) - max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); + msg_len > max_data) + first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) - max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; - - /* Now that we adjusted completely, reset first_len */ - if (first_len > max_data) - first_len = max_data; + first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; /* Account for a different sized first fragment */ if (msg_len >= first_len) { - msg_len -= first_len; - whole = 1; msg->can_delay = 0; - } - - /* How many full sized? How many bytes leftover? */ - whole += msg_len / max; - over = msg_len % max; - offset = 0; - - if ((whole > 1) || (whole && over)) SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); + } else { + /* Which may be the only one... */ + first_len = msg_len; + } - /* Create chunks for all the full sized DATA chunks. */ - for (i = 0, len = first_len; i < whole; i++) { - frag = SCTP_DATA_MIDDLE_FRAG; + /* Create chunks for all DATA chunks. 
*/ + for (remaining = msg_len; remaining; remaining -= len) { + u8 frag = SCTP_DATA_MIDDLE_FRAG; - if (0 == i) + if (remaining == msg_len) { + /* First frag, which may also be the last */ frag |= SCTP_DATA_FIRST_FRAG; + len = first_len; + } else { + /* Middle frags */ + len = max_data; + } - if ((i == (whole - 1)) && !over) { + if (len >= remaining) { + /* Last frag, which may also be the first */ + len = remaining; frag |= SCTP_DATA_LAST_FRAG; /* The application requests to set the I-bit of the @@ -271,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0, GFP_KERNEL); - if (!chunk) { err = -ENOMEM; goto errout; @@ -282,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, goto errout_chunk_free; /* Put the chunk->skb back into the form expected by send. */ - __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - - (__u8 *)chunk->skb->data); - - sctp_datamsg_assign(msg, chunk); - list_add_tail(&chunk->frag_list, &msg->chunks); - - /* The first chunk, the first chunk was likely short - * to allow bundling, so reset to full size. - */ - if (0 == i) - len = max; - } - - /* .. now the leftover bytes. */ - if (over) { - if (!whole) - frag = SCTP_DATA_NOT_FRAG; - else - frag = SCTP_DATA_LAST_FRAG; - - if ((sinfo->sinfo_flags & SCTP_EOF) || - (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) - frag |= SCTP_DATA_SACK_IMM; - - chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, - 0, GFP_KERNEL); - - if (!chunk) { - err = -ENOMEM; - goto errout; - } - - err = sctp_user_addto_chunk(chunk, over, from); - - /* Put the chunk->skb back into the form expected by send. */ - __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - - (__u8 *)chunk->skb->data); - if (err < 0) - goto errout_chunk_free; + __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - + chunk->skb->data); sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); @@ -338,6 +294,7 @@ errout: sctp_chunk_free(chunk); } sctp_datamsg_put(msg); + return ERR_PTR(err); } diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 95d7b15dad21..2e47eb2f05cb 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -159,6 +159,7 @@ static const char *const sctp_timer_tbl[] = { "TIMEOUT_T4_RTO", "TIMEOUT_T5_SHUTDOWN_GUARD", "TIMEOUT_HEARTBEAT", + "TIMEOUT_RECONF", "TIMEOUT_SACK", "TIMEOUT_AUTOCLOSE", }; @@ -166,7 +167,9 @@ static const char *const sctp_timer_tbl[] = { /* Lookup timer debug name. 
*/ const char *sctp_tname(const sctp_subtype_t id) { - if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX) + BUILD_BUG_ON(SCTP_EVENT_TIMEOUT_MAX + 1 != ARRAY_SIZE(sctp_timer_tbl)); + + if (id.timeout < ARRAY_SIZE(sctp_timer_tbl)) return sctp_timer_tbl[id.timeout]; return "unknown_timer"; } diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 410ddc1e3443..8c589230794f 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -164,6 +164,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, ep->auth_hmacs_list = auth_hmacs; ep->auth_chunk_list = auth_chunks; ep->prsctp_enable = net->sctp.prsctp_enable; + ep->reconf_enable = net->sctp.reconf_enable; return ep; diff --git a/net/sctp/input.c b/net/sctp/input.c index 458e506ef84b..2a28ab20487f 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -872,6 +872,8 @@ void sctp_transport_hashtable_destroy(void) int sctp_hash_transport(struct sctp_transport *t) { + struct sctp_transport *transport; + struct rhlist_head *tmp, *list; struct sctp_hash_cmp_arg arg; int err; @@ -882,8 +884,22 @@ int sctp_hash_transport(struct sctp_transport *t) arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); + rcu_read_lock(); + list = rhltable_lookup(&sctp_transport_hashtable, &arg, + sctp_hash_params); + + rhl_for_each_entry_rcu(transport, tmp, list, node) + if (transport->asoc->ep == t->asoc->ep) { + rcu_read_unlock(); + err = -EEXIST; + goto out; + } + rcu_read_unlock(); + err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); + +out: if (err) pr_err_once("insert transport fail, errno %d\n", err); @@ -1229,13 +1245,26 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sctp_association *asoc; asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + if (asoc) + goto out; /* Further lookup for INIT/INIT-ACK packets. * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. 
*/ - if (!asoc) - asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp); + asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp); + if (asoc) + goto out; + + if (paddr->sa.sa_family == AF_INET) + pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n", + &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port), + &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port)); + else + pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n", + &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port), + &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port)); +out: return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 5ed8e79bf102..063baac5b9fe 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -222,7 +222,8 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); rcu_read_lock(); - res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass); + res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), + np->tclass); rcu_read_unlock(); return res; } @@ -412,22 +413,20 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist, static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { - __be16 *port; - struct sctphdr *sh; + /* Always called on head skb, so this is safe */ + struct sctphdr *sh = sctp_hdr(skb); + struct sockaddr_in6 *sa = &addr->v6; - port = &addr->v6.sin6_port; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; - /* Always called on head skb, so this is safe */ - sh = sctp_hdr(skb); if (is_saddr) { - *port = sh->source; - addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; + sa->sin6_port = sh->source; + sa->sin6_addr = ipv6_hdr(skb)->saddr; } else { - *port = sh->dest; - addr->v6.sin6_addr = ipv6_hdr(skb)->daddr; + sa->sin6_port = sh->dest; + sa->sin6_addr = ipv6_hdr(skb)->daddr; } } diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index 40e7fac96c41..105ac3327b28 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -51,7 +51,6 @@ SCTP_DBG_OBJCNT(bind_addr); SCTP_DBG_OBJCNT(bind_bucket); SCTP_DBG_OBJCNT(chunk); SCTP_DBG_OBJCNT(addr); -SCTP_DBG_OBJCNT(ssnmap); SCTP_DBG_OBJCNT(datamsg); SCTP_DBG_OBJCNT(keys); @@ -67,7 +66,6 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = { SCTP_DBG_OBJCNT_ENTRY(bind_addr), SCTP_DBG_OBJCNT_ENTRY(bind_bucket), SCTP_DBG_OBJCNT_ENTRY(addr), - SCTP_DBG_OBJCNT_ENTRY(ssnmap), SCTP_DBG_OBJCNT_ENTRY(datamsg), SCTP_DBG_OBJCNT_ENTRY(keys), }; diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 7e869d0cca69..4f5a2b580aa5 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -68,7 +68,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, goto out; } - segs = skb_segment(skb, features | NETIF_F_HW_CSUM); + segs = skb_segment(skb, features | NETIF_F_HW_CSUM | NETIF_F_SG); if (IS_ERR(segs)) goto out; diff --git a/net/sctp/output.c b/net/sctp/output.c index f5320a87341e..71ce6b945dcb 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -81,8 +81,8 @@ static void sctp_packet_reset(struct sctp_packet *packet) /* Config a packet. * This appears to be a followup set of initializations. 
*/ -struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, - __u32 vtag, int ecn_capable) +void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, + int ecn_capable) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; @@ -123,14 +123,12 @@ struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, if (chunk) sctp_packet_append_chunk(packet, chunk); } - - return packet; } /* Initialize the packet structure. */ -struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, - struct sctp_transport *transport, - __u16 sport, __u16 dport) +void sctp_packet_init(struct sctp_packet *packet, + struct sctp_transport *transport, + __u16 sport, __u16 dport) { struct sctp_association *asoc = transport->asoc; size_t overhead; @@ -151,8 +149,6 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, packet->overhead = overhead; sctp_packet_reset(packet); packet->vtag = 0; - - return packet; } /* Free a packet. */ @@ -181,7 +177,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, { sctp_xmit_t retval; - pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, + pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { @@ -550,6 +546,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; + int confirm; struct dst_entry *dst; struct sk_buff *head; struct sctphdr *sh; @@ -628,7 +625,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; - tp->af_specific->sctp_xmit(head, tp); + confirm = tp->dst_pending_confirm; + if (confirm) + skb_set_dst_pending_confirm(head, 1); + /* neighbour should be confirmed on successful transmission or + * positive error + */ + if (tp->af_specific->sctp_xmit(head, tp) >= 0 && confirm) + tp->dst_pending_confirm = 0; out: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { @@ -700,18 +704,15 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, * unacknowledged. */ - if (sctp_sk(asoc->base.sk)->nodelay) - /* Nagle disabled */ + if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) && + !chunk->msg->force_delay) + /* Nothing unacked */ return SCTP_XMIT_OK; if (!sctp_packet_empty(packet)) /* Append to packet */ return SCTP_XMIT_OK; - if (inflight == 0) - /* Nothing unacked */ - return SCTP_XMIT_OK; - if (!sctp_state(asoc, ESTABLISHED)) return SCTP_XMIT_OK; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e54082699520..db352e5d61f8 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -915,22 +915,28 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: + case SCTP_CID_RECONF: status = sctp_packet_transmit_chunk(packet, chunk, one_packet, gfp); if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &q->control_chunk_list); - } else { - asoc->stats.octrlchunks++; - /* PR-SCTP C5) If a FORWARD TSN is sent, the - * sender MUST assure that at least one T3-rtx - * timer is running. 
- */ - if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { - sctp_transport_reset_t3_rtx(transport); - transport->last_time_sent = jiffies; - } + break; + } + + asoc->stats.octrlchunks++; + /* PR-SCTP C5) If a FORWARD TSN is sent, the + * sender MUST assure that at least one T3-rtx + * timer is running. + */ + if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { + sctp_transport_reset_t3_rtx(transport); + transport->last_time_sent = jiffies; } + + if (chunk == asoc->strreset_chunk) + sctp_transport_reset_reconf_timer(transport); + break; default: @@ -1016,6 +1022,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) /* Finally, transmit new packets. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + __u32 sid = ntohs(chunk->subh.data_hdr->stream); + /* RFC 2960 6.5 Every DATA chunk MUST carry a valid * stream identifier. */ @@ -1038,6 +1046,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) continue; } + if (asoc->stream->out[sid].state == SCTP_STREAM_CLOSED) { + sctp_outq_head_data(q, chunk); + goto sctp_flush_out; + } + /* If there is a specified transport, use it. * Otherwise, we want to use the active path. */ @@ -1048,7 +1061,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) (new_transport->state == SCTP_PF))) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { - WARN_ONCE(1, "Atempt to send packet on unconfirmed path."); + WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1641,7 +1654,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, if (forward_progress) { if (transport->dst) - dst_confirm(transport->dst); + sctp_transport_dst_confirm(transport); } } diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index ab8d9f96a177..f0553a022859 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -211,3 +211,6 @@ DECLARE_PRIMITIVE(REQUESTHEARTBEAT); */ DECLARE_PRIMITIVE(ASCONF); + +/* RE-CONFIG 5.1 */ +DECLARE_PRIMITIVE(RECONF); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 616a9428e0c4..1b6d4574d2b0 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -199,6 +199,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, sctp_scope_t scope, gfp_t gfp, int copy_flags) { struct sctp_sockaddr_entry *addr; + union sctp_addr laddr; int error = 0; rcu_read_lock(); @@ -220,7 +221,10 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, !(copy_flags & SCTP_ADDR6_PEERSUPP))) continue; - if (sctp_bind_addr_state(bp, &addr->a) != -1) + laddr = addr->a; + /* also works for setting ipv6 address port */ + laddr.v4.sin_port = htons(bp->port); + if (sctp_bind_addr_state(bp, &laddr) != -1) continue; error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a), @@ -237,23 +241,19 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { - void *from; - __be16 *port; - struct sctphdr *sh; + /* Always called on head skb, so this is safe */ + struct sctphdr *sh = sctp_hdr(skb); + struct sockaddr_in *sa = &addr->v4; - port = &addr->v4.sin_port; addr->v4.sin_family = AF_INET; - /* Always called on head skb, so this is safe */ - sh = sctp_hdr(skb); if (is_saddr) { - *port = sh->source; - from = &ip_hdr(skb)->saddr; + sa->sin_port = sh->source; + sa->sin_addr.s_addr = ip_hdr(skb)->saddr; } else { - *port = sh->dest; - 
from = &ip_hdr(skb)->daddr; + sa->sin_port = sh->dest; + sa->sin_addr.s_addr = ip_hdr(skb)->daddr; } - memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr)); } /* Initialize an sctp_addr from a socket. */ @@ -1262,6 +1262,9 @@ static int __net_init sctp_defaults_init(struct net *net) /* Enable PR-SCTP by default. */ net->sctp.prsctp_enable = 1; + /* Disable RECONF by default. */ + net->sctp.reconf_enable = 0; + /* Disable AUTH by default. */ net->sctp.auth_enable = 0; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 9e9690b7afe1..969a30c7bb54 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -270,6 +270,11 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_ext += 2; } + if (asoc->reconf_enable) { + extensions[num_ext] = SCTP_CID_RECONF; + num_ext += 1; + } + if (sp->adaptation_ind) chunksize += sizeof(aiparam); @@ -434,6 +439,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, num_ext += 2; } + if (asoc->peer.reconf_capable) { + extensions[num_ext] = SCTP_CID_RECONF; + num_ext += 1; + } + if (sp->adaptation_ind) chunksize += sizeof(aiparam); @@ -1536,7 +1546,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); - stream = &chunk->asoc->ssnmap->out; + stream = chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. @@ -1547,9 +1557,9 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) ssn = 0; } else { if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) - ssn = sctp_ssn_next(stream, sid); + ssn = sctp_ssn_next(stream, out, sid); else - ssn = sctp_ssn_peek(stream, sid); + ssn = sctp_ssn_peek(stream, out, sid); } lchunk->subh.data_hdr->ssn = htons(ssn); @@ -1844,6 +1854,7 @@ no_hmac: retval->next_tsn = retval->c.initial_tsn; retval->ctsn_ack_point = retval->next_tsn - 1; retval->addip_serial = retval->c.initial_tsn; + retval->strreset_outseq = retval->c.initial_tsn; retval->adv_peer_ack_point = retval->ctsn_ack_point; retval->peer.prsctp_capable = retval->c.prsctp_capable; retval->peer.adaptation_ind = retval->c.adaptation_ind; @@ -2011,6 +2022,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc, for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { + case SCTP_CID_RECONF: + if (asoc->reconf_enable && + !asoc->peer.reconf_capable) + asoc->peer.reconf_capable = 1; + break; case SCTP_CID_FWD_TSN: if (asoc->prsctp_enable && !asoc->peer.prsctp_capable) asoc->peer.prsctp_capable = 1; @@ -2387,6 +2403,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, asoc->peer.i.initial_tsn = ntohl(peer_init->init_hdr.initial_tsn); + asoc->strreset_inseq = asoc->peer.i.initial_tsn; + /* Apply the upper bounds for output streams based on peer's * number of inbound streams. 
*/ @@ -2444,9 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, if (!asoc->temp) { int error; - asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams, + asoc->stream = sctp_stream_new(asoc->c.sinit_max_instreams, asoc->c.sinit_num_ostreams, gfp); - if (!asoc->ssnmap) + if (!asoc->stream) goto clean_up; error = sctp_assoc_set_id(asoc, gfp); @@ -3210,7 +3228,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, union sctp_params param; sctp_addiphdr_t *hdr; union sctp_addr_param *addr_param; - sctp_addip_param_t *asconf_param; struct sctp_chunk *asconf_ack; __be16 err_code; int length = 0; @@ -3230,7 +3247,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, * asconf parameter. */ length = ntohs(addr_param->p.length); - asconf_param = (void *)addr_param + length; chunk_len -= length; /* create an ASCONF_ACK chunk. @@ -3317,8 +3333,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { - dst_release(transport->dst); - transport->dst = NULL; + sctp_transport_dst_release(transport); } break; case SCTP_PARAM_DEL_IP: @@ -3332,8 +3347,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { - dst_release(transport->dst); - transport->dst = NULL; + sctp_transport_dst_release(transport); } break; default: @@ -3526,3 +3540,323 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, return retval; } + +/* RE-CONFIG 3.1 (RE-CONFIG chunk) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 130 | Chunk Flags | Chunk Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / Re-configuration Parameter / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / Re-configuration Parameter (optional) / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +static struct sctp_chunk *sctp_make_reconf( + const struct sctp_association *asoc, + int length) +{ + struct sctp_reconf_chunk *reconf; + struct sctp_chunk *retval; + + retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length, + GFP_ATOMIC); + if (!retval) + return NULL; + + reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr; + retval->param_hdr.v = reconf->params; + + return retval; +} + +/* RE-CONFIG 4.1 (STREAM OUT RESET) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Parameter Type = 13 | Parameter Length = 16 + 2 * N | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Re-configuration Request Sequence Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Re-configuration Response Sequence Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Sender's Last Assigned TSN | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Stream Number 1 (optional) | Stream Number 2 (optional) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * / ...... 
/ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Stream Number N-1 (optional) | Stream Number N (optional) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * RE-CONFIG 4.2 (STREAM IN RESET) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Parameter Type = 14 | Parameter Length = 8 + 2 * N | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Re-configuration Request Sequence Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Stream Number 1 (optional) | Stream Number 2 (optional) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * / ...... / + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Stream Number N-1 (optional) | Stream Number N (optional) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct sctp_chunk *sctp_make_strreset_req( + const struct sctp_association *asoc, + __u16 stream_num, __u16 *stream_list, + bool out, bool in) +{ + struct sctp_strreset_outreq outreq; + __u16 stream_len = stream_num * 2; + struct sctp_strreset_inreq inreq; + struct sctp_chunk *retval; + __u16 outlen, inlen; + + outlen = (sizeof(outreq) + stream_len) * out; + inlen = (sizeof(inreq) + stream_len) * in; + + retval = sctp_make_reconf(asoc, outlen + inlen); + if (!retval) + return NULL; + + if (outlen) { + outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST; + outreq.param_hdr.length = htons(outlen); + outreq.request_seq = htonl(asoc->strreset_outseq); + outreq.response_seq = htonl(asoc->strreset_inseq - 1); + outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1); + + sctp_addto_chunk(retval, sizeof(outreq), &outreq); + + if (stream_len) + sctp_addto_chunk(retval, stream_len, stream_list); + } + + if (inlen) { + inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST; + inreq.param_hdr.length = htons(inlen); + inreq.request_seq = htonl(asoc->strreset_outseq + out); + + sctp_addto_chunk(retval, sizeof(inreq), &inreq); + + if (stream_len) + sctp_addto_chunk(retval, stream_len, stream_list); + } + + return retval; +} + +/* RE-CONFIG 4.3 (SSN/TSN RESET ALL) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Parameter Type = 15 | Parameter Length = 8 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Re-configuration Request Sequence Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct sctp_chunk *sctp_make_strreset_tsnreq( + const struct sctp_association *asoc) +{ + struct sctp_strreset_tsnreq tsnreq; + __u16 length = sizeof(tsnreq); + struct sctp_chunk *retval; + + retval = sctp_make_reconf(asoc, length); + if (!retval) + return NULL; + + tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST; + tsnreq.param_hdr.length = htons(length); + tsnreq.request_seq = htonl(asoc->strreset_outseq); + + sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq); + + return retval; +} + +/* RE-CONFIG 4.5/4.6 (ADD STREAM) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Parameter Type = 17 | Parameter Length = 12 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Re-configuration Request Sequence Number | + * 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Number of new streams | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct sctp_chunk *sctp_make_strreset_addstrm( + const struct sctp_association *asoc, + __u16 out, __u16 in) +{ + struct sctp_strreset_addstrm addstrm; + __u16 size = sizeof(addstrm); + struct sctp_chunk *retval; + + retval = sctp_make_reconf(asoc, (!!out + !!in) * size); + if (!retval) + return NULL; + + if (out) { + addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS; + addstrm.param_hdr.length = htons(size); + addstrm.number_of_streams = htons(out); + addstrm.request_seq = htonl(asoc->strreset_outseq); + addstrm.reserved = 0; + + sctp_addto_chunk(retval, size, &addstrm); + } + + if (in) { + addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS; + addstrm.param_hdr.length = htons(size); + addstrm.number_of_streams = htons(in); + addstrm.request_seq = htonl(asoc->strreset_outseq + !!out); + addstrm.reserved = 0; + + sctp_addto_chunk(retval, size, &addstrm); + } + + return retval; +} + +/* RE-CONFIG 4.4 (RESP) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Parameter Type = 16 | Parameter Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Re-configuration Response Sequence Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Result | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct sctp_chunk *sctp_make_strreset_resp( + const struct sctp_association *asoc, + __u32 result, __u32 sn) +{ + struct sctp_strreset_resp resp; + __u16 length = sizeof(resp); + struct sctp_chunk *retval; + + retval = sctp_make_reconf(asoc, length); + if (!retval) + return NULL; + + resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; + resp.param_hdr.length = htons(length); + resp.response_seq = htonl(sn); + resp.result = htonl(result); + + sctp_addto_chunk(retval, sizeof(resp), &resp); + + return retval; +} + +/* RE-CONFIG 4.4 OPTIONAL (TSNRESP) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Parameter Type = 16 | Parameter Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Re-configuration Response Sequence Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Result | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Sender's Next TSN (optional) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Receiver's Next TSN (optional) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct sctp_chunk *sctp_make_strreset_tsnresp( + struct sctp_association *asoc, + __u32 result, __u32 sn, + __u32 sender_tsn, __u32 receiver_tsn) +{ + struct sctp_strreset_resptsn tsnresp; + __u16 length = sizeof(tsnresp); + struct sctp_chunk *retval; + + retval = sctp_make_reconf(asoc, length); + if (!retval) + return NULL; + + tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; + tsnresp.param_hdr.length = htons(length); + + tsnresp.response_seq = htonl(sn); + tsnresp.result = htonl(result); + tsnresp.senders_next_tsn = htonl(sender_tsn); + tsnresp.receivers_next_tsn = htonl(receiver_tsn); + + sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp); + + return retval; +} + +bool sctp_verify_reconf(const 
struct sctp_association *asoc, + struct sctp_chunk *chunk, + struct sctp_paramhdr **errp) +{ + struct sctp_reconf_chunk *hdr; + union sctp_params param; + __u16 last = 0, cnt = 0; + + hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; + sctp_walk_params(param, hdr, params) { + __u16 length = ntohs(param.p->length); + + *errp = param.p; + if (cnt++ > 2) + return false; + switch (param.p->type) { + case SCTP_PARAM_RESET_OUT_REQUEST: + if (length < sizeof(struct sctp_strreset_outreq) || + (last && last != SCTP_PARAM_RESET_RESPONSE && + last != SCTP_PARAM_RESET_IN_REQUEST)) + return false; + break; + case SCTP_PARAM_RESET_IN_REQUEST: + if (length < sizeof(struct sctp_strreset_inreq) || + (last && last != SCTP_PARAM_RESET_OUT_REQUEST)) + return false; + break; + case SCTP_PARAM_RESET_RESPONSE: + if ((length != sizeof(struct sctp_strreset_resp) && + length != sizeof(struct sctp_strreset_resptsn)) || + (last && last != SCTP_PARAM_RESET_RESPONSE && + last != SCTP_PARAM_RESET_OUT_REQUEST)) + return false; + break; + case SCTP_PARAM_RESET_TSN_REQUEST: + if (length != + sizeof(struct sctp_strreset_tsnreq) || last) + return false; + break; + case SCTP_PARAM_RESET_ADD_IN_STREAMS: + if (length != sizeof(struct sctp_strreset_addstrm) || + (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS)) + return false; + break; + case SCTP_PARAM_RESET_ADD_OUT_STREAMS: + if (length != sizeof(struct sctp_strreset_addstrm) || + (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS)) + return false; + break; + default: + return false; + } + + last = param.p->type; + } + + return true; +} diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index c345bf153bed..25384fa82ba9 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -436,6 +436,37 @@ out_unlock: sctp_association_put(asoc); } + /* Handle the timeout of the RE-CONFIG timer. */ +void sctp_generate_reconf_event(unsigned long data) +{ + struct sctp_transport *transport = (struct sctp_transport *)data; + struct sctp_association *asoc = transport->asoc; + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); + int error = 0; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + pr_debug("%s: sock is busy\n", __func__); + + /* Try again later. */ + if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20))) + sctp_transport_hold(transport); + goto out_unlock; + } + + error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF), + asoc->state, asoc->ep, asoc, + transport, GFP_ATOMIC); + + if (error) + sk->sk_err = -error; + +out_unlock: + bh_unlock_sock(sk); + sctp_transport_put(transport); +} /* Inject a SACK Timeout event into the state machine. */ static void sctp_generate_sack_event(unsigned long data) @@ -453,6 +484,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { sctp_generate_t4_rto_event, sctp_generate_t5_shutdown_guard_event, NULL, + NULL, sctp_generate_sack_event, sctp_generate_autoclose_event, }; @@ -723,7 +755,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, * forward progress. 
*/ if (t->dst) - dst_confirm(t->dst); + sctp_transport_dst_confirm(t); /* The receiver of the HEARTBEAT ACK should also perform an * RTT measurement for that destination transport address @@ -840,6 +872,10 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, if (!sctp_style(sk, UDP)) sk->sk_state_change(sk); } + + if (sctp_state(asoc, SHUTDOWN_PENDING) && + !sctp_outq_is_empty(&asoc->outqueue)) + sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC); } /* Helper function to delete an association. */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8ec20a64a3f8..e03bb1aab4d0 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -160,23 +160,22 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, /* Small helper function that checks if the chunk length * is of the appropriate length. The 'required_length' argument * is set to be the size of a specific chunk we are testing. - * Return Values: 1 = Valid length - * 0 = Invalid length + * Return Values: true = Valid length + * false = Invalid length * */ -static inline int -sctp_chunk_length_valid(struct sctp_chunk *chunk, - __u16 required_length) +static inline bool +sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length) { __u16 chunk_length = ntohs(chunk->chunk_hdr->length); /* Previously already marked? */ if (unlikely(chunk->pdiscard)) - return 0; + return false; if (unlikely(chunk_length < required_length)) - return 0; + return false; - return 1; + return true; } /********************************************************** @@ -1022,6 +1021,34 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net, return SCTP_DISPOSITION_CONSUME; } +/* resend asoc strreset_chunk. */ +sctp_disposition_t sctp_sf_send_reconf(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *transport = arg; + + if (asoc->overall_error_count >= asoc->max_retrans) { + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_DELETE_TCB; + } + + sctp_chunk_hold(asoc->strreset_chunk); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(asoc->strreset_chunk)); + sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport)); + + return SCTP_DISPOSITION_CONSUME; +} + /* * Process an heartbeat request. * @@ -3237,36 +3264,34 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, struct sctp_chunk *abort; packet = sctp_ootb_pkt_new(net, asoc, chunk); + if (!packet) + return SCTP_DISPOSITION_NOMEM; - if (packet) { - /* Make an ABORT. The T bit will be set if the asoc - * is NULL. - */ - abort = sctp_make_abort(asoc, chunk, 0); - if (!abort) { - sctp_ootb_pkt_free(packet); - return SCTP_DISPOSITION_NOMEM; - } - - /* Reflect vtag if T-Bit is set */ - if (sctp_test_T_bit(abort)) - packet->vtag = ntohl(chunk->sctp_hdr->vtag); + /* Make an ABORT. The T bit will be set if the asoc + * is NULL. + */ + abort = sctp_make_abort(asoc, chunk, 0); + if (!abort) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } - /* Set the skb to the belonging sock for accounting. 
*/ - abort->skb->sk = ep->base.sk; + /* Reflect vtag if T-Bit is set */ + if (sctp_test_T_bit(abort)) + packet->vtag = ntohl(chunk->sctp_hdr->vtag); - sctp_packet_append_chunk(packet, abort); + /* Set the skb to the belonging sock for accounting. */ + abort->skb->sk = ep->base.sk; - sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, - SCTP_PACKET(packet)); + sctp_packet_append_chunk(packet, abort); - SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); - sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - return SCTP_DISPOSITION_CONSUME; - } + SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); - return SCTP_DISPOSITION_NOMEM; + sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return SCTP_DISPOSITION_CONSUME; } /* @@ -3503,45 +3528,43 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, struct sctp_chunk *shut; packet = sctp_ootb_pkt_new(net, asoc, chunk); + if (!packet) + return SCTP_DISPOSITION_NOMEM; - if (packet) { - /* Make an SHUTDOWN_COMPLETE. - * The T bit will be set if the asoc is NULL. - */ - shut = sctp_make_shutdown_complete(asoc, chunk); - if (!shut) { - sctp_ootb_pkt_free(packet); - return SCTP_DISPOSITION_NOMEM; - } - - /* Reflect vtag if T-Bit is set */ - if (sctp_test_T_bit(shut)) - packet->vtag = ntohl(chunk->sctp_hdr->vtag); + /* Make an SHUTDOWN_COMPLETE. + * The T bit will be set if the asoc is NULL. + */ + shut = sctp_make_shutdown_complete(asoc, chunk); + if (!shut) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } - /* Set the skb to the belonging sock for accounting. */ - shut->skb->sk = ep->base.sk; + /* Reflect vtag if T-Bit is set */ + if (sctp_test_T_bit(shut)) + packet->vtag = ntohl(chunk->sctp_hdr->vtag); - sctp_packet_append_chunk(packet, shut); + /* Set the skb to the belonging sock for accounting. */ + shut->skb->sk = ep->base.sk; - sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, - SCTP_PACKET(packet)); + sctp_packet_append_chunk(packet, shut); - SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); - /* If the chunk length is invalid, we don't want to process - * the reset of the packet. - */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); - /* We need to discard the rest of the packet to prevent - * potential bomming attacks from additional bundled chunks. - * This is documented in SCTP Threats ID. - */ + /* If the chunk length is invalid, we don't want to process + * the reset of the packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - } - return SCTP_DISPOSITION_NOMEM; + /* We need to discard the rest of the packet to prevent + * potential bomming attacks from additional bundled chunks. + * This is documented in SCTP Threats ID. + */ + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } /* @@ -3811,6 +3834,60 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, return SCTP_DISPOSITION_DISCARD; } +/* RE-CONFIG Section 5.2 Upon reception of an RECONF Chunk. 
*/ +sctp_disposition_t sctp_sf_do_reconf(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_paramhdr *err_param = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_reconf_chunk *hdr; + union sctp_params param; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + + /* Make sure that the RECONF chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(*hdr))) + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); + + if (!sctp_verify_reconf(asoc, chunk, &err_param)) + return sctp_sf_violation_paramlen(net, ep, asoc, type, arg, + (void *)err_param, commands); + + hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; + sctp_walk_params(param, hdr, params) { + struct sctp_chunk *reply = NULL; + struct sctp_ulpevent *ev = NULL; + + if (param.p->type == SCTP_PARAM_RESET_OUT_REQUEST) + reply = sctp_process_strreset_outreq( + (struct sctp_association *)asoc, param, &ev); + else if (param.p->type == SCTP_PARAM_RESET_IN_REQUEST) + reply = sctp_process_strreset_inreq( + (struct sctp_association *)asoc, param, &ev); + /* More handles for other types will be added here, by now it + * just ignores other types. + */ + + if (ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + + if (reply) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(reply)); + } + + return SCTP_DISPOSITION_CONSUME; +} + /* * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP * @@ -3844,6 +3921,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } + if (!asoc->peer.prsctp_capable) + return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands); + /* Make sure that the FORWARD_TSN chunk has valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, @@ -3912,6 +3992,9 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast( return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } + if (!asoc->peer.prsctp_capable) + return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands); + /* Make sure that the FORWARD_TSN chunk has a valid length. 
*/ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, @@ -5162,6 +5245,19 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net, return SCTP_DISPOSITION_CONSUME; } +/* RE-CONFIG Section 5.1 RECONF Chunk Procedures */ +sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk)); + return SCTP_DISPOSITION_CONSUME; +} + /* * Ignore the primitive event * @@ -6036,8 +6132,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, sctp_transport_route(transport, (union sctp_addr *)&chunk->dest, sctp_sk(net->sctp.ctl_sock)); - packet = sctp_packet_init(&transport->packet, transport, sport, dport); - packet = sctp_packet_config(packet, vtag, 0); + packet = &transport->packet; + sctp_packet_init(packet, transport, sport, dport); + sctp_packet_config(packet, vtag, 0); return packet; @@ -6278,9 +6375,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, * and is invalid. */ ssn = ntohs(data_hdr->ssn); - if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) { + if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid))) return SCTP_IERROR_PROTO_VIOLATION; - } /* Send the data up to the user. Note: Schedule the * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index a987d54b379c..419b18ebb056 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -482,6 +482,32 @@ static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUN TYPE_SCTP_FWD_TSN, }; /*state_fn_t prsctp_chunk_event_table[][] */ +#define TYPE_SCTP_RECONF { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_reconf), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_RECONF */ + +/* The primary index for this table is the chunk type. + * The secondary index for this table is the state. 
+ */ +static const sctp_sm_table_entry_t reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_RECONF, +}; /*state_fn_t reconf_chunk_event_table[][] */ + #define TYPE_SCTP_AUTH { \ /* SCTP_STATE_CLOSED */ \ TYPE_SCTP_FUNC(sctp_sf_ootb), \ @@ -643,6 +669,25 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_ASCONF */ +#define TYPE_SCTP_PRIMITIVE_RECONF { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ +} /* TYPE_SCTP_PRIMITIVE_RECONF */ + /* The primary index for this table is the primitive type. * The secondary index for this table is the state. */ @@ -653,6 +698,7 @@ static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPE TYPE_SCTP_PRIMITIVE_SEND, TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, TYPE_SCTP_PRIMITIVE_ASCONF, + TYPE_SCTP_PRIMITIVE_RECONF, }; #define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ @@ -888,6 +934,25 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } +#define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_send_reconf), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_NONE, TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, @@ -897,6 +962,7 @@ static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][S TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, + TYPE_SCTP_EVENT_TIMEOUT_RECONF, TYPE_SCTP_EVENT_TIMEOUT_SACK, TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; @@ -924,6 +990,10 @@ static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net, return &addip_chunk_event_table[1][state]; } + if (net->sctp.reconf_enable) + if (cid == SCTP_CID_RECONF) + return &reconf_chunk_event_table[0][state]; + if (net->sctp.auth_enable) { if (cid == SCTP_CID_AUTH) return &auth_chunk_event_table[0][state]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 318c6786d653..465a9c8464f9 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -235,8 +235,12 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk, sctp_assoc_t id) { struct sctp_association *addr_asoc = NULL, *id_asoc = NULL; - struct sctp_transport *transport; + struct sctp_af *af = 
sctp_get_af_specific(addr->ss_family); union sctp_addr *laddr = (union sctp_addr *)addr; + struct sctp_transport *transport; + + if (!af || sctp_verify_addr(sk, laddr, af->sockaddr_len)) + return NULL; addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep, laddr, @@ -360,7 +364,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) } } - if (snum && snum < PROT_SOCK && + if (snum && snum < inet_prot_sock(net) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -588,7 +592,7 @@ static int sctp_send_asconf_add_ip(struct sock *sk, list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { /* Clear the source and route cache */ - dst_release(trans->dst); + sctp_transport_dst_release(trans); trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); trans->ssthresh = asoc->peer.i.a_rwnd; @@ -839,7 +843,7 @@ skip_mkasconf: */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { - dst_release(transport->dst); + sctp_transport_dst_release(transport); sctp_transport_route(transport, NULL, sctp_sk(asoc->base.sk)); } @@ -1152,8 +1156,10 @@ static int __sctp_connect(struct sock *sk, * accept new associations, but it SHOULD NOT * be permitted to open new associations. */ - if (ep->base.bind_addr.port < PROT_SOCK && - !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { + if (ep->base.bind_addr.port < + inet_prot_sock(net) && + !ns_capable(net->user_ns, + CAP_NET_BIND_SERVICE)) { err = -EACCES; goto out_free; } @@ -1818,7 +1824,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * but it SHOULD NOT be permitted to open new * associations. */ - if (ep->base.bind_addr.port < PROT_SOCK && + if (ep->base.bind_addr.port < inet_prot_sock(net) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { err = -EACCES; goto out_unlock; @@ -1958,6 +1964,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) err = PTR_ERR(datamsg); goto out_free; } + datamsg->force_delay = !!(msg->msg_flags & MSG_MORE); /* Now send the (possibly) fragmented message. 
*/ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { @@ -2430,7 +2437,6 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc); } else if (asoc) { asoc->pathmtu = params->spp_pathmtu; - sctp_frag_point(asoc, params->spp_pathmtu); } else { sp->pathmtu = params->spp_pathmtu; } @@ -3751,6 +3757,120 @@ out: return retval; } +static int sctp_setsockopt_enable_strreset(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + asoc->strreset_enable = params.assoc_value; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + sp->ep->strreset_enable = params.assoc_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + +static int sctp_setsockopt_reset_streams(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_reset_streams *params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen < sizeof(struct sctp_reset_streams)) + return -EINVAL; + + params = memdup_user(optval, optlen); + if (IS_ERR(params)) + return PTR_ERR(params); + + asoc = sctp_id2assoc(sk, params->srs_assoc_id); + if (!asoc) + goto out; + + retval = sctp_send_reset_streams(asoc, params); + +out: + kfree(params); + return retval; +} + +static int sctp_setsockopt_reset_assoc(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + sctp_assoc_t associd; + int retval = -EINVAL; + + if (optlen != sizeof(associd)) + goto out; + + if (copy_from_user(&associd, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, associd); + if (!asoc) + goto out; + + retval = sctp_send_reset_assoc(asoc); + +out: + return retval; +} + +static int sctp_setsockopt_add_streams(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_add_streams params; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.sas_assoc_id); + if (!asoc) + goto out; + + retval = sctp_send_add_streams(asoc, ¶ms); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3917,6 +4037,18 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_DEFAULT_PRINFO: retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); break; + case SCTP_ENABLE_STREAM_RESET: + retval = sctp_setsockopt_enable_strreset(sk, optval, optlen); + break; + case SCTP_RESET_STREAMS: + retval = sctp_setsockopt_reset_streams(sk, optval, optlen); + break; + case SCTP_RESET_ASSOC: + retval = sctp_setsockopt_reset_assoc(sk, optval, optlen); + break; + case SCTP_ADD_STREAMS: + retval = sctp_setsockopt_add_streams(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -4730,6 +4862,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) if (!asoc) return -EINVAL; + /* If there is a thread waiting on more sndbuf space for + * sending on this asoc, it cannot be peeled. 
+ */ + if (waitqueue_active(&asoc->wait)) + return -EBUSY; + /* An association cannot be branched off from an already peeled-off * socket, nor is this supported for tcp style sockets. */ @@ -6401,6 +6539,47 @@ out: return retval; } +static int sctp_getsockopt_enable_strreset(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->strreset_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->ep->strreset_enable; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6568,6 +6747,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, optlen); break; + case SCTP_ENABLE_STREAM_RESET: + retval = sctp_getsockopt_enable_strreset(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7422,7 +7605,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, */ release_sock(sk); current_timeo = schedule_timeout(current_timeo); - BUG_ON(sk != asoc->base.sk); lock_sock(sk); *timeo_p = current_timeo; diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c deleted file mode 100644 index b9c8521c1a98..000000000000 --- a/net/sctp/ssnmap.c +++ /dev/null @@ -1,125 +0,0 @@ -/* SCTP kernel implementation - * Copyright (c) 2003 International Business Machines, Corp. - * - * This file is part of the SCTP kernel implementation - * - * These functions manipulate sctp SSN tracker. - * - * This SCTP implementation is free software; - * you can redistribute it and/or modify it under the terms of - * the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This SCTP implementation is distributed in the hope that it - * will be useful, but WITHOUT ANY WARRANTY; without even the implied - * ************************ - * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU CC; see the file COPYING. If not, see - * <http://www.gnu.org/licenses/>. - * - * Please send any bug reports or fixes you make to the - * email address(es): - * lksctp developers <linux-sctp@vger.kernel.org> - * - * Written or modified by: - * Jon Grimm <jgrimm@us.ibm.com> - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <net/sctp/sctp.h> -#include <net/sctp/sm.h> - -static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, - __u16 out); - -/* Storage size needed for map includes 2 headers and then the - * specific needs of in or out streams. - */ -static inline size_t sctp_ssnmap_size(__u16 in, __u16 out) -{ - return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16); -} - - -/* Create a new sctp_ssnmap. - * Allocate room to store at least 'len' contiguous TSNs. 
- */ -struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, - gfp_t gfp) -{ - struct sctp_ssnmap *retval; - int size; - - size = sctp_ssnmap_size(in, out); - if (size <= KMALLOC_MAX_SIZE) - retval = kmalloc(size, gfp); - else - retval = (struct sctp_ssnmap *) - __get_free_pages(gfp, get_order(size)); - if (!retval) - goto fail; - - if (!sctp_ssnmap_init(retval, in, out)) - goto fail_map; - - SCTP_DBG_OBJCNT_INC(ssnmap); - - return retval; - -fail_map: - if (size <= KMALLOC_MAX_SIZE) - kfree(retval); - else - free_pages((unsigned long)retval, get_order(size)); -fail: - return NULL; -} - - -/* Initialize a block of memory as a ssnmap. */ -static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, - __u16 out) -{ - memset(map, 0x00, sctp_ssnmap_size(in, out)); - - /* Start 'in' stream just after the map header. */ - map->in.ssn = (__u16 *)&map[1]; - map->in.len = in; - - /* Start 'out' stream just after 'in'. */ - map->out.ssn = &map->in.ssn[in]; - map->out.len = out; - - return map; -} - -/* Clear out the ssnmap streams. */ -void sctp_ssnmap_clear(struct sctp_ssnmap *map) -{ - size_t size; - - size = (map->in.len + map->out.len) * sizeof(__u16); - memset(map->in.ssn, 0x00, size); -} - -/* Dispose of a ssnmap. */ -void sctp_ssnmap_free(struct sctp_ssnmap *map) -{ - int size; - - if (unlikely(!map)) - return; - - size = sctp_ssnmap_size(map->in.len, map->out.len); - if (size <= KMALLOC_MAX_SIZE) - kfree(map); - else - free_pages((unsigned long)map, get_order(size)); - - SCTP_DBG_OBJCNT_DEC(ssnmap); -} diff --git a/net/sctp/stream.c b/net/sctp/stream.c new file mode 100644 index 000000000000..1c6cc04fa3a4 --- /dev/null +++ b/net/sctp/stream.c @@ -0,0 +1,479 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp tsn mapping array. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Xin Long <lucien.xin@gmail.com> + */ + +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> + +struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp) +{ + struct sctp_stream *stream; + int i; + + stream = kzalloc(sizeof(*stream), gfp); + if (!stream) + return NULL; + + stream->outcnt = outcnt; + stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp); + if (!stream->out) { + kfree(stream); + return NULL; + } + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; + + stream->incnt = incnt; + stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp); + if (!stream->in) { + kfree(stream->out); + kfree(stream); + return NULL; + } + + return stream; +} + +void sctp_stream_free(struct sctp_stream *stream) +{ + if (unlikely(!stream)) + return; + + kfree(stream->out); + kfree(stream->in); + kfree(stream); +} + +void sctp_stream_clear(struct sctp_stream *stream) +{ + int i; + + for (i = 0; i < stream->outcnt; i++) + stream->out[i].ssn = 0; + + for (i = 0; i < stream->incnt; i++) + stream->in[i].ssn = 0; +} + +static int sctp_send_reconf(struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + struct net *net = sock_net(asoc->base.sk); + int retval = 0; + + retval = sctp_primitive_RECONF(net, asoc, chunk); + if (retval) + sctp_chunk_free(chunk); + + return retval; +} + +int sctp_send_reset_streams(struct sctp_association *asoc, + struct sctp_reset_streams *params) +{ + struct sctp_stream *stream = asoc->stream; + __u16 i, str_nums, *str_list; + struct sctp_chunk *chunk; + int retval = -EINVAL; + bool out, in; + + if (!asoc->peer.reconf_capable || + !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) { + retval = -ENOPROTOOPT; + goto out; + } + + if (asoc->strreset_outstanding) { + retval = -EINPROGRESS; + goto out; + } + + out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING; + in = params->srs_flags & SCTP_STREAM_RESET_INCOMING; + if (!out && !in) + goto out; + + str_nums = params->srs_number_streams; + str_list = params->srs_stream_list; + if (out && str_nums) + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->outcnt) + goto out; + + if (in && str_nums) + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->incnt) + goto out; + + for (i = 0; i < str_nums; i++) + str_list[i] = htons(str_list[i]); + + chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in); + + for (i = 0; i < str_nums; i++) + str_list[i] = ntohs(str_list[i]); + + if (!chunk) { + retval = -ENOMEM; + goto out; + } + + if (out) { + if (str_nums) + for (i = 0; i < str_nums; i++) + stream->out[str_list[i]].state = + SCTP_STREAM_CLOSED; + else + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_CLOSED; + } + + asoc->strreset_chunk = chunk; + sctp_chunk_hold(asoc->strreset_chunk); + + retval = sctp_send_reconf(asoc, chunk); + if (retval) { + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + if (!out) + goto out; + + if (str_nums) + for (i = 0; i < str_nums; i++) + stream->out[str_list[i]].state = + SCTP_STREAM_OPEN; + else + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; + + goto out; + } + + asoc->strreset_outstanding = out + in; + +out: + return retval; +} + +int sctp_send_reset_assoc(struct sctp_association *asoc) +{ + struct sctp_chunk *chunk = NULL; + int retval; + __u16 i; + + if 
(!asoc->peer.reconf_capable || + !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) + return -ENOPROTOOPT; + + if (asoc->strreset_outstanding) + return -EINPROGRESS; + + chunk = sctp_make_strreset_tsnreq(asoc); + if (!chunk) + return -ENOMEM; + + /* Block further xmit of data until this request is completed */ + for (i = 0; i < asoc->stream->outcnt; i++) + asoc->stream->out[i].state = SCTP_STREAM_CLOSED; + + asoc->strreset_chunk = chunk; + sctp_chunk_hold(asoc->strreset_chunk); + + retval = sctp_send_reconf(asoc, chunk); + if (retval) { + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + + for (i = 0; i < asoc->stream->outcnt; i++) + asoc->stream->out[i].state = SCTP_STREAM_OPEN; + + return retval; + } + + asoc->strreset_outstanding = 1; + + return 0; +} + +int sctp_send_add_streams(struct sctp_association *asoc, + struct sctp_add_streams *params) +{ + struct sctp_stream *stream = asoc->stream; + struct sctp_chunk *chunk = NULL; + int retval = -ENOMEM; + __u32 outcnt, incnt; + __u16 out, in; + + if (!asoc->peer.reconf_capable || + !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { + retval = -ENOPROTOOPT; + goto out; + } + + if (asoc->strreset_outstanding) { + retval = -EINPROGRESS; + goto out; + } + + out = params->sas_outstrms; + in = params->sas_instrms; + outcnt = stream->outcnt + out; + incnt = stream->incnt + in; + if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM || + (!out && !in)) { + retval = -EINVAL; + goto out; + } + + if (out) { + struct sctp_stream_out *streamout; + + streamout = krealloc(stream->out, outcnt * sizeof(*streamout), + GFP_KERNEL); + if (!streamout) + goto out; + + memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); + stream->out = streamout; + } + + if (in) { + struct sctp_stream_in *streamin; + + streamin = krealloc(stream->in, incnt * sizeof(*streamin), + GFP_KERNEL); + if (!streamin) + goto out; + + memset(streamin + stream->incnt, 0, in * sizeof(*streamin)); + stream->in = streamin; + } + + chunk = sctp_make_strreset_addstrm(asoc, out, in); + if (!chunk) + goto out; + + asoc->strreset_chunk = chunk; + sctp_chunk_hold(asoc->strreset_chunk); + + retval = sctp_send_reconf(asoc, chunk); + if (retval) { + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + goto out; + } + + stream->incnt = incnt; + stream->outcnt = outcnt; + + asoc->strreset_outstanding = !!out + !!in; + +out: + return retval; +} + +static sctp_paramhdr_t *sctp_chunk_lookup_strreset_param( + struct sctp_association *asoc, __u32 resp_seq) +{ + struct sctp_chunk *chunk = asoc->strreset_chunk; + struct sctp_reconf_chunk *hdr; + union sctp_params param; + + if (ntohl(resp_seq) != asoc->strreset_outseq || !chunk) + return NULL; + + hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; + sctp_walk_params(param, hdr, params) { + /* sctp_strreset_tsnreq is actually the basic structure + * of all stream reconf params, so it's safe to use it + * to access request_seq. 
+ */ + struct sctp_strreset_tsnreq *req = param.v; + + if (req->request_seq == resp_seq) + return param.v; + } + + return NULL; +} + +struct sctp_chunk *sctp_process_strreset_outreq( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp) +{ + struct sctp_strreset_outreq *outreq = param.v; + struct sctp_stream *stream = asoc->stream; + __u16 i, nums, flags = 0, *str_p = NULL; + __u32 result = SCTP_STRRESET_DENIED; + __u32 request_seq; + + request_seq = ntohl(outreq->request_seq); + + if (ntohl(outreq->send_reset_at_tsn) > + sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) { + result = SCTP_STRRESET_IN_PROGRESS; + goto out; + } + + if (request_seq > asoc->strreset_inseq) { + result = SCTP_STRRESET_ERR_BAD_SEQNO; + goto out; + } else if (request_seq == asoc->strreset_inseq) { + asoc->strreset_inseq++; + } + + /* Check strreset_enable after inseq inc, as sender cannot tell + * the peer doesn't enable strreset after receiving response with + * result denied, as well as to keep consistent with bsd. + */ + if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) + goto out; + + if (asoc->strreset_chunk) { + sctp_paramhdr_t *param_hdr; + struct sctp_transport *t; + + param_hdr = sctp_chunk_lookup_strreset_param( + asoc, outreq->response_seq); + if (!param_hdr || param_hdr->type != + SCTP_PARAM_RESET_IN_REQUEST) { + /* same process with outstanding isn't 0 */ + result = SCTP_STRRESET_ERR_IN_PROGRESS; + goto out; + } + + asoc->strreset_outstanding--; + asoc->strreset_outseq++; + + if (!asoc->strreset_outstanding) { + t = asoc->strreset_chunk->transport; + if (del_timer(&t->reconf_timer)) + sctp_transport_put(t); + + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + } + + flags = SCTP_STREAM_RESET_INCOMING_SSN; + } + + nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2; + if (nums) { + str_p = outreq->list_of_streams; + for (i = 0; i < nums; i++) { + if (ntohs(str_p[i]) >= stream->incnt) { + result = SCTP_STRRESET_ERR_WRONG_SSN; + goto out; + } + } + + for (i = 0; i < nums; i++) + stream->in[ntohs(str_p[i])].ssn = 0; + } else { + for (i = 0; i < stream->incnt; i++) + stream->in[i].ssn = 0; + } + + result = SCTP_STRRESET_PERFORMED; + + *evp = sctp_ulpevent_make_stream_reset_event(asoc, + flags | SCTP_STREAM_RESET_OUTGOING_SSN, nums, str_p, + GFP_ATOMIC); + +out: + return sctp_make_strreset_resp(asoc, result, request_seq); +} + +struct sctp_chunk *sctp_process_strreset_inreq( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp) +{ + struct sctp_strreset_inreq *inreq = param.v; + struct sctp_stream *stream = asoc->stream; + __u32 result = SCTP_STRRESET_DENIED; + struct sctp_chunk *chunk = NULL; + __u16 i, nums, *str_p; + __u32 request_seq; + + request_seq = ntohl(inreq->request_seq); + if (request_seq > asoc->strreset_inseq) { + result = SCTP_STRRESET_ERR_BAD_SEQNO; + goto out; + } else if (request_seq == asoc->strreset_inseq) { + asoc->strreset_inseq++; + } + + if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) + goto out; + + if (asoc->strreset_outstanding) { + result = SCTP_STRRESET_ERR_IN_PROGRESS; + goto out; + } + + nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2; + str_p = inreq->list_of_streams; + for (i = 0; i < nums; i++) { + if (ntohs(str_p[i]) >= stream->outcnt) { + result = SCTP_STRRESET_ERR_WRONG_SSN; + goto out; + } + } + + chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0); + if (!chunk) + goto out; + + if (nums) + for (i = 0; i < nums; i++) + 
stream->out[ntohs(str_p[i])].state = + SCTP_STREAM_CLOSED; + else + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_CLOSED; + + asoc->strreset_chunk = chunk; + asoc->strreset_outstanding = 1; + sctp_chunk_hold(asoc->strreset_chunk); + + *evp = sctp_ulpevent_make_stream_reset_event(asoc, + SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); + +out: + if (!chunk) + chunk = sctp_make_strreset_resp(asoc, result, request_seq); + + return chunk; +} diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a1652ab63918..3379668af368 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -88,9 +88,11 @@ static struct sctp_transport *sctp_transport_init(struct net *net, INIT_LIST_HEAD(&peer->transports); setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, - (unsigned long)peer); + (unsigned long)peer); setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event, - (unsigned long)peer); + (unsigned long)peer); + setup_timer(&peer->reconf_timer, sctp_generate_reconf_event, + (unsigned long)peer); setup_timer(&peer->proto_unreach_timer, sctp_generate_proto_unreach_event, (unsigned long)peer); @@ -144,6 +146,9 @@ void sctp_transport_free(struct sctp_transport *transport) if (del_timer(&transport->T3_rtx_timer)) sctp_transport_put(transport); + if (del_timer(&transport->reconf_timer)) + sctp_transport_put(transport); + /* Delete the ICMP proto unreachable timer if it's active. */ if (del_timer(&transport->proto_unreach_timer)) sctp_association_put(transport->asoc); @@ -211,6 +216,14 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport) sctp_transport_hold(transport); } +void sctp_transport_reset_reconf_timer(struct sctp_transport *transport) +{ + if (!timer_pending(&transport->reconf_timer)) + if (!mod_timer(&transport->reconf_timer, + jiffies + transport->rto)) + sctp_transport_hold(transport); +} + /* This transport has been assigned to an association. * Initialize fields from the association or from the sock itself. * Register the reference count in the association. @@ -227,7 +240,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ if (!transport->dst || transport->dst->obsolete) { - dst_release(transport->dst); + sctp_transport_dst_release(transport); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); } @@ -630,9 +643,7 @@ void sctp_transport_reset(struct sctp_transport *t) t->srtt = 0; t->rttvar = 0; - /* Reset these additional varibles so that we have a clean - * slate. - */ + /* Reset these additional variables so that we have a clean slate. 
*/ t->partial_bytes_acked = 0; t->flight_size = 0; t->error_count = 0; @@ -659,3 +670,17 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t) sctp_transport_hold(t); } } + +/* Drop dst */ +void sctp_transport_dst_release(struct sctp_transport *t) +{ + dst_release(t->dst); + t->dst = NULL; + t->dst_pending_confirm = 0; +} + +/* Schedule neighbour confirm */ +void sctp_transport_dst_confirm(struct sctp_transport *t) +{ + t->dst_pending_confirm = 1; +} diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index bea00058ce35..c8881bc542a0 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -854,6 +854,35 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( return event; } +struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( + const struct sctp_association *asoc, __u16 flags, __u16 stream_num, + __u16 *stream_list, gfp_t gfp) +{ + struct sctp_stream_reset_event *sreset; + struct sctp_ulpevent *event; + struct sk_buff *skb; + int length, i; + + length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num; + event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp); + if (!event) + return NULL; + + skb = sctp_event2skb(event); + sreset = (struct sctp_stream_reset_event *)skb_put(skb, length); + + sreset->strreset_type = SCTP_STREAM_RESET_EVENT; + sreset->strreset_flags = flags; + sreset->strreset_length = length; + sctp_ulpevent_set_owner(event, asoc); + sreset->strreset_assoc_id = sctp_assoc2id(asoc); + + for (i = 0; i < stream_num; i++) + sreset->strreset_stream_list[i] = ntohs(stream_list[i]); + + return event; +} + /* Return the notification type, assuming this is a notification * event. */ diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 84d0fdaf7de9..aa3624d50278 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -760,11 +760,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, struct sk_buff_head *event_list; struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; - struct sctp_stream *in; + struct sctp_stream *stream; __u16 sid, csid, cssn; sid = event->stream; - in = &ulpq->asoc->ssnmap->in; + stream = ulpq->asoc->stream; event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; @@ -782,11 +782,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, if (csid < sid) continue; - if (cssn != sctp_ssn_peek(in, sid)) + if (cssn != sctp_ssn_peek(stream, in, sid)) break; - /* Found it, so mark in the ssnmap. */ - sctp_ssn_next(in, sid); + /* Found it, so mark in the stream. */ + sctp_ssn_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); @@ -849,7 +849,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { __u16 sid, ssn; - struct sctp_stream *in; + struct sctp_stream *stream; /* Check if this message needs ordering. */ if (SCTP_DATA_UNORDERED & event->msg_flags) @@ -858,10 +858,10 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, /* Note: The stream ID must be verified before this routine. */ sid = event->stream; ssn = event->ssn; - in = &ulpq->asoc->ssnmap->in; + stream = ulpq->asoc->stream; /* Is this the expected SSN for this stream ID? */ - if (ssn != sctp_ssn_peek(in, sid)) { + if (ssn != sctp_ssn_peek(stream, in, sid)) { /* We've received something out of order, so find where it * needs to be placed. We order by stream and then by SSN. */ @@ -870,7 +870,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, } /* Mark that the next chunk has been found. 
*/ - sctp_ssn_next(in, sid); + sctp_ssn_next(stream, in, sid); /* Go find any other chunks that were waiting for * ordering. @@ -888,12 +888,12 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_ulpevent *event; - struct sctp_stream *in; + struct sctp_stream *stream; struct sk_buff_head temp; struct sk_buff_head *lobby = &ulpq->lobby; __u16 csid, cssn; - in = &ulpq->asoc->ssnmap->in; + stream = ulpq->asoc->stream; /* We are holding the chunks by stream, by SSN. */ skb_queue_head_init(&temp); @@ -912,7 +912,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) continue; /* see if this ssn has been marked by skipping */ - if (!SSN_lt(cssn, sctp_ssn_peek(in, csid))) + if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); @@ -932,8 +932,8 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) csid = cevent->stream; cssn = cevent->ssn; - if (csid == sid && cssn == sctp_ssn_peek(in, csid)) { - sctp_ssn_next(in, csid); + if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) { + sctp_ssn_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); @@ -955,17 +955,17 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) */ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) { - struct sctp_stream *in; + struct sctp_stream *stream; /* Note: The stream ID must be verified before this routine. */ - in = &ulpq->asoc->ssnmap->in; + stream = ulpq->asoc->stream; /* Is this an old SSN? If so ignore. */ - if (SSN_lt(ssn, sctp_ssn_peek(in, sid))) + if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid))) return; /* Mark that we are no longer expecting this SSN or lower. */ - sctp_ssn_skip(in, sid, ssn); + sctp_ssn_skip(stream, in, sid, ssn); /* Go find any other chunks that were waiting for * ordering and deliver them if needed. diff --git a/net/smc/Kconfig b/net/smc/Kconfig new file mode 100644 index 000000000000..c717ef0896aa --- /dev/null +++ b/net/smc/Kconfig @@ -0,0 +1,20 @@ +config SMC + tristate "SMC socket protocol family" + depends on INET && INFINIBAND + ---help--- + SMC-R provides a "sockets over RDMA" solution making use of + RDMA over Converged Ethernet (RoCE) technology to upgrade + AF_INET TCP connections transparently. + The Linux implementation of the SMC-R solution is designed as + a separate socket family SMC. + + Select this option if you want to run SMC socket applications + +config SMC_DIAG + tristate "SMC: socket monitoring interface" + depends on SMC + ---help--- + Support for SMC socket monitoring interface used by tools such as + smcss. + + if unsure, say Y. 
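The SMC Kconfig help above describes SMC-R as a transparent, RoCE-based upgrade path for AF_INET TCP connections. As a rough userspace sketch (not part of this patch set): an application opens an AF_SMC stream socket but keeps using ordinary AF_INET addresses, and the connection falls back to plain TCP when RDMA setup is not possible. The peer address and port below are placeholders, and AF_SMC is defined locally in case the installed libc headers predate the new family.

/* Hypothetical illustration only: connect through the new AF_SMC family.
 * Assumes a kernel built with CONFIG_SMC; 192.0.2.1:12345 is a placeholder.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_SMC
#define AF_SMC 43	/* value used by the kernel headers of this series */
#endif

int main(void)
{
	struct sockaddr_in peer;
	int fd;

	/* SOCK_STREAM only; addressing stays ordinary AF_INET/sockaddr_in */
	fd = socket(AF_SMC, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket(AF_SMC)");	/* e.g. CONFIG_SMC not enabled */
		return 1;
	}

	memset(&peer, 0, sizeof(peer));
	peer.sin_family = AF_INET;
	peer.sin_port = htons(12345);
	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);

	/* If the peer or fabric is not SMC-capable, the socket falls back
	 * to the internal TCP (CLC) socket transparently.
	 */
	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
		perror("connect");

	close(fd);
	return 0;
}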
diff --git a/net/smc/Makefile b/net/smc/Makefile new file mode 100644 index 000000000000..188104654b54 --- /dev/null +++ b/net/smc/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_SMC) += smc.o +obj-$(CONFIG_SMC_DIAG) += smc_diag.o +smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c new file mode 100644 index 000000000000..5d4208ad029e --- /dev/null +++ b/net/smc/af_smc.c @@ -0,0 +1,1407 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * AF_SMC protocol family socket handler keeping the AF_INET sock address type + * applies to SOCK_STREAM sockets only + * offers an alternative communication option for TCP-protocol sockets + * applicable with RoCE-cards only + * + * Initial restrictions: + * - non-blocking connect postponed + * - IPv6 support postponed + * - support for alternate links postponed + * - partial support for non-blocking sockets only + * - support for urgent data postponed + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + * based on prototype from Frank Blaschka + */ + +#define KMSG_COMPONENT "smc" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/inetdevice.h> +#include <linux/workqueue.h> +#include <linux/in.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/smc.h> + +#include "smc.h" +#include "smc_clc.h" +#include "smc_llc.h" +#include "smc_cdc.h" +#include "smc_core.h" +#include "smc_ib.h" +#include "smc_pnet.h" +#include "smc_tx.h" +#include "smc_rx.h" +#include "smc_close.h" + +static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group + * creation + */ + +struct smc_lgr_list smc_lgr_list = { /* established link groups */ + .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), + .list = LIST_HEAD_INIT(smc_lgr_list.list), +}; + +static void smc_tcp_listen_work(struct work_struct *); + +static void smc_set_keepalive(struct sock *sk, int val) +{ + struct smc_sock *smc = smc_sk(sk); + + smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); +} + +static struct smc_hashinfo smc_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), +}; + +int smc_hash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + struct hlist_head *head; + + head = &h->ht; + + write_lock_bh(&h->lock); + sk_add_node(sk, head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(smc_hash_sk); + +void smc_unhash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + + write_lock_bh(&h->lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(smc_unhash_sk); + +struct proto smc_proto = { + .name = "SMC", + .owner = THIS_MODULE, + .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, + .slab_flags = SLAB_DESTROY_BY_RCU, +}; +EXPORT_SYMBOL_GPL(smc_proto); + +static int smc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = 0; + + if (!sk) + goto out; + + smc = smc_sk(sk); + sock_hold(sk); + if (sk->sk_state == SMC_LISTEN) + /* smc_close_non_accepted() is called and acquires + * sock lock for child sockets again + */ + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + else + lock_sock(sk); + + 
if (smc->use_fallback) { + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); + } else { + rc = smc_close_active(smc); + sock_set_flag(sk, SOCK_DEAD); + sk->sk_shutdown |= SHUTDOWN_MASK; + } + if (smc->clcsock) { + sock_release(smc->clcsock); + smc->clcsock = NULL; + } + + /* detach socket */ + sock_orphan(sk); + sock->sk = NULL; + if (smc->use_fallback) { + schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); + } else if (sk->sk_state == SMC_CLOSED) { + smc_conn_free(&smc->conn); + schedule_delayed_work(&smc->sock_put_work, + SMC_CLOSE_SOCK_PUT_DELAY); + } + sk->sk_prot->unhash(sk); + release_sock(sk); + + sock_put(sk); +out: + return rc; +} + +static void smc_destruct(struct sock *sk) +{ + if (sk->sk_state != SMC_CLOSED) + return; + if (!sock_flag(sk, SOCK_DEAD)) + return; + + sk_refcnt_debug_dec(sk); +} + +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) +{ + struct smc_sock *smc; + struct sock *sk; + + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0); + if (!sk) + return NULL; + + sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ + sk->sk_state = SMC_INIT; + sk->sk_destruct = smc_destruct; + sk->sk_protocol = SMCPROTO_SMC; + smc = smc_sk(sk); + INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_LIST_HEAD(&smc->accept_q); + spin_lock_init(&smc->accept_q_lock); + INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); + sk->sk_prot->hash(sk); + sk_refcnt_debug_inc(sk); + + return sk; +} + +static int smc_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + + /* replicate tests from inet_bind(), to be safe wrt. future changes */ + rc = -EINVAL; + if (addr_len < sizeof(struct sockaddr_in)) + goto out; + + rc = -EAFNOSUPPORT; + /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ + if ((addr->sin_family != AF_INET) && + ((addr->sin_family != AF_UNSPEC) || + (addr->sin_addr.s_addr != htonl(INADDR_ANY)))) + goto out; + + lock_sock(sk); + + /* Check if socket is already active */ + rc = -EINVAL; + if (sk->sk_state != SMC_INIT) + goto out_rel; + + smc->clcsock->sk->sk_reuse = sk->sk_reuse; + rc = kernel_bind(smc->clcsock, uaddr, addr_len); + +out_rel: + release_sock(sk); +out: + return rc; +} + +static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + /* options we don't get control via setsockopt for */ + nsk->sk_type = osk->sk_type; + nsk->sk_sndbuf = osk->sk_sndbuf; + nsk->sk_rcvbuf = osk->sk_rcvbuf; + nsk->sk_sndtimeo = osk->sk_sndtimeo; + nsk->sk_rcvtimeo = osk->sk_rcvtimeo; + nsk->sk_mark = osk->sk_mark; + nsk->sk_priority = osk->sk_priority; + nsk->sk_rcvlowat = osk->sk_rcvlowat; + nsk->sk_bound_dev_if = osk->sk_bound_dev_if; + nsk->sk_err = osk->sk_err; + + nsk->sk_flags &= ~mask; + nsk->sk_flags |= osk->sk_flags & mask; +} + +#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_BROADCAST) | \ + (1UL << SOCK_TIMESTAMP) | \ + (1UL << SOCK_DBG) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_RCVTSTAMPNS) | \ + (1UL << SOCK_LOCALROUTE) | \ + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ + (1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_WIFI_STATUS) | \ + (1UL << SOCK_NOFCS) | \ + (1UL << SOCK_FILTER_LOCKED)) +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from 
net/core) + */ +static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) +{ + smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); +} + +#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_DBG)) +/* copy only settings and flags relevant for smc from clc to smc socket */ +static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) +{ + smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); +} + +/* determine subnet and mask of internal TCP socket */ +int smc_netinfo_by_tcpsk(struct socket *clcsock, + __be32 *subnet, u8 *prefix_len) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct sockaddr_in addr; + int rc = -ENOENT; + int len; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + + /* get address to which the internal TCP socket is bound */ + kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); + /* analyze IPv4 specific data of net_device belonging to TCP socket */ + for_ifa(dst->dev->ip_ptr) { + if (ifa->ifa_address != addr.sin_addr.s_addr) + continue; + *prefix_len = inet_mask_len(ifa->ifa_mask); + *subnet = ifa->ifa_address & ifa->ifa_mask; + rc = 0; + break; + } endfor_ifa(dst->dev->ip_ptr); + +out_rel: + dst_release(dst); +out: + return rc; +} + +static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) +{ + struct smc_link_group *lgr = smc->conn.lgr; + struct smc_link *link; + int rest; + int rc; + + link = &lgr->lnk[SMC_SINGLE_LINK]; + /* receive CONFIRM LINK request from server over RoCE fabric */ + rest = wait_for_completion_interruptible_timeout( + &link->llc_confirm, + SMC_LLC_WAIT_FIRST_TIME); + if (rest <= 0) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE); + return rc; + } + + rc = smc_ib_modify_qp_rts(link); + if (rc) + return SMC_CLC_DECL_INTERR; + + smc_wr_remember_qp_attr(link); + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link, + link->smcibdev->mac[link->ibport - 1], + gid, SMC_LLC_RESP); + if (rc < 0) + return SMC_CLC_DECL_TCL; + + return rc; +} + +static void smc_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + smc->conn.peer_conn_idx = clc->conn_idx; + smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); + smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size); + atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); +} + +static void smc_link_save_peer_info(struct smc_link *link, + struct smc_clc_msg_accept_confirm *clc) +{ + link->peer_qpn = ntoh24(clc->qpn); + memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE); + memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac)); + link->peer_psn = ntoh24(clc->psn); + link->peer_mtu = clc->qp_mtu; +} + +/* setup for RDMA connection of client */ +static int smc_connect_rdma(struct smc_sock *smc) +{ + struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr; + struct smc_clc_msg_accept_confirm aclc; + int local_contact = SMC_FIRST_CONTACT; + struct smc_ib_device *smcibdev; + struct smc_link *link; + u8 srv_first_contact; + int reason_code = 0; + int rc = 0; + u8 ibport; + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(smc)) { + reason_code = SMC_CLC_DECL_IPSEC; + goto decline_rdma; + } + + /* PNET table look up: search active ib_device and port + * within same PNETID that also contains the ethernet device + 
* used for the internal TCP socket + */ + smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport); + if (!smcibdev) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* do inband token exchange */ + reason_code = smc_clc_send_proposal(smc, smcibdev, ibport); + if (reason_code < 0) { + rc = reason_code; + goto out_err; + } + if (reason_code > 0) /* configuration error */ + goto decline_rdma; + /* receive SMC Accept CLC message */ + reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc), + SMC_CLC_ACCEPT); + if (reason_code < 0) { + rc = reason_code; + goto out_err; + } + if (reason_code > 0) + goto decline_rdma; + + srv_first_contact = aclc.hdr.flag; + mutex_lock(&smc_create_lgr_pending); + local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev, + ibport, &aclc.lcl, srv_first_contact); + if (local_contact < 0) { + rc = local_contact; + if (rc == -ENOMEM) + reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ + else if (rc == -ENOLINK) + reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ + goto decline_rdma_unlock; + } + link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + + smc_conn_save_peer_info(smc, &aclc); + + rc = smc_sndbuf_create(smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma_unlock; + } + rc = smc_rmb_create(smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma_unlock; + } + + if (local_contact == SMC_FIRST_CONTACT) + smc_link_save_peer_info(link, &aclc); + + rc = smc_rmb_rtoken_handling(&smc->conn, &aclc); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma_unlock; + } + + if (local_contact == SMC_FIRST_CONTACT) { + rc = smc_ib_ready_link(link); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma_unlock; + } + } + + rc = smc_clc_send_confirm(smc); + if (rc) + goto out_err_unlock; + + if (local_contact == SMC_FIRST_CONTACT) { + /* QP confirmation over RoCE fabric */ + reason_code = smc_clnt_conf_first_link( + smc, &smcibdev->gid[ibport - 1]); + if (reason_code < 0) { + rc = reason_code; + goto out_err_unlock; + } + if (reason_code > 0) + goto decline_rdma_unlock; + } + + mutex_unlock(&smc_create_lgr_pending); + smc_tx_init(smc); + smc_rx_init(smc); + +out_connected: + smc_copy_sock_settings_to_clc(smc); + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + + return rc ? 
rc : local_contact; + +decline_rdma_unlock: + mutex_unlock(&smc_create_lgr_pending); + smc_conn_free(&smc->conn); +decline_rdma: + /* RDMA setup failed, switch back to TCP */ + smc->use_fallback = true; + if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { + rc = smc_clc_send_decline(smc, reason_code, 0); + if (rc < sizeof(struct smc_clc_msg_decline)) + goto out_err; + } + goto out_connected; + +out_err_unlock: + mutex_unlock(&smc_create_lgr_pending); + smc_conn_free(&smc->conn); +out_err: + return rc; +} + +static int smc_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EINVAL; + + smc = smc_sk(sk); + + /* separate smc parameter checking to be safe */ + if (alen < sizeof(addr->sa_family)) + goto out_err; + if (addr->sa_family != AF_INET) + goto out_err; + smc->addr = addr; /* needed for nonblocking connect */ + + lock_sock(sk); + switch (sk->sk_state) { + default: + goto out; + case SMC_ACTIVE: + rc = -EISCONN; + goto out; + case SMC_INIT: + rc = 0; + break; + } + + smc_copy_sock_settings_to_clc(smc); + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc) + goto out; + + /* setup RDMA connection */ + rc = smc_connect_rdma(smc); + if (rc < 0) + goto out; + else + rc = 0; /* success cases including fallback */ + +out: + release_sock(sk); +out_err: + return rc; +} + +static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) +{ + struct sock *sk = &lsmc->sk; + struct socket *new_clcsock; + struct sock *new_sk; + int rc; + + release_sock(&lsmc->sk); + new_sk = smc_sock_alloc(sock_net(sk), NULL); + if (!new_sk) { + rc = -ENOMEM; + lsmc->sk.sk_err = ENOMEM; + *new_smc = NULL; + lock_sock(&lsmc->sk); + goto out; + } + *new_smc = smc_sk(new_sk); + + rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); + lock_sock(&lsmc->sk); + if (rc < 0) { + lsmc->sk.sk_err = -rc; + new_sk->sk_state = SMC_CLOSED; + sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); + sock_put(new_sk); + *new_smc = NULL; + goto out; + } + if (lsmc->sk.sk_state == SMC_CLOSED) { + if (new_clcsock) + sock_release(new_clcsock); + new_sk->sk_state = SMC_CLOSED; + sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); + sock_put(new_sk); + *new_smc = NULL; + goto out; + } + + (*new_smc)->clcsock = new_clcsock; +out: + return rc; +} + +/* add a just created sock to the accept queue of the listen sock as + * candidate for a following socket accept call from user space + */ +static void smc_accept_enqueue(struct sock *parent, struct sock *sk) +{ + struct smc_sock *par = smc_sk(parent); + + sock_hold(sk); + spin_lock(&par->accept_q_lock); + list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); + spin_unlock(&par->accept_q_lock); + sk_acceptq_added(parent); +} + +/* remove a socket from the accept queue of its parental listening socket */ +static void smc_accept_unlink(struct sock *sk) +{ + struct smc_sock *par = smc_sk(sk)->listen_smc; + + spin_lock(&par->accept_q_lock); + list_del_init(&smc_sk(sk)->accept_q); + spin_unlock(&par->accept_q_lock); + sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); + sock_put(sk); +} + +/* remove a sock from the accept queue to bind it to a new socket created + * for a socket accept call from user space + */ +struct sock *smc_accept_dequeue(struct sock *parent, + struct socket *new_sock) +{ + struct smc_sock *isk, *n; + struct sock *new_sk; + + list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) { + new_sk = (struct sock *)isk; + + 
smc_accept_unlink(new_sk); + if (new_sk->sk_state == SMC_CLOSED) { + /* tbd in follow-on patch: close this sock */ + continue; + } + if (new_sock) + sock_graft(new_sk, new_sock); + return new_sk; + } + return NULL; +} + +/* clean up for a created but never accepted sock */ +void smc_close_non_accepted(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + sock_hold(sk); + lock_sock(sk); + if (!sk->sk_lingertime) + /* wait for peer closing */ + sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; + if (!smc->use_fallback) + smc_close_active(smc); + if (smc->clcsock) { + struct socket *tcp; + + tcp = smc->clcsock; + smc->clcsock = NULL; + sock_release(tcp); + } + sock_set_flag(sk, SOCK_DEAD); + sk->sk_shutdown |= SHUTDOWN_MASK; + if (smc->use_fallback) { + schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); + } else { + smc_conn_free(&smc->conn); + schedule_delayed_work(&smc->sock_put_work, + SMC_CLOSE_SOCK_PUT_DELAY); + } + release_sock(sk); + sock_put(sk); +} + +static int smc_serv_conf_first_link(struct smc_sock *smc) +{ + struct smc_link_group *lgr = smc->conn.lgr; + struct smc_link *link; + int rest; + int rc; + + link = &lgr->lnk[SMC_SINGLE_LINK]; + /* send CONFIRM LINK request to client over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_REQ); + if (rc < 0) + return SMC_CLC_DECL_TCL; + + /* receive CONFIRM LINK response from client over the RoCE fabric */ + rest = wait_for_completion_interruptible_timeout( + &link->llc_confirm_resp, + SMC_LLC_WAIT_FIRST_TIME); + if (rest <= 0) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE); + } + + return rc; +} + +/* setup for RDMA connection of server */ +static void smc_listen_work(struct work_struct *work) +{ + struct smc_sock *new_smc = container_of(work, struct smc_sock, + smc_listen_work); + struct socket *newclcsock = new_smc->clcsock; + struct smc_sock *lsmc = new_smc->listen_smc; + struct smc_clc_msg_accept_confirm cclc; + int local_contact = SMC_REUSE_CONTACT; + struct sock *newsmcsk = &new_smc->sk; + struct smc_clc_msg_proposal pclc; + struct smc_ib_device *smcibdev; + struct sockaddr_in peeraddr; + struct smc_link *link; + int reason_code = 0; + int rc = 0, len; + __be32 subnet; + u8 prefix_len; + u8 ibport; + + /* do inband token exchange - + *wait for and receive SMC Proposal CLC message + */ + reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc), + SMC_CLC_PROPOSAL); + if (reason_code < 0) + goto out_err; + if (reason_code > 0) + goto decline_rdma; + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(new_smc)) { + reason_code = SMC_CLC_DECL_IPSEC; + goto decline_rdma; + } + + /* PNET table look up: search active ib_device and port + * within same PNETID that also contains the ethernet device + * used for the internal TCP socket + */ + smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport); + if (!smcibdev) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* determine subnet and mask from internal TCP socket */ + rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len); + if (rc) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + if ((pclc.outgoing_subnet != subnet) || + (pclc.prefix_len != prefix_len)) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* get address of the peer connected to 
the internal TCP socket */ + kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len); + + /* allocate connection / link group */ + mutex_lock(&smc_create_lgr_pending); + local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, + smcibdev, ibport, &pclc.lcl, 0); + if (local_contact == SMC_REUSE_CONTACT) + /* lock no longer needed, free it due to following + * smc_clc_wait_msg() call + */ + mutex_unlock(&smc_create_lgr_pending); + if (local_contact < 0) { + rc = local_contact; + if (rc == -ENOMEM) + reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ + else if (rc == -ENOLINK) + reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ + goto decline_rdma; + } + link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + + rc = smc_sndbuf_create(new_smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma; + } + rc = smc_rmb_create(new_smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma; + } + + rc = smc_clc_send_accept(new_smc, local_contact); + if (rc) + goto out_err; + + /* receive SMC Confirm CLC message */ + reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM); + if (reason_code < 0) + goto out_err; + if (reason_code > 0) + goto decline_rdma; + smc_conn_save_peer_info(new_smc, &cclc); + if (local_contact == SMC_FIRST_CONTACT) + smc_link_save_peer_info(link, &cclc); + + rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma; + } + + if (local_contact == SMC_FIRST_CONTACT) { + rc = smc_ib_ready_link(link); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma; + } + /* QP confirmation over RoCE fabric */ + reason_code = smc_serv_conf_first_link(new_smc); + if (reason_code < 0) { + /* peer is not aware of a problem */ + rc = reason_code; + goto out_err; + } + if (reason_code > 0) + goto decline_rdma; + } + + smc_tx_init(new_smc); + smc_rx_init(new_smc); + +out_connected: + sk_refcnt_debug_inc(newsmcsk); + if (newsmcsk->sk_state == SMC_INIT) + newsmcsk->sk_state = SMC_ACTIVE; +enqueue: + if (local_contact == SMC_FIRST_CONTACT) + mutex_unlock(&smc_create_lgr_pending); + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); + if (lsmc->sk.sk_state == SMC_LISTEN) { + smc_accept_enqueue(&lsmc->sk, newsmcsk); + } else { /* no longer listening */ + smc_close_non_accepted(newsmcsk); + } + release_sock(&lsmc->sk); + + /* Wake up accept */ + lsmc->sk.sk_data_ready(&lsmc->sk); + sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ + return; + +decline_rdma: + /* RDMA setup failed, switch back to TCP */ + smc_conn_free(&new_smc->conn); + new_smc->use_fallback = true; + if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { + rc = smc_clc_send_decline(new_smc, reason_code, 0); + if (rc < sizeof(struct smc_clc_msg_decline)) + goto out_err; + } + goto out_connected; + +out_err: + newsmcsk->sk_state = SMC_CLOSED; + smc_conn_free(&new_smc->conn); + goto enqueue; /* queue new sock with sk_err set */ +} + +static void smc_tcp_listen_work(struct work_struct *work) +{ + struct smc_sock *lsmc = container_of(work, struct smc_sock, + tcp_listen_work); + struct smc_sock *new_smc; + int rc = 0; + + lock_sock(&lsmc->sk); + while (lsmc->sk.sk_state == SMC_LISTEN) { + rc = smc_clcsock_accept(lsmc, &new_smc); + if (rc) + goto out; + if (!new_smc) + continue; + + new_smc->listen_smc = lsmc; + new_smc->use_fallback = false; /* assume rdma capability first*/ + sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */ + INIT_WORK(&new_smc->smc_listen_work, 
smc_listen_work); + smc_copy_sock_settings_to_smc(new_smc); + schedule_work(&new_smc->smc_listen_work); + } + +out: + release_sock(&lsmc->sk); + lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */ +} + +static int smc_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + lock_sock(sk); + + rc = -EINVAL; + if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN)) + goto out; + + rc = 0; + if (sk->sk_state == SMC_LISTEN) { + sk->sk_max_ack_backlog = backlog; + goto out; + } + /* some socket options are handled in core, so we could not apply + * them to the clc socket -- copy smc socket options to clc socket + */ + smc_copy_sock_settings_to_clc(smc); + + rc = kernel_listen(smc->clcsock, backlog); + if (rc) + goto out; + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = SMC_LISTEN; + INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + schedule_work(&smc->tcp_listen_work); + +out: + release_sock(sk); + return rc; +} + +static int smc_accept(struct socket *sock, struct socket *new_sock, + int flags) +{ + struct sock *sk = sock->sk, *nsk; + DECLARE_WAITQUEUE(wait, current); + struct smc_sock *lsmc; + long timeo; + int rc = 0; + + lsmc = smc_sk(sk); + lock_sock(sk); + + if (lsmc->sk.sk_state != SMC_LISTEN) { + rc = -EINVAL; + goto out; + } + + /* Wait for an incoming connection */ + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + add_wait_queue_exclusive(sk_sleep(sk), &wait); + while (!(nsk = smc_accept_dequeue(sk, new_sock))) { + set_current_state(TASK_INTERRUPTIBLE); + if (!timeo) { + rc = -EAGAIN; + break; + } + release_sock(sk); + timeo = schedule_timeout(timeo); + /* wakeup by sk_data_ready in smc_listen_work() */ + sched_annotate_sleep(); + lock_sock(sk); + if (signal_pending(current)) { + rc = sock_intr_errno(timeo); + break; + } + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + if (!rc) + rc = sock_error(nsk); + +out: + release_sock(sk); + return rc; +} + +static int smc_getname(struct socket *sock, struct sockaddr *addr, + int *len, int peer) +{ + struct smc_sock *smc; + + if (peer && (sock->sk->sk_state != SMC_ACTIVE) && + (sock->sk->sk_state != SMC_APPCLOSEWAIT1)) + return -ENOTCONN; + + smc = smc_sk(sock->sk); + + return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer); +} + +static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EPIPE; + + smc = smc_sk(sk); + lock_sock(sk); + if ((sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_INIT)) + goto out; + if (smc->use_fallback) + rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); + else + rc = smc_tx_sendmsg(smc, msg, len); +out: + release_sock(sk); + return rc; +} + +static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -ENOTCONN; + + smc = smc_sk(sk); + lock_sock(sk); + if ((sk->sk_state == SMC_INIT) || + (sk->sk_state == SMC_LISTEN) || + (sk->sk_state == SMC_CLOSED)) + goto out; + + if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { + rc = 0; + goto out; + } + + if (smc->use_fallback) + rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); + else + rc = smc_rx_recvmsg(smc, msg, len, flags); + +out: + release_sock(sk); + return rc; +} + +static unsigned int smc_accept_poll(struct sock *parent) +{ + struct smc_sock 
*isk; + struct sock *sk; + + lock_sock(parent); + list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) { + sk = (struct sock *)isk; + + if (sk->sk_state == SMC_ACTIVE) { + release_sock(parent); + return POLLIN | POLLRDNORM; + } + } + release_sock(parent); + + return 0; +} + +static unsigned int smc_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask = 0; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sock->sk); + if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { + /* delegate to CLC child sock */ + mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); + /* if non-blocking connect finished ... */ + lock_sock(sk); + if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) { + sk->sk_err = smc->clcsock->sk->sk_err; + if (sk->sk_err) { + mask |= POLLERR; + } else { + rc = smc_connect_rdma(smc); + if (rc < 0) + mask |= POLLERR; + else + /* success cases including fallback */ + mask |= POLLOUT | POLLWRNORM; + } + } + release_sock(sk); + } else { + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == SMC_LISTEN) + /* woken up by sk_data_ready in smc_listen_work() */ + mask |= smc_accept_poll(sk); + if (sk->sk_err) + mask |= POLLERR; + if (atomic_read(&smc->conn.sndbuf_space) || + (sk->sk_shutdown & SEND_SHUTDOWN)) { + mask |= POLLOUT | POLLWRNORM; + } else { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + if (atomic_read(&smc->conn.bytes_to_rcv)) + mask |= POLLIN | POLLRDNORM; + if ((sk->sk_shutdown == SHUTDOWN_MASK) || + (sk->sk_state == SMC_CLOSED)) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM | POLLRDHUP; + if (sk->sk_state == SMC_APPCLOSEWAIT1) + mask |= POLLIN; + + } + + return mask; +} + +static int smc_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EINVAL; + int rc1 = 0; + + smc = smc_sk(sk); + + if ((how < SHUT_RD) || (how > SHUT_RDWR)) + return rc; + + lock_sock(sk); + + rc = -ENOTCONN; + if ((sk->sk_state != SMC_LISTEN) && + (sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_PEERCLOSEWAIT1) && + (sk->sk_state != SMC_PEERCLOSEWAIT2) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_APPCLOSEWAIT2) && + (sk->sk_state != SMC_APPFINCLOSEWAIT)) + goto out; + if (smc->use_fallback) { + rc = kernel_sock_shutdown(smc->clcsock, how); + sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; + if (sk->sk_shutdown == SHUTDOWN_MASK) + sk->sk_state = SMC_CLOSED; + goto out; + } + switch (how) { + case SHUT_RDWR: /* shutdown in both directions */ + rc = smc_close_active(smc); + break; + case SHUT_WR: + rc = smc_close_shutdown_write(smc); + break; + case SHUT_RD: + if (sk->sk_state == SMC_LISTEN) + rc = smc_close_active(smc); + else + rc = 0; + /* nothing more to do because peer is not involved */ + break; + } + rc1 = kernel_sock_shutdown(smc->clcsock, how); + /* map sock_shutdown_cmd constants to sk_shutdown value range */ + sk->sk_shutdown |= how + 1; + +out: + release_sock(sk); + return rc ? 
rc : rc1; +} + +static int smc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + + smc = smc_sk(sk); + + /* generic setsockopts reaching us here always apply to the + * CLC socket + */ + return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); +} + +static int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + + smc = smc_sk(sock->sk); + /* socket options apply to the CLC socket */ + return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + optval, optlen); +} + +static int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct smc_sock *smc; + + smc = smc_sk(sock->sk); + if (smc->use_fallback) + return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); + else + return sock_no_ioctl(sock, cmd, arg); +} + +static ssize_t smc_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EPIPE; + + smc = smc_sk(sk); + lock_sock(sk); + if (sk->sk_state != SMC_ACTIVE) + goto out; + if (smc->use_fallback) + rc = kernel_sendpage(smc->clcsock, page, offset, + size, flags); + else + rc = sock_no_sendpage(sock, page, offset, size, flags); + +out: + release_sock(sk); + return rc; +} + +static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -ENOTCONN; + + smc = smc_sk(sk); + lock_sock(sk); + if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED)) + goto out; + if (smc->use_fallback) { + rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, + pipe, len, flags); + } else { + rc = -EOPNOTSUPP; + } +out: + release_sock(sk); + return rc; +} + +/* must look like tcp */ +static const struct proto_ops smc_sock_ops = { + .family = PF_SMC, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = smc_sendpage, + .splice_read = smc_splice_read, +}; + +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct smc_sock *smc; + struct sock *sk; + int rc; + + rc = -ESOCKTNOSUPPORT; + if (sock->type != SOCK_STREAM) + goto out; + + rc = -EPROTONOSUPPORT; + if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP)) + goto out; + + rc = -ENOBUFS; + sock->ops = &smc_sock_ops; + sk = smc_sock_alloc(net, sock); + if (!sk) + goto out; + + /* create internal TCP socket for CLC handshake and fallback */ + smc = smc_sk(sk); + smc->use_fallback = false; /* assume rdma capability first */ + rc = sock_create_kern(net, PF_INET, SOCK_STREAM, + IPPROTO_TCP, &smc->clcsock); + if (rc) + sk_common_release(sk); + smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); + smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); + +out: + return rc; +} + +static const struct net_proto_family smc_sock_family_ops = { + .family = PF_SMC, + .owner = THIS_MODULE, + .create = smc_create, +}; + +static int __init 
smc_init(void) +{ + int rc; + + rc = smc_pnet_init(); + if (rc) + return rc; + + rc = smc_llc_init(); + if (rc) { + pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); + goto out_pnet; + } + + rc = smc_cdc_init(); + if (rc) { + pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); + goto out_pnet; + } + + rc = proto_register(&smc_proto, 1); + if (rc) { + pr_err("%s: proto_register fails with %d\n", __func__, rc); + goto out_pnet; + } + + rc = sock_register(&smc_sock_family_ops); + if (rc) { + pr_err("%s: sock_register fails with %d\n", __func__, rc); + goto out_proto; + } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + + rc = smc_ib_register_client(); + if (rc) { + pr_err("%s: ib_register fails with %d\n", __func__, rc); + goto out_sock; + } + + return 0; + +out_sock: + sock_unregister(PF_SMC); +out_proto: + proto_unregister(&smc_proto); +out_pnet: + smc_pnet_exit(); + return rc; +} + +static void __exit smc_exit(void) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_freeing_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!list_empty(&smc_lgr_list.list)) + list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); + spin_unlock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { + list_del_init(&lgr->list); + smc_lgr_free(lgr); /* free link group */ + } + smc_ib_unregister_client(); + sock_unregister(PF_SMC); + proto_unregister(&smc_proto); + smc_pnet_exit(); +} + +module_init(smc_init); +module_exit(smc_exit); + +MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("smc socket address family"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_SMC); diff --git a/net/smc/smc.h b/net/smc/smc.h new file mode 100644 index 000000000000..ee5fbea24549 --- /dev/null +++ b/net/smc/smc.h @@ -0,0 +1,274 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the SMC module (socket related) + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ +#ifndef __SMC_H +#define __SMC_H + +#include <linux/socket.h> +#include <linux/types.h> +#include <linux/compiler.h> /* __aligned */ +#include <net/sock.h> + +#include "smc_ib.h" + +#define SMCPROTO_SMC 0 /* SMC protocol */ + +#define SMC_MAX_PORTS 2 /* Max # of ports */ + +extern struct proto smc_proto; + +#ifdef ATOMIC64_INIT +#define KERNEL_HAS_ATOMIC64 +#endif + +enum smc_state { /* possible states of an SMC socket */ + SMC_ACTIVE = 1, + SMC_INIT = 2, + SMC_CLOSED = 7, + SMC_LISTEN = 10, + /* normal close */ + SMC_PEERCLOSEWAIT1 = 20, + SMC_PEERCLOSEWAIT2 = 21, + SMC_APPFINCLOSEWAIT = 24, + SMC_APPCLOSEWAIT1 = 22, + SMC_APPCLOSEWAIT2 = 23, + SMC_PEERFINCLOSEWAIT = 25, + /* abnormal close */ + SMC_PEERABORTWAIT = 26, + SMC_PROCESSABORT = 27, +}; + +struct smc_link_group; + +struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ + u8 type; +} __aligned(1); + +struct smc_cdc_conn_state_flags { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 peer_done_writing : 1; /* Sending done indicator */ + u8 peer_conn_closed : 1; /* Peer connection closed indicator */ + u8 peer_conn_abort : 1; /* Abnormal close indicator */ + u8 reserved : 5; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 5; + u8 peer_conn_abort : 1; + u8 peer_conn_closed : 1; + u8 peer_done_writing : 1; +#endif +}; + +struct smc_cdc_producer_flags { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 write_blocked : 1; /* Writing Blocked, no rx buf space */ + u8 urg_data_pending : 1; /* Urgent Data Pending */ + u8 urg_data_present : 1; /* Urgent Data Present */ + u8 cons_curs_upd_req : 1; /* cursor update requested */ + u8 failover_validation : 1;/* message replay due to failover */ + u8 reserved : 3; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 3; + u8 failover_validation : 1; + u8 cons_curs_upd_req : 1; + u8 urg_data_present : 1; + u8 urg_data_pending : 1; + u8 write_blocked : 1; +#endif +}; + +/* in host byte order */ +union smc_host_cursor { /* SMC cursor - an offset in an RMBE */ + struct { + u16 reserved; + u16 wrap; /* window wrap sequence number */ + u32 count; /* cursor (= offset) part */ + }; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); + +/* in host byte order, except for flag bitfields in network byte order */ +struct smc_host_cdc_msg { /* Connection Data Control message */ + struct smc_wr_rx_hdr common; /* .type = 0xFE */ + u8 len; /* length = 44 */ + u16 seqno; /* connection seq # */ + u32 token; /* alert_token */ + union smc_host_cursor prod; /* producer cursor */ + union smc_host_cursor cons; /* consumer cursor, + * piggy backed "ack" + */ + struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */ + struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/ + u8 reserved[18]; +} __aligned(8); + +struct smc_connection { + struct rb_node alert_node; + struct smc_link_group *lgr; /* link group of connection */ + u32 alert_token_local; /* unique conn. 
id */ + u8 peer_conn_idx; /* from tcp handshake */ + int peer_rmbe_size; /* size of peer rx buffer */ + atomic_t peer_rmbe_space;/* remaining free bytes in peer + * rmbe + */ + int rtoken_idx; /* idx to peer RMB rkey/addr */ + + struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ + int sndbuf_size; /* sndbuf size <== sock wmem */ + struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ + int rmbe_size; /* RMBE size <== sock rmem */ + int rmbe_size_short;/* compressed notation */ + int rmbe_update_limit; + /* lower limit for consumer + * cursor update + */ + + struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging + * buffer for CDC msg send + * .prod cf. TCP snd_nxt + * .cons cf. TCP sends ack + */ + union smc_host_cursor tx_curs_prep; /* tx - prepared data + * snd_max..wmem_alloc + */ + union smc_host_cursor tx_curs_sent; /* tx - sent data + * snd_nxt ? + */ + union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer + * snd-wnd-begin ? + */ + atomic_t sndbuf_space; /* remaining space in sndbuf */ + u16 tx_cdc_seq; /* sequence # for CDC send */ + spinlock_t send_lock; /* protect wr_sends */ + struct work_struct tx_work; /* retry of smc_cdc_msg_send */ + + struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. + * .prod cf. TCP rcv_nxt + * .cons cf. TCP snd_una + */ + union smc_host_cursor rx_curs_confirmed; /* confirmed to peer + * source of snd_una ? + */ + atomic_t bytes_to_rcv; /* arrived data, + * not yet received + */ +#ifndef KERNEL_HAS_ATOMIC64 + spinlock_t acurs_lock; /* protect cursors */ +#endif +}; + +struct smc_sock { /* smc sock container */ + struct sock sk; + struct socket *clcsock; /* internal tcp socket */ + struct smc_connection conn; /* smc connection */ + struct sockaddr *addr; /* inet connect address */ + struct smc_sock *listen_smc; /* listen parent */ + struct work_struct tcp_listen_work;/* handle tcp socket accepts */ + struct work_struct smc_listen_work;/* prepare new accept socket */ + struct list_head accept_q; /* sockets to be accepted */ + spinlock_t accept_q_lock; /* protects accept_q */ + struct delayed_work sock_put_work; /* final socket freeing */ + bool use_fallback; /* fallback to tcp */ + u8 wait_close_tx_prepared : 1; + /* shutdown wr or close + * started, waiting for unsent + * data to be sent + */ +}; + +static inline struct smc_sock *smc_sk(const struct sock *sk) +{ + return (struct smc_sock *)sk; +} + +#define SMC_SYSTEMID_LEN 8 + +extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ + +/* convert an u32 value into network byte order, store it into a 3 byte field */ +static inline void hton24(u8 *net, u32 host) +{ + __be32 t; + + t = cpu_to_be32(host); + memcpy(net, ((u8 *)&t) + 1, 3); +} + +/* convert a received 3 byte field into host byte order*/ +static inline u32 ntoh24(u8 *net) +{ + __be32 t = 0; + + memcpy(((u8 *)&t) + 1, net, 3); + return be32_to_cpu(t); +} + +#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ + +#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */ +/* theoretically, the RFC states that largest size would be 512K, + * i.e. compressed 5 and thus 6 sizes (0..5), despite + * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) + */ + +/* convert the RMB size into the compressed notation - minimum 16K. + * In contrast to plain ilog2, this rounds towards the next power of 2, + * so the socket application gets at least its desired sndbuf / rcvbuf size. 
+ */ +static inline u8 smc_compress_bufsize(int size) +{ + u8 compressed; + + if (size <= SMC_BUF_MIN_SIZE) + return 0; + + size = (size - 1) >> 14; + compressed = ilog2(size) + 1; + if (compressed >= SMC_RMBE_SIZES) + compressed = SMC_RMBE_SIZES - 1; + return compressed; +} + +/* convert the RMB size from compressed notation into integer */ +static inline int smc_uncompress_bufsize(u8 compressed) +{ + u32 size; + + size = 0x00000001 << (((int)compressed) + 14); + return (int)size; +} + +#ifdef CONFIG_XFRM +static inline bool using_ipsec(struct smc_sock *smc) +{ + return (smc->clcsock->sk->sk_policy[0] || + smc->clcsock->sk->sk_policy[1]) ? 1 : 0; +} +#else +static inline bool using_ipsec(struct smc_sock *smc) +{ + return 0; +} +#endif + +struct smc_clc_msg_local; + +int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, + u8 *prefix_len); +void smc_conn_free(struct smc_connection *conn); +int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, + struct smc_ib_device *smcibdev, u8 ibport, + struct smc_clc_msg_local *lcl, int srv_first_contact); +struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); +void smc_close_non_accepted(struct sock *sk); + +#endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c new file mode 100644 index 000000000000..5a339493872e --- /dev/null +++ b/net/smc/smc_cdc.c @@ -0,0 +1,304 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Connection Data Control (CDC) + * handles flow control + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/spinlock.h> + +#include "smc.h" +#include "smc_wr.h" +#include "smc_cdc.h" +#include "smc_tx.h" +#include "smc_rx.h" +#include "smc_close.h" + +/********************************** send *************************************/ + +struct smc_cdc_tx_pend { + struct smc_connection *conn; /* socket connection */ + union smc_host_cursor cursor; /* tx sndbuf cursor sent */ + union smc_host_cursor p_cursor; /* rx RMBE cursor produced */ + u16 ctrl_seq; /* conn. 
tx sequence # */ +}; + +/* handler for send/transmission completion of a CDC msg */ +static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, + struct smc_link *link, + enum ib_wc_status wc_status) +{ + struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; + struct smc_sock *smc; + int diff; + + if (!cdcpend->conn) + /* already dismissed */ + return; + + smc = container_of(cdcpend->conn, struct smc_sock, conn); + bh_lock_sock(&smc->sk); + if (!wc_status) { + diff = smc_curs_diff(cdcpend->conn->sndbuf_size, + &cdcpend->conn->tx_curs_fin, + &cdcpend->cursor); + /* sndbuf_space is decreased in smc_sendmsg */ + smp_mb__before_atomic(); + atomic_add(diff, &cdcpend->conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + smp_mb__after_atomic(); + smc_curs_write(&cdcpend->conn->tx_curs_fin, + smc_curs_read(&cdcpend->cursor, cdcpend->conn), + cdcpend->conn); + } + smc_tx_sndbuf_nonfull(smc); + if (smc->sk.sk_state != SMC_ACTIVE) + /* wake up smc_close_wait_tx_pends() */ + smc->sk.sk_state_change(&smc->sk); + bh_unlock_sock(&smc->sk); +} + +int smc_cdc_get_free_slot(struct smc_link *link, + struct smc_wr_buf **wr_buf, + struct smc_cdc_tx_pend **pend) +{ + return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, + (struct smc_wr_tx_pend_priv **)pend); +} + +static inline void smc_cdc_add_pending_send(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend) +{ + BUILD_BUG_ON_MSG( + sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, + "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); + BUILD_BUG_ON_MSG( + offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, + "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); + BUILD_BUG_ON_MSG( + sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, + "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)"); + pend->conn = conn; + pend->cursor = conn->tx_curs_sent; + pend->p_cursor = conn->local_tx_ctrl.prod; + pend->ctrl_seq = conn->tx_cdc_seq; +} + +int smc_cdc_msg_send(struct smc_connection *conn, + struct smc_wr_buf *wr_buf, + struct smc_cdc_tx_pend *pend) +{ + struct smc_link *link; + int rc; + + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + + smc_cdc_add_pending_send(conn, pend); + + conn->tx_cdc_seq++; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, + &conn->local_tx_ctrl, conn); + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + if (!rc) + smc_curs_write(&conn->rx_curs_confirmed, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + + return rc; +} + +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +{ + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, + &pend); + if (rc) + return rc; + + return smc_cdc_msg_send(conn, wr_buf, pend); +} + +static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, + unsigned long data) +{ + struct smc_connection *conn = (struct smc_connection *)data; + struct smc_cdc_tx_pend *cdc_pend = + (struct smc_cdc_tx_pend *)tx_pend; + + return cdc_pend->conn == conn; +} + +static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) +{ + struct smc_cdc_tx_pend *cdc_pend = + (struct smc_cdc_tx_pend *)tx_pend; + + cdc_pend->conn = NULL; +} + +void 
smc_cdc_tx_dismiss_slots(struct smc_connection *conn) +{ + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + + smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, + smc_cdc_tx_filter, smc_cdc_tx_dismisser, + (unsigned long)conn); +} + +bool smc_cdc_tx_has_pending(struct smc_connection *conn) +{ + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + + return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE, + smc_cdc_tx_filter, (unsigned long)conn); +} + +/********************************* receive ***********************************/ + +static inline bool smc_cdc_before(u16 seq1, u16 seq2) +{ + return (s16)(seq1 - seq2) < 0; +} + +static void smc_cdc_msg_recv_action(struct smc_sock *smc, + struct smc_link *link, + struct smc_cdc_msg *cdc) +{ + union smc_host_cursor cons_old, prod_old; + struct smc_connection *conn = &smc->conn; + int diff_cons, diff_prod; + + if (!cdc->prod_flags.failover_validation) { + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + } + smc_curs_write(&prod_old, + smc_curs_read(&conn->local_rx_ctrl.prod, conn), + conn); + smc_curs_write(&cons_old, + smc_curs_read(&conn->local_rx_ctrl.cons, conn), + conn); + smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn); + + diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old, + &conn->local_rx_ctrl.cons); + if (diff_cons) { + /* peer_rmbe_space is decreased during data transfer with RDMA + * write + */ + smp_mb__before_atomic(); + atomic_add(diff_cons, &conn->peer_rmbe_space); + /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ + smp_mb__after_atomic(); + } + + diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old, + &conn->local_rx_ctrl.prod); + if (diff_prod) { + /* bytes_to_rcv is decreased in smc_recvmsg */ + smp_mb__before_atomic(); + atomic_add(diff_prod, &conn->bytes_to_rcv); + /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + smp_mb__after_atomic(); + smc->sk.sk_data_ready(&smc->sk); + } + + if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + smc->sk.sk_err = ECONNRESET; + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + } + if (smc_cdc_rxed_any_close_or_senddone(conn)) + smc_close_passive_received(smc); + + /* piggy backed tx info */ + /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ + if (diff_cons && smc_tx_prepared_sends(conn)) { + smc_tx_sndbuf_nonempty(conn); + /* trigger socket release if connection closed */ + smc_close_wake_tx_prepared(smc); + } + + /* subsequent patch: trigger socket release if connection closed */ + + /* socket connected but not accepted */ + if (!smc->sk.sk_socket) + return; + + /* data available */ + if ((conn->local_rx_ctrl.prod_flags.write_blocked) || + (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) + smc_tx_consumer_update(conn); +} + +/* called under tasklet context */ +static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc, + struct smc_link *link, u64 wr_id) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + struct smc_connection *connection; + struct smc_sock *smc; + + /* lookup connection */ + read_lock_bh(&lgr->conns_lock); + connection = smc_lgr_find_conn(ntohl(cdc->token), lgr); + if (!connection) { + read_unlock_bh(&lgr->conns_lock); + return; + } + smc = container_of(connection, struct smc_sock, conn); + sock_hold(&smc->sk); + read_unlock_bh(&lgr->conns_lock); + bh_lock_sock(&smc->sk); + smc_cdc_msg_recv_action(smc, link, cdc); + bh_unlock_sock(&smc->sk); + sock_put(&smc->sk); /* no free sk in 
softirq-context */ +} + +/***************************** init, exit, misc ******************************/ + +static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_cdc_msg *cdc = buf; + + if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved)) + return; /* short message */ + if (cdc->len != sizeof(*cdc)) + return; /* invalid message */ + smc_cdc_msg_recv(cdc, link, wc->wr_id); +} + +static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = { + { + .handler = smc_cdc_rx_handler, + .type = SMC_CDC_MSG_TYPE + }, + { + .handler = NULL, + } +}; + +int __init smc_cdc_init(void) +{ + struct smc_wr_rx_handler *handler; + int rc = 0; + + for (handler = smc_cdc_rx_handlers; handler->handler; handler++) { + INIT_HLIST_NODE(&handler->list); + rc = smc_wr_rx_register_handler(handler); + if (rc) + break; + } + return rc; +} diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h new file mode 100644 index 000000000000..8e1d76f26007 --- /dev/null +++ b/net/smc/smc_cdc.h @@ -0,0 +1,218 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Connection Data Control (CDC) + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_CDC_H +#define SMC_CDC_H + +#include <linux/kernel.h> /* max_t */ +#include <linux/atomic.h> +#include <linux/in.h> +#include <linux/compiler.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_wr.h" + +#define SMC_CDC_MSG_TYPE 0xFE + +/* in network byte order */ +union smc_cdc_cursor { /* SMC cursor */ + struct { + __be16 reserved; + __be16 wrap; + __be32 count; + }; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); + +/* in network byte order */ +struct smc_cdc_msg { + struct smc_wr_rx_hdr common; /* .type = 0xFE */ + u8 len; /* 44 */ + __be16 seqno; + __be32 token; + union smc_cdc_cursor prod; + union smc_cdc_cursor cons; /* piggy backed "ack" */ + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + u8 reserved[18]; +} __aligned(8); + +static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) +{ + return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort || + conn->local_rx_ctrl.conn_state_flags.peer_conn_closed; +} + +static inline bool smc_cdc_rxed_any_close_or_senddone( + struct smc_connection *conn) +{ + return smc_cdc_rxed_any_close(conn) || + conn->local_rx_ctrl.conn_state_flags.peer_done_writing; +} + +static inline void smc_curs_add(int size, union smc_host_cursor *curs, + int value) +{ + curs->count += value; + if (curs->count >= size) { + curs->wrap++; + curs->count -= size; + } +} + +/* SMC cursors are 8 bytes long and require atomic reading and writing */ +static inline u64 smc_curs_read(union smc_host_cursor *curs, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&conn->acurs_lock, flags); + ret = curs->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); + return ret; +#else + return atomic64_read(&curs->acurs); +#endif +} + +static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&conn->acurs_lock, flags); + ret = curs->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); + return ret; +#else + return atomic64_read(&curs->acurs); +#endif +} + 
+static inline void smc_curs_write(union smc_host_cursor *curs, u64 val, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + curs->acurs = val; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&curs->acurs, val); +#endif +} + +static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + curs->acurs = val; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&curs->acurs, val); +#endif +} + +/* calculate cursor difference between old and new, where old <= new */ +static inline int smc_curs_diff(unsigned int size, + union smc_host_cursor *old, + union smc_host_cursor *new) +{ + if (old->wrap != new->wrap) + return max_t(int, 0, + ((size - old->count) + new->count)); + + return max_t(int, 0, (new->count - old->count)); +} + +static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, + union smc_host_cursor *local, + struct smc_connection *conn) +{ + union smc_host_cursor temp; + + smc_curs_write(&temp, smc_curs_read(local, conn), conn); + peer->count = htonl(temp.count); + peer->wrap = htons(temp.wrap); + /* peer->reserved = htons(0); must be ensured by caller */ +} + +static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer, + struct smc_host_cdc_msg *local, + struct smc_connection *conn) +{ + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(local->seqno); + peer->token = htonl(local->token); + smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn); + smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn); + peer->prod_flags = local->prod_flags; + peer->conn_state_flags = local->conn_state_flags; +} + +static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, + union smc_cdc_cursor *peer, + struct smc_connection *conn) +{ + union smc_host_cursor temp, old; + union smc_cdc_cursor net; + + smc_curs_write(&old, smc_curs_read(local, conn), conn); + smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn); + temp.count = ntohl(net.count); + temp.wrap = ntohs(net.wrap); + if ((old.wrap > temp.wrap) && temp.wrap) + return; + if ((old.wrap == temp.wrap) && + (old.count > temp.count)) + return; + smc_curs_write(local, smc_curs_read(&temp, conn), conn); +} + +static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) +{ + local->common.type = peer->common.type; + local->len = peer->len; + local->seqno = ntohs(peer->seqno); + local->token = ntohl(peer->token); + smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn); + smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn); + local->prod_flags = peer->prod_flags; + local->conn_state_flags = peer->conn_state_flags; +} + +struct smc_cdc_tx_pend; + +int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf, + struct smc_cdc_tx_pend **pend); +void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); +int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, + struct smc_cdc_tx_pend *pend); +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); +bool smc_cdc_tx_has_pending(struct smc_connection *conn); +int smc_cdc_init(void) __init; + +#endif /* SMC_CDC_H */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c new file mode 100644 index 000000000000..cc6b6f8651eb --- /dev/null +++ 
b/net/smc/smc_clc.c @@ -0,0 +1,280 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * CLC (connection layer control) handshake over initial TCP socket to + * prepare for RDMA traffic + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/in.h> +#include <linux/if_ether.h> +#include <net/sock.h> +#include <net/tcp.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_clc.h" +#include "smc_ib.h" + +/* Wait for data on the tcp-socket, analyze received data + * Returns: + * 0 if success and it was not a decline that we received. + * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send. + * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. + */ +int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, + u8 expected_type) +{ + struct sock *clc_sk = smc->clcsock->sk; + struct smc_clc_msg_hdr *clcm = buf; + struct msghdr msg = {NULL, 0}; + int reason_code = 0; + struct kvec vec; + int len, datlen; + int krflags; + + /* peek the first few bytes to determine length of data to receive + * so we don't consume any subsequent CLC message or payload data + * in the TCP byte stream + */ + vec.iov_base = buf; + vec.iov_len = buflen; + krflags = MSG_PEEK | MSG_WAITALL; + smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, + sizeof(struct smc_clc_msg_hdr), krflags); + if (signal_pending(current)) { + reason_code = -EINTR; + clc_sk->sk_err = EINTR; + smc->sk.sk_err = EINTR; + goto out; + } + if (clc_sk->sk_err) { + reason_code = -clc_sk->sk_err; + smc->sk.sk_err = clc_sk->sk_err; + goto out; + } + if (!len) { /* peer has performed orderly shutdown */ + smc->sk.sk_err = ECONNRESET; + reason_code = -ECONNRESET; + goto out; + } + if (len < 0) { + smc->sk.sk_err = -len; + reason_code = len; + goto out; + } + datlen = ntohs(clcm->length); + if ((len < sizeof(struct smc_clc_msg_hdr)) || + (datlen < sizeof(struct smc_clc_msg_decline)) || + (datlen > sizeof(struct smc_clc_msg_accept_confirm)) || + memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) || + ((clcm->type != SMC_CLC_DECLINE) && + (clcm->type != expected_type))) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } + + /* receive the complete CLC message */ + vec.iov_base = buf; + vec.iov_len = buflen; + memset(&msg, 0, sizeof(struct msghdr)); + krflags = MSG_WAITALL; + smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags); + if (len < datlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } + if (clcm->type == SMC_CLC_DECLINE) { + reason_code = SMC_CLC_DECL_REPLY; + if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis) + == SMC_CLC_DECL_SYNCERR) + smc->conn.lgr->sync_err = true; + } + +out: + return reason_code; +} + +/* send CLC DECLINE message across internal TCP socket */ +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, + u8 out_of_sync) +{ + struct smc_clc_msg_decline dclc; + struct msghdr msg; + struct kvec vec; + int len; + + memset(&dclc, 0, sizeof(dclc)); + memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + dclc.hdr.type = SMC_CLC_DECLINE; + dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); + dclc.hdr.version = SMC_CLC_V1; + dclc.hdr.flag = out_of_sync ? 
1 : 0; + memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); + dclc.peer_diagnosis = htonl(peer_diag_info); + memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &dclc; + vec.iov_len = sizeof(struct smc_clc_msg_decline); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, + sizeof(struct smc_clc_msg_decline)); + if (len < sizeof(struct smc_clc_msg_decline)) + smc->sk.sk_err = EPROTO; + if (len < 0) + smc->sk.sk_err = -len; + return len; +} + +/* send CLC PROPOSAL message across internal TCP socket */ +int smc_clc_send_proposal(struct smc_sock *smc, + struct smc_ib_device *smcibdev, + u8 ibport) +{ + struct smc_clc_msg_proposal pclc; + int reason_code = 0; + struct msghdr msg; + struct kvec vec; + int len, rc; + + /* send SMC Proposal CLC message */ + memset(&pclc, 0, sizeof(pclc)); + memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + pclc.hdr.type = SMC_CLC_PROPOSAL; + pclc.hdr.length = htons(sizeof(pclc)); + pclc.hdr.version = SMC_CLC_V1; /* SMC version */ + memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); + + /* determine subnet and mask from internal TCP socket */ + rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet, + &pclc.prefix_len); + if (rc) + return SMC_CLC_DECL_CNFERR; /* configuration error */ + memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &pclc; + vec.iov_len = sizeof(pclc); + /* due to the few bytes needed for clc-handshake this cannot block */ + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc)); + if (len < sizeof(pclc)) { + if (len >= 0) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; + } else { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } + } + + return reason_code; +} + +/* send CLC CONFIRM message across internal TCP socket */ +int smc_clc_send_confirm(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smc_clc_msg_accept_confirm cclc; + struct smc_link *link; + int reason_code = 0; + struct msghdr msg; + struct kvec vec; + int len; + + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + /* send SMC Confirm CLC msg */ + memset(&cclc, 0, sizeof(cclc)); + memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + cclc.hdr.type = SMC_CLC_CONFIRM; + cclc.hdr.length = htons(sizeof(cclc)); + cclc.hdr.version = SMC_CLC_V1; /* SMC version */ + memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); + memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); + hton24(cclc.qpn, link->roce_qp->qp_num); + cclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ + cclc.rmbe_alert_token = htonl(conn->alert_token_local); + cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); + cclc.rmbe_size = conn->rmbe_size_short; + cclc.rmb_dma_addr = + cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); + hton24(cclc.psn, link->psn_initial); + + memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &cclc; + vec.iov_len = sizeof(cclc); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc)); + if (len < sizeof(cclc)) { + 
if (len >= 0) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; + } else { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } + } + return reason_code; +} + +/* send CLC ACCEPT message across internal TCP socket */ +int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) +{ + struct smc_connection *conn = &new_smc->conn; + struct smc_clc_msg_accept_confirm aclc; + struct smc_link *link; + struct msghdr msg; + struct kvec vec; + int rc = 0; + int len; + + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + memset(&aclc, 0, sizeof(aclc)); + memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + aclc.hdr.type = SMC_CLC_ACCEPT; + aclc.hdr.length = htons(sizeof(aclc)); + aclc.hdr.version = SMC_CLC_V1; /* SMC version */ + if (srv_first_contact) + aclc.hdr.flag = 1; + memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); + memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); + hton24(aclc.qpn, link->roce_qp->qp_num); + aclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ + aclc.rmbe_alert_token = htonl(conn->alert_token_local); + aclc.qp_mtu = link->path_mtu; + aclc.rmbe_size = conn->rmbe_size_short, + aclc.rmb_dma_addr = + cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); + hton24(aclc.psn, link->psn_initial); + memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &aclc; + vec.iov_len = sizeof(aclc); + len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc)); + if (len < sizeof(aclc)) { + if (len >= 0) + new_smc->sk.sk_err = EPROTO; + else + new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err; + rc = sock_error(&new_smc->sk); + } + + return rc; +} diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h new file mode 100644 index 000000000000..13db8ce177c9 --- /dev/null +++ b/net/smc/smc_clc.h @@ -0,0 +1,116 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * CLC (connection layer control) handshake over initial TCP socket to + * prepare for RDMA traffic + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_CLC_H +#define _SMC_CLC_H + +#include <rdma/ib_verbs.h> + +#include "smc.h" + +#define SMC_CLC_PROPOSAL 0x01 +#define SMC_CLC_ACCEPT 0x02 +#define SMC_CLC_CONFIRM 0x03 +#define SMC_CLC_DECLINE 0x04 + +/* eye catcher "SMCR" EBCDIC for CLC messages */ +static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; + +#define SMC_CLC_V1 0x1 /* SMC version */ +#define CLC_WAIT_TIME (6 * HZ) /* max. 
wait time on clcsock */ +#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ +#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */ +#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ +#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */ +#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ +#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */ +#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ +#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */ +#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */ + +struct smc_clc_msg_hdr { /* header1 of clc messages */ + u8 eyecatcher[4]; /* eye catcher */ + u8 type; /* proposal / accept / confirm / decline */ + __be16 length; +#if defined(__BIG_ENDIAN_BITFIELD) + u8 version : 4, + flag : 1, + rsvd : 3; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 rsvd : 3, + flag : 1, + version : 4; +#endif +} __packed; /* format defined in RFC7609 */ + +struct smc_clc_msg_trail { /* trailer of clc messages */ + u8 eyecatcher[4]; +}; + +struct smc_clc_msg_local { /* header2 of clc messages */ + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */ + u8 gid[16]; /* gid of ib_device port */ + u8 mac[6]; /* mac of ib_device port */ +}; + +struct smc_clc_msg_proposal { /* clc proposal message */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + __be16 iparea_offset; /* offset to IP address information area */ + __be32 outgoing_subnet; /* subnet mask */ + u8 prefix_len; /* number of significant bits in mask */ + u8 reserved[2]; + u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __aligned(4); + +struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 conn_idx; /* Connection index, which RMBE in RMB */ + __be32 rmbe_alert_token;/* unique connection id */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ + qp_mtu : 4; /* QP mtu */ +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + rmbe_size : 4; +#endif + u8 reserved; + __be64 rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* initial packet sequence number */ + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __packed; /* format defined in RFC7609 */ + +struct smc_clc_msg_decline { /* clc decline message */ + struct smc_clc_msg_hdr hdr; + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ + __be32 peer_diagnosis; /* diagnosis information */ + u8 reserved2[4]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __aligned(4); + +struct smc_sock; +struct smc_ib_device; + +int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, + u8 expected_type); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, + u8 out_of_sync); +int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, + u8 ibport); +int smc_clc_send_confirm(struct smc_sock *smc); +int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); + +#endif diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c new file mode 100644 index 000000000000..03dfcc6b7661 --- /dev/null +++ b/net/smc/smc_close.c @@ -0,0 +1,442 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Socket Closing - normal and abnormal + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/workqueue.h> +#include <net/sock.h> + +#include "smc.h" +#include "smc_tx.h" +#include "smc_cdc.h" +#include "smc_close.h" + +#define SMC_CLOSE_WAIT_TX_PENDS_TIME (5 * HZ) + +static void smc_close_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + /* Close non-accepted connections */ + while ((sk = smc_accept_dequeue(parent, NULL))) + smc_close_non_accepted(sk); +} + +static void smc_close_wait_tx_pends(struct smc_sock *smc) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct sock *sk = &smc->sk; + signed long timeout; + + timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME; + add_wait_queue(sk_sleep(sk), &wait); + while (!signal_pending(current) && timeout) { + int rc; + + rc = sk_wait_event(sk, &timeout, + !smc_cdc_tx_has_pending(&smc->conn), + &wait); + if (rc) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); +} + +/* wait for sndbuf data being transmitted */ +static void smc_close_stream_wait(struct smc_sock *smc, long timeout) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct sock *sk = &smc->sk; + + if (!timeout) + return; + + if (!smc_tx_prepared_sends(&smc->conn)) + return; + + smc->wait_close_tx_prepared = 1; + add_wait_queue(sk_sleep(sk), &wait); + while (!signal_pending(current) && timeout) { + int rc; + + rc = sk_wait_event(sk, &timeout, + !smc_tx_prepared_sends(&smc->conn) || + (sk->sk_err == ECONNABORTED) || + (sk->sk_err == ECONNRESET), + &wait); + if (rc) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); + smc->wait_close_tx_prepared = 0; +} + +void smc_close_wake_tx_prepared(struct smc_sock *smc) +{ + if (smc->wait_close_tx_prepared) + /* wake up socket closing */ + smc->sk.sk_state_change(&smc->sk); +} + +static int smc_close_wr(struct smc_connection *conn) +{ + conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +static int smc_close_final(struct smc_connection *conn) +{ + if (atomic_read(&conn->bytes_to_rcv)) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + else + conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +static int smc_close_abort(struct smc_connection *conn) +{ + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +/* terminate smc socket abnormally - active abort + * RDMA communication no longer possible + */ +void smc_close_active_abort(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *txflags = + &smc->conn.local_tx_ctrl.conn_state_flags; + + bh_lock_sock(&smc->sk); + smc->sk.sk_err = ECONNABORTED; + if (smc->clcsock && smc->clcsock->sk) { + smc->clcsock->sk->sk_err = ECONNABORTED; + smc->clcsock->sk->sk_state_change(smc->clcsock->sk); + } + switch (smc->sk.sk_state) { + case SMC_INIT: + smc->sk.sk_state = SMC_PEERABORTWAIT; + break; + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + txflags->peer_conn_abort = 1; + sock_release(smc->clcsock); + if (!smc_cdc_rxed_any_close(&smc->conn)) + smc->sk.sk_state = SMC_PEERABORTWAIT; + else + smc->sk.sk_state = SMC_CLOSED; + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + if (!txflags->peer_conn_closed) { + smc->sk.sk_state = SMC_PEERABORTWAIT; + txflags->peer_conn_abort = 1; + sock_release(smc->clcsock); + } else { + smc->sk.sk_state = SMC_CLOSED; + } + break; + case SMC_PROCESSABORT: + case SMC_APPFINCLOSEWAIT: + if (!txflags->peer_conn_closed) { + txflags->peer_conn_abort = 1; + 
sock_release(smc->clcsock); + } + smc->sk.sk_state = SMC_CLOSED; + break; + case SMC_PEERFINCLOSEWAIT: + case SMC_PEERABORTWAIT: + case SMC_CLOSED: + break; + } + + sock_set_flag(&smc->sk, SOCK_DEAD); + bh_unlock_sock(&smc->sk); + smc->sk.sk_state_change(&smc->sk); +} + +int smc_close_active(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *txflags = + &smc->conn.local_tx_ctrl.conn_state_flags; + long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + int old_state; + int rc = 0; + + if (sock_flag(sk, SOCK_LINGER) && + !(current->flags & PF_EXITING)) + timeout = sk->sk_lingertime; + +again: + old_state = sk->sk_state; + switch (old_state) { + case SMC_INIT: + sk->sk_state = SMC_CLOSED; + if (smc->smc_listen_work.func) + flush_work(&smc->smc_listen_work); + sock_put(sk); + break; + case SMC_LISTEN: + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); /* wake up accept */ + if (smc->clcsock && smc->clcsock->sk) { + rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + /* wake up kernel_accept of smc_tcp_listen_worker */ + smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); + } + release_sock(sk); + smc_close_cleanup_listen(sk); + flush_work(&smc->tcp_listen_work); + lock_sock(sk); + break; + case SMC_ACTIVE: + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_work_sync(&conn->tx_work); + lock_sock(sk); + if (sk->sk_state == SMC_ACTIVE) { + /* send close request */ + rc = smc_close_final(conn); + sk->sk_state = SMC_PEERCLOSEWAIT1; + } else { + /* peer event has changed the state */ + goto again; + } + break; + case SMC_APPFINCLOSEWAIT: + /* socket already shutdown wr or both (active close) */ + if (txflags->peer_done_writing && + !txflags->peer_conn_closed) { + /* just shutdown wr done, send close request */ + rc = smc_close_final(conn); + } + sk->sk_state = SMC_CLOSED; + smc_close_wait_tx_pends(smc); + break; + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + if (!smc_cdc_rxed_any_close(conn)) + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_work_sync(&conn->tx_work); + lock_sock(sk); + if (sk->sk_err != ECONNABORTED) { + /* confirm close from peer */ + rc = smc_close_final(conn); + if (rc) + break; + } + if (smc_cdc_rxed_any_close(conn)) + /* peer has closed the socket already */ + sk->sk_state = SMC_CLOSED; + else + /* peer has just issued a shutdown write */ + sk->sk_state = SMC_PEERFINCLOSEWAIT; + smc_close_wait_tx_pends(smc); + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + /* peer sending PeerConnectionClosed will cause transition */ + break; + case SMC_PROCESSABORT: + cancel_work_sync(&conn->tx_work); + smc_close_abort(conn); + sk->sk_state = SMC_CLOSED; + smc_close_wait_tx_pends(smc); + break; + case SMC_PEERABORTWAIT: + case SMC_CLOSED: + /* nothing to do, add tracing in future patch */ + break; + } + + if (old_state != sk->sk_state) + sk->sk_state_change(&smc->sk); + return rc; +} + +static void smc_close_passive_abort_received(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *txflags = + &smc->conn.local_tx_ctrl.conn_state_flags; + struct sock *sk = &smc->sk; + + switch (sk->sk_state) { + case SMC_ACTIVE: + case SMC_APPFINCLOSEWAIT: + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + smc_close_abort(&smc->conn); + sk->sk_state = SMC_PROCESSABORT; + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + if (txflags->peer_done_writing && + !txflags->peer_conn_closed) { + /* just shutdown, but not yet closed 
locally */ + smc_close_abort(&smc->conn); + sk->sk_state = SMC_PROCESSABORT; + } else { + sk->sk_state = SMC_CLOSED; + } + break; + case SMC_PEERFINCLOSEWAIT: + case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; + case SMC_INIT: + case SMC_PROCESSABORT: + /* nothing to do, add tracing in future patch */ + break; + } +} + +/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort, + * or peer_done_writing. + * Called under tasklet context. + */ +void smc_close_passive_received(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *rxflags = + &smc->conn.local_rx_ctrl.conn_state_flags; + struct sock *sk = &smc->sk; + int old_state; + + sk->sk_shutdown |= RCV_SHUTDOWN; + if (smc->clcsock && smc->clcsock->sk) + smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(&smc->sk, SOCK_DONE); + + old_state = sk->sk_state; + + if (rxflags->peer_conn_abort) { + smc_close_passive_abort_received(smc); + goto wakeup; + } + + switch (sk->sk_state) { + case SMC_INIT: + if (atomic_read(&smc->conn.bytes_to_rcv) || + (rxflags->peer_done_writing && + !rxflags->peer_conn_closed)) + sk->sk_state = SMC_APPCLOSEWAIT1; + else + sk->sk_state = SMC_CLOSED; + break; + case SMC_ACTIVE: + sk->sk_state = SMC_APPCLOSEWAIT1; + break; + case SMC_PEERCLOSEWAIT1: + if (rxflags->peer_done_writing) + sk->sk_state = SMC_PEERCLOSEWAIT2; + /* fall through to check for closing */ + case SMC_PEERCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + if (!smc_cdc_rxed_any_close(&smc->conn)) + break; + if (sock_flag(sk, SOCK_DEAD) && + (sk->sk_shutdown == SHUTDOWN_MASK)) { + /* smc_release has already been called locally */ + sk->sk_state = SMC_CLOSED; + } else { + /* just shutdown, but not yet closed locally */ + sk->sk_state = SMC_APPFINCLOSEWAIT; + } + break; + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + case SMC_APPFINCLOSEWAIT: + case SMC_PEERABORTWAIT: + case SMC_PROCESSABORT: + case SMC_CLOSED: + /* nothing to do, add tracing in future patch */ + break; + } + +wakeup: + if (old_state != sk->sk_state) + sk->sk_state_change(sk); + sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */ + sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */ + + if ((sk->sk_state == SMC_CLOSED) && + (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) { + smc_conn_free(&smc->conn); + schedule_delayed_work(&smc->sock_put_work, + SMC_CLOSE_SOCK_PUT_DELAY); + } +} + +void smc_close_sock_put_work(struct work_struct *work) +{ + struct smc_sock *smc = container_of(to_delayed_work(work), + struct smc_sock, + sock_put_work); + + smc->sk.sk_prot->unhash(&smc->sk); + sock_put(&smc->sk); +} + +int smc_close_shutdown_write(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; + struct sock *sk = &smc->sk; + int old_state; + int rc = 0; + + if (sock_flag(sk, SOCK_LINGER)) + timeout = sk->sk_lingertime; + +again: + old_state = sk->sk_state; + switch (old_state) { + case SMC_ACTIVE: + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_work_sync(&conn->tx_work); + lock_sock(sk); + /* send close wr request */ + rc = smc_close_wr(conn); + if (sk->sk_state == SMC_ACTIVE) + sk->sk_state = SMC_PEERCLOSEWAIT1; + else + goto again; + break; + case SMC_APPCLOSEWAIT1: + /* passive close */ + if (!smc_cdc_rxed_any_close(conn)) + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_work_sync(&conn->tx_work); + lock_sock(sk); + /* confirm close from peer */ + rc = smc_close_wr(conn); + sk->sk_state = SMC_APPCLOSEWAIT2; + break; + case 
SMC_APPCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + case SMC_APPFINCLOSEWAIT: + case SMC_PROCESSABORT: + case SMC_PEERABORTWAIT: + /* nothing to do, add tracing in future patch */ + break; + } + + if (old_state != sk->sk_state) + sk->sk_state_change(&smc->sk); + return rc; +} diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h new file mode 100644 index 000000000000..bc9a2df3633c --- /dev/null +++ b/net/smc/smc_close.h @@ -0,0 +1,28 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Socket Closing + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_CLOSE_H +#define SMC_CLOSE_H + +#include <linux/workqueue.h> + +#include "smc.h" + +#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ) +#define SMC_CLOSE_SOCK_PUT_DELAY HZ + +void smc_close_wake_tx_prepared(struct smc_sock *smc); +void smc_close_active_abort(struct smc_sock *smc); +int smc_close_active(struct smc_sock *smc); +void smc_close_passive_received(struct smc_sock *smc); +void smc_close_sock_put_work(struct work_struct *work); +int smc_close_shutdown_write(struct smc_sock *smc); + +#endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c new file mode 100644 index 000000000000..0eac633fb354 --- /dev/null +++ b/net/smc/smc_core.c @@ -0,0 +1,682 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Basic Transport Functions exploiting Infiniband API + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/socket.h> +#include <linux/if_vlan.h> +#include <linux/random.h> +#include <linux/workqueue.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <rdma/ib_verbs.h> + +#include "smc.h" +#include "smc_clc.h" +#include "smc_core.h" +#include "smc_ib.h" +#include "smc_wr.h" +#include "smc_llc.h" +#include "smc_cdc.h" +#include "smc_close.h" + +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY (600 * HZ) + +static u32 smc_lgr_num; /* unique link group number */ + +/* Register connection's alert token in our lookup structure. + * To use rbtrees we have to implement our own insert core. + * Requires @conns_lock + * @smc connection to register + * Returns 0 on success, != otherwise. + */ +static void smc_lgr_add_alert_token(struct smc_connection *conn) +{ + struct rb_node **link, *parent = NULL; + u32 token = conn->alert_token_local; + + link = &conn->lgr->conns_all.rb_node; + while (*link) { + struct smc_connection *cur = rb_entry(*link, + struct smc_connection, alert_node); + + parent = *link; + if (cur->alert_token_local > token) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + /* Put the new node there */ + rb_link_node(&conn->alert_node, parent, link); + rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); +} + +/* Register connection in link group by assigning an alert token + * registered in a search tree. + * Requires @conns_lock + * Note that '0' is a reserved value and not assigned. 
+ */ +static void smc_lgr_register_conn(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + static atomic_t nexttoken = ATOMIC_INIT(0); + + /* find a new alert_token_local value not yet used by some connection + * in this link group + */ + sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */ + while (!conn->alert_token_local) { + conn->alert_token_local = atomic_inc_return(&nexttoken); + if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr)) + conn->alert_token_local = 0; + } + smc_lgr_add_alert_token(conn); + conn->lgr->conns_num++; +} + +/* Unregister connection and reset the alert token of the given connection< + */ +static void __smc_lgr_unregister_conn(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + struct smc_link_group *lgr = conn->lgr; + + rb_erase(&conn->alert_node, &lgr->conns_all); + lgr->conns_num--; + conn->alert_token_local = 0; + conn->lgr = NULL; + sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ +} + +/* Unregister connection and trigger lgr freeing if applicable + */ +static void smc_lgr_unregister_conn(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + int reduced = 0; + + write_lock_bh(&lgr->conns_lock); + if (conn->alert_token_local) { + reduced = 1; + __smc_lgr_unregister_conn(conn); + } + write_unlock_bh(&lgr->conns_lock); + if (reduced && !lgr->conns_num) + schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY); +} + +static void smc_lgr_free_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(to_delayed_work(work), + struct smc_link_group, + free_work); + bool conns; + + spin_lock_bh(&smc_lgr_list.lock); + read_lock_bh(&lgr->conns_lock); + conns = RB_EMPTY_ROOT(&lgr->conns_all); + read_unlock_bh(&lgr->conns_lock); + if (!conns) { /* number of lgr connections is no longer zero */ + spin_unlock_bh(&smc_lgr_list.lock); + return; + } + list_del_init(&lgr->list); /* remove from smc_lgr_list */ + spin_unlock_bh(&smc_lgr_list.lock); + smc_lgr_free(lgr); +} + +/* create a new SMC link group */ +static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, + struct smc_ib_device *smcibdev, u8 ibport, + char *peer_systemid, unsigned short vlan_id) +{ + struct smc_link_group *lgr; + struct smc_link *lnk; + u8 rndvec[3]; + int rc = 0; + int i; + + lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); + if (!lgr) { + rc = -ENOMEM; + goto out; + } + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + lgr->sync_err = false; + lgr->daddr = peer_in_addr; + memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + lgr->vlan_id = vlan_id; + rwlock_init(&lgr->sndbufs_lock); + rwlock_init(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + INIT_LIST_HEAD(&lgr->sndbufs[i]); + INIT_LIST_HEAD(&lgr->rmbs[i]); + } + smc_lgr_num += SMC_LGR_NUM_INCR; + memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE); + INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); + lgr->conns_all = RB_ROOT; + + lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* initialize link */ + lnk->smcibdev = smcibdev; + lnk->ibport = ibport; + lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; + if (!smcibdev->initialized) + smc_ib_setup_per_ibdev(smcibdev); + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto free_lgr; + init_waitqueue_head(&lnk->wr_tx_wait); + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + init_completion(&lnk->llc_confirm); + init_completion(&lnk->llc_confirm_resp); + + smc->conn.lgr = lgr; + rwlock_init(&lgr->conns_lock); + spin_lock_bh(&smc_lgr_list.lock); + list_add(&lgr->list, &smc_lgr_list.list); + spin_unlock_bh(&smc_lgr_list.lock); + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +free_lgr: + kfree(lgr); +out: + return rc; +} + +static void smc_sndbuf_unuse(struct smc_connection *conn) +{ + if (conn->sndbuf_desc) { + conn->sndbuf_desc->used = 0; + conn->sndbuf_size = 0; + } +} + +static void smc_rmb_unuse(struct smc_connection *conn) +{ + if (conn->rmb_desc) { + conn->rmb_desc->used = 0; + conn->rmbe_size = 0; + } +} + +/* remove a finished connection from its link group */ +void smc_conn_free(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + if (!lgr) + return; + smc_cdc_tx_dismiss_slots(conn); + smc_lgr_unregister_conn(conn); + smc_rmb_unuse(conn); + smc_sndbuf_unuse(conn); +} + +static void smc_link_clear(struct smc_link *lnk) +{ + lnk->peer_qpn = 0; + smc_ib_modify_qp_reset(lnk); + smc_wr_free_link(lnk); + smc_ib_destroy_queue_pair(lnk); + smc_ib_dealloc_protection_domain(lnk); + smc_wr_free_link_mem(lnk); +} + +static void smc_lgr_free_sndbufs(struct smc_link_group *lgr) +{ + struct smc_buf_desc *sndbuf_desc, *bf_desc; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i], + list) { + list_del(&sndbuf_desc->list); + smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + smc_uncompress_bufsize(i), + sndbuf_desc, DMA_TO_DEVICE); + kfree(sndbuf_desc->cpu_addr); + kfree(sndbuf_desc); + } + } +} + +static void smc_lgr_free_rmbs(struct smc_link_group *lgr) +{ + struct smc_buf_desc *rmb_desc, *bf_desc; + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i], + list) { + list_del(&rmb_desc->list); + smc_ib_buf_unmap(lnk->smcibdev, + smc_uncompress_bufsize(i), + rmb_desc, DMA_FROM_DEVICE); + kfree(rmb_desc->cpu_addr); + kfree(rmb_desc); + } + } +} + +/* remove a link group */ +void smc_lgr_free(struct smc_link_group *lgr) +{ + smc_lgr_free_rmbs(lgr); + smc_lgr_free_sndbufs(lgr); + 
smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + kfree(lgr); +} + +/* terminate linkgroup abnormally */ +void smc_lgr_terminate(struct smc_link_group *lgr) +{ + struct smc_connection *conn; + struct smc_sock *smc; + struct rb_node *node; + + spin_lock_bh(&smc_lgr_list.lock); + if (list_empty(&lgr->list)) { + /* termination already triggered */ + spin_unlock_bh(&smc_lgr_list.lock); + return; + } + /* do not use this link group for new connections */ + list_del_init(&lgr->list); + spin_unlock_bh(&smc_lgr_list.lock); + + write_lock_bh(&lgr->conns_lock); + node = rb_first(&lgr->conns_all); + while (node) { + conn = rb_entry(node, struct smc_connection, alert_node); + smc = container_of(conn, struct smc_sock, conn); + sock_hold(&smc->sk); + __smc_lgr_unregister_conn(conn); + smc_close_active_abort(smc); + sock_put(&smc->sk); + node = rb_first(&lgr->conns_all); + } + write_unlock_bh(&lgr->conns_lock); +} + +/* Determine vlan of internal TCP socket. + * @vlan_id: address to store the determined vlan id into + */ +static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + int rc = 0; + + *vlan_id = 0; + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + + if (is_vlan_dev(dst->dev)) + *vlan_id = vlan_dev_vlan_id(dst->dev); + +out_rel: + dst_release(dst); +out: + return rc; +} + +/* determine the link gid matching the vlan id of the link group */ +static int smc_link_determine_gid(struct smc_link_group *lgr) +{ + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + struct ib_gid_attr gattr; + union ib_gid gid; + int i; + + if (!lgr->vlan_id) { + lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1]; + return 0; + } + + for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len; + i++) { + if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid, + &gattr)) + continue; + if (gattr.ndev && + (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) { + lnk->gid = gid; + return 0; + } + } + return -ENODEV; +} + +/* create a new SMC connection (and a new link group if necessary) */ +int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, + struct smc_ib_device *smcibdev, u8 ibport, + struct smc_clc_msg_local *lcl, int srv_first_contact) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr; + unsigned short vlan_id; + enum smc_lgr_role role; + int local_contact = SMC_FIRST_CONTACT; + int rc = 0; + + role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id); + if (rc) + return rc; + + if ((role == SMC_CLNT) && srv_first_contact) + /* create new link group as well */ + goto create; + + /* determine if an existing link group can be reused */ + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry(lgr, &smc_lgr_list.list, list) { + write_lock_bh(&lgr->conns_lock); + if (!memcmp(lgr->peer_systemid, lcl->id_for_peer, + SMC_SYSTEMID_LEN) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, + SMC_GID_SIZE) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, + sizeof(lcl->mac)) && + !lgr->sync_err && + (lgr->role == role) && + (lgr->vlan_id == vlan_id) && + ((role == SMC_CLNT) || + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) { + /* link group found */ + local_contact = SMC_REUSE_CONTACT; + conn->lgr = lgr; + smc_lgr_register_conn(conn); /* add smc conn to lgr */ + write_unlock_bh(&lgr->conns_lock); + break; + } + write_unlock_bh(&lgr->conns_lock); + } + spin_unlock_bh(&smc_lgr_list.lock); + + if (role == SMC_CLNT && !srv_first_contact && + (local_contact == SMC_FIRST_CONTACT)) { + /* Server reuses a link group, but Client wants to start + * a new one + * send out_of_sync decline, reason synchr. error + */ + return -ENOLINK; + } + +create: + if (local_contact == SMC_FIRST_CONTACT) { + rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport, + lcl->id_for_peer, vlan_id); + if (rc) + goto out; + smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_link_determine_gid(conn->lgr); + } + conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; + conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&conn->acurs_lock); +#endif + +out: + return rc ? rc : local_contact; +} + +/* try to reuse a sndbuf description slot of the sndbufs list for a certain + * buf_size; if not available, return NULL + */ +static inline +struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr, + int compressed_bufsize) +{ + struct smc_buf_desc *sndbuf_slot; + + read_lock_bh(&lgr->sndbufs_lock); + list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize], + list) { + if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) { + read_unlock_bh(&lgr->sndbufs_lock); + return sndbuf_slot; + } + } + read_unlock_bh(&lgr->sndbufs_lock); + return NULL; +} + +/* try to reuse an rmb description slot of the rmbs list for a certain + * rmbe_size; if not available, return NULL + */ +static inline +struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr, + int compressed_bufsize) +{ + struct smc_buf_desc *rmb_slot; + + read_lock_bh(&lgr->rmbs_lock); + list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize], + list) { + if (cmpxchg(&rmb_slot->used, 0, 1) == 0) { + read_unlock_bh(&lgr->rmbs_lock); + return rmb_slot; + } + } + read_unlock_bh(&lgr->rmbs_lock); + return NULL; +} + +/* one of the conditions for announcing a receiver's current window size is + * that it "results in a minimum increase in the window size of 10% of the + * receive buffer space" [RFC7609] + */ +static inline int smc_rmb_wnd_update_limit(int rmbe_size) +{ + return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); +} + +/* create the tx buffer for an SMC socket */ +int smc_sndbuf_create(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + int tmp_bufsize, tmp_bufsize_short; + struct smc_buf_desc *sndbuf_desc; + int rc; + + /* use socket send buffer size (w/o overhead) as start value */ + for (tmp_bufsize_short = 
smc_compress_bufsize(smc->sk.sk_sndbuf / 2); + tmp_bufsize_short >= 0; tmp_bufsize_short--) { + tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); + /* check for reusable sndbuf_slot in the link group */ + sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short); + if (sndbuf_desc) { + memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize); + break; /* found reusable slot */ + } + /* try to alloc a new send buffer */ + sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL); + if (!sndbuf_desc) + break; /* give up with -ENOMEM */ + sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize, + GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | + __GFP_NORETRY); + if (!sndbuf_desc->cpu_addr) { + kfree(sndbuf_desc); + sndbuf_desc = NULL; + /* if send buffer allocation has failed, + * try a smaller one + */ + continue; + } + rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + tmp_bufsize, sndbuf_desc, + DMA_TO_DEVICE); + if (rc) { + kfree(sndbuf_desc->cpu_addr); + kfree(sndbuf_desc); + sndbuf_desc = NULL; + continue; /* if mapping failed, try smaller one */ + } + sndbuf_desc->used = 1; + write_lock_bh(&lgr->sndbufs_lock); + list_add(&sndbuf_desc->list, + &lgr->sndbufs[tmp_bufsize_short]); + write_unlock_bh(&lgr->sndbufs_lock); + break; + } + if (sndbuf_desc && sndbuf_desc->cpu_addr) { + conn->sndbuf_desc = sndbuf_desc; + conn->sndbuf_size = tmp_bufsize; + smc->sk.sk_sndbuf = tmp_bufsize * 2; + atomic_set(&conn->sndbuf_space, tmp_bufsize); + return 0; + } else { + return -ENOMEM; + } +} + +/* create the RMB for an SMC socket (even though the SMC protocol + * allows more than one RMB-element per RMB, the Linux implementation + * uses just one RMB-element per RMB, i.e. uses an extra RMB for every + * connection in a link group + */ +int smc_rmb_create(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + int tmp_bufsize, tmp_bufsize_short; + struct smc_buf_desc *rmb_desc; + int rc; + + /* use socket recv buffer size (w/o overhead) as start value */ + for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2); + tmp_bufsize_short >= 0; tmp_bufsize_short--) { + tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); + /* check for reusable rmb_slot in the link group */ + rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short); + if (rmb_desc) { + memset(rmb_desc->cpu_addr, 0, tmp_bufsize); + break; /* found reusable slot */ + } + /* try to alloc a new RMB */ + rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL); + if (!rmb_desc) + break; /* give up with -ENOMEM */ + rmb_desc->cpu_addr = kzalloc(tmp_bufsize, + GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | + __GFP_NORETRY); + if (!rmb_desc->cpu_addr) { + kfree(rmb_desc); + rmb_desc = NULL; + /* if RMB allocation has failed, + * try a smaller one + */ + continue; + } + rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + tmp_bufsize, rmb_desc, + DMA_FROM_DEVICE); + if (rc) { + kfree(rmb_desc->cpu_addr); + kfree(rmb_desc); + rmb_desc = NULL; + continue; /* if mapping failed, try smaller one */ + } + rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + &rmb_desc->mr_rx[SMC_SINGLE_LINK]); + if (rc) { + smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + tmp_bufsize, rmb_desc, + DMA_FROM_DEVICE); + kfree(rmb_desc->cpu_addr); + kfree(rmb_desc); + rmb_desc = NULL; + continue; + } + rmb_desc->used = 1; + write_lock_bh(&lgr->rmbs_lock); + list_add(&rmb_desc->list, + &lgr->rmbs[tmp_bufsize_short]); + write_unlock_bh(&lgr->rmbs_lock); 
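Editorial note: smc_sndbuf_create() and smc_rmb_create() in this file share one fallback pattern — start from the buffer size implied by the socket's sk_sndbuf/sk_rcvbuf and, whenever allocation or DMA mapping fails, retry with the next smaller supported size. The sketch below shows only that descending-size idea in plain user-space C; the size table and the alloc_largest_buf() helper are invented for illustration and do not reproduce the kernel's smc_compress_bufsize() encoding.

#include <stdlib.h>
#include <string.h>

static const size_t buf_sizes[] = { 1 << 19, 1 << 18, 1 << 17, 1 << 16, 1 << 14 };

/* Try the largest size not exceeding the request first and step down on
 * failure; returning NULL corresponds to the kernel's -ENOMEM path.
 * (The real code additionally clamps to a protocol-defined minimum size.)
 */
static void *alloc_largest_buf(size_t wanted, size_t *granted)
{
        size_t n = sizeof(buf_sizes) / sizeof(buf_sizes[0]);

        for (size_t i = 0; i < n; i++) {
                void *buf;

                if (buf_sizes[i] > wanted)
                        continue;               /* skip sizes above the request */
                buf = malloc(buf_sizes[i]);
                if (!buf)
                        continue;               /* like the kernel loop: retry smaller */
                memset(buf, 0, buf_sizes[i]);
                *granted = buf_sizes[i];
                return buf;
        }
        *granted = 0;
        return NULL;
}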
+ break; + } + if (rmb_desc && rmb_desc->cpu_addr) { + conn->rmb_desc = rmb_desc; + conn->rmbe_size = tmp_bufsize; + conn->rmbe_size_short = tmp_bufsize_short; + smc->sk.sk_rcvbuf = tmp_bufsize * 2; + atomic_set(&conn->bytes_to_rcv, 0); + conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize); + return 0; + } else { + return -ENOMEM; + } +} + +static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) +{ + int i; + + for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) { + if (!test_and_set_bit(i, lgr->rtokens_used_mask)) + return i; + } + return -ENOSPC; +} + +/* save rkey and dma_addr received from peer during clc handshake */ +int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc) +{ + u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr); + struct smc_link_group *lgr = conn->lgr; + u32 rkey = ntohl(clc->rmb_rkey); + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && + test_bit(i, lgr->rtokens_used_mask)) { + conn->rtoken_idx = i; + return 0; + } + } + conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr); + if (conn->rtoken_idx < 0) + return conn->rtoken_idx; + lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey; + lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr; + return 0; +} diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h new file mode 100644 index 000000000000..27eb38056a27 --- /dev/null +++ b/net/smc/smc_core.h @@ -0,0 +1,181 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for SMC Connections, Link Groups and Links + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_CORE_H +#define _SMC_CORE_H + +#include <linux/atomic.h> +#include <rdma/ib_verbs.h> + +#include "smc.h" +#include "smc_ib.h" + +#define SMC_RMBS_PER_LGR_MAX 255 /* max. 
# of RMBs per link group */ + +struct smc_lgr_list { /* list of link group definition */ + struct list_head list; + spinlock_t lock; /* protects list of link groups */ +}; + +extern struct smc_lgr_list smc_lgr_list; /* list of link groups */ + +enum smc_lgr_role { /* possible roles of a link group */ + SMC_CLNT, /* client */ + SMC_SERV /* server */ +}; + +#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ + +struct smc_wr_buf { + u8 raw[SMC_WR_BUF_SIZE]; +}; + +struct smc_link { + struct smc_ib_device *smcibdev; /* ib-device */ + u8 ibport; /* port - values 1 | 2 */ + struct ib_pd *roce_pd; /* IB protection domain, + * unique for every RoCE QP + */ + struct ib_qp *roce_qp; /* IB queue pair */ + struct ib_qp_attr qp_attr; /* IB queue pair attributes */ + + struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */ + struct ib_send_wr *wr_tx_ibs; /* WR send meta data */ + struct ib_sge *wr_tx_sges; /* WR send gather meta data */ + struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ + /* above four vectors have wr_tx_cnt elements and use the same index */ + dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ + atomic_long_t wr_tx_id; /* seq # of last sent WR */ + unsigned long *wr_tx_mask; /* bit mask of used indexes */ + u32 wr_tx_cnt; /* number of WR send buffers */ + wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ + + struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ + struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ + /* above three vectors have wr_rx_cnt elements and use the same index */ + dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + u64 wr_rx_id; /* seq # of last recv WR */ + u32 wr_rx_cnt; /* number of WR recv buffers */ + + union ib_gid gid; /* gid matching used vlan id */ + u32 peer_qpn; /* QP number of peer */ + enum ib_mtu path_mtu; /* used mtu */ + enum ib_mtu peer_mtu; /* mtu size of peer */ + u32 psn_initial; /* QP tx initial packet seqno */ + u32 peer_psn; /* QP rx initial packet seqno */ + u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ + u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/ + u8 link_id; /* unique # within link group */ + struct completion llc_confirm; /* wait for rx of conf link */ + struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ +}; + +/* For now we just allow one parallel link per link group. The SMC protocol + * allows more (up to 8). 
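Editorial note: the sndbufs[]/rmbs[] lists declared just below keep smc_buf_desc entries around after a connection ends, and smc_sndbuf_get_slot()/smc_rmb_get_slot() in smc_core.c claim a free descriptor by atomically flipping its used flag from 0 to 1 while only holding the list read lock, so concurrent connections cannot grab the same slot. A minimal user-space analogue of that claim step, using C11 atomics in place of the kernel's cmpxchg() (struct and function names here are simplified, not the kernel's):

#include <stdatomic.h>
#include <stddef.h>

struct buf_desc {
        atomic_int used;                /* 0 = free, 1 = claimed */
        void *cpu_addr;                 /* the actual buffer */
        struct buf_desc *next;
};

/* Walk the reuse list and claim the first descriptor whose 'used' flag we
 * manage to switch from 0 to 1; NULL means the caller must allocate fresh.
 */
static struct buf_desc *get_slot(struct buf_desc *head)
{
        for (struct buf_desc *d = head; d; d = d->next) {
                int expected = 0;

                if (atomic_compare_exchange_strong(&d->used, &expected, 1))
                        return d;
        }
        return NULL;
}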
+ */ +#define SMC_LINKS_PER_LGR_MAX 1 +#define SMC_SINGLE_LINK 0 + +#define SMC_FIRST_CONTACT 1 /* first contact to a peer */ +#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/ + +/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ +struct smc_buf_desc { + struct list_head list; + u64 dma_addr[SMC_LINKS_PER_LGR_MAX]; + /* mapped address of buffer */ + void *cpu_addr; /* virtual address of buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: + * rkey provided to peer + */ + u32 used; /* currently used / unused */ +}; + +struct smc_rtoken { /* address/key of remote RMB */ + u64 dma_addr; + u32 rkey; +}; + +#define SMC_LGR_ID_SIZE 4 + +struct smc_link_group { + struct list_head list; + enum smc_lgr_role role; /* client or server */ + __be32 daddr; /* destination ip address */ + struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ + char peer_systemid[SMC_SYSTEMID_LEN]; + /* unique system_id of peer */ + struct rb_root conns_all; /* connection tree */ + rwlock_t conns_lock; /* protects conns_all */ + unsigned int conns_num; /* current # of connections */ + unsigned short vlan_id; /* vlan id of link group */ + + struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ + rwlock_t sndbufs_lock; /* protects tx buffers */ + struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ + rwlock_t rmbs_lock; /* protects rx buffers */ + struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] + [SMC_LINKS_PER_LGR_MAX]; + /* remote addr/key pairs */ + unsigned long rtokens_used_mask[BITS_TO_LONGS( + SMC_RMBS_PER_LGR_MAX)]; + /* used rtoken elements */ + + u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ + struct delayed_work free_work; /* delayed freeing of an lgr */ + bool sync_err; /* lgr no longer fits to peer */ +}; + +/* Find the connection associated with the given alert token in the link group. + * To use rbtrees we have to implement our own search core. + * Requires @conns_lock + * @token alert token to search for + * @lgr link group to search in + * Returns connection associated with token if found, NULL otherwise. + */ +static inline struct smc_connection *smc_lgr_find_conn( + u32 token, struct smc_link_group *lgr) +{ + struct smc_connection *res = NULL; + struct rb_node *node; + + node = lgr->conns_all.rb_node; + while (node) { + struct smc_connection *cur = rb_entry(node, + struct smc_connection, alert_node); + + if (cur->alert_token_local > token) { + node = node->rb_left; + } else { + if (cur->alert_token_local < token) { + node = node->rb_right; + } else { + res = cur; + break; + } + } + } + + return res; +} + +struct smc_sock; +struct smc_clc_msg_accept_confirm; + +void smc_lgr_free(struct smc_link_group *lgr); +void smc_lgr_terminate(struct smc_link_group *lgr); +int smc_sndbuf_create(struct smc_sock *smc); +int smc_rmb_create(struct smc_sock *smc); +int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc); + +#endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c new file mode 100644 index 000000000000..d2d01cf70224 --- /dev/null +++ b/net/smc/smc_diag.c @@ -0,0 +1,215 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Monitoring SMC transport protocol sockets + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sock_diag.h> +#include <linux/inet_diag.h> +#include <linux/smc_diag.h> +#include <net/netlink.h> +#include <net/smc.h> + +#include "smc.h" +#include "smc_core.h" + +static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + r->diag_family = sk->sk_family; + if (!smc->clcsock) + return; + r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); + r->id.idiag_dport = smc->clcsock->sk->sk_dport; + r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; + r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; +} + +static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, + struct smc_diag_msg *r, + struct user_namespace *user_ns) +{ + if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) + return 1; + + r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_inode = sock_i_ino(sk); + return 0; +} + +static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct smc_diag_req *req, + struct nlattr *bc) +{ + struct smc_sock *smc = smc_sk(sk); + struct user_namespace *user_ns; + struct smc_diag_msg *r; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + smc_diag_msg_common_fill(r, sk); + r->diag_state = sk->sk_state; + r->diag_fallback = smc->use_fallback; + user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); + if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) + goto errout; + + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) { + struct smc_connection *conn = &smc->conn; + struct smc_diag_conninfo cinfo = { + .token = conn->alert_token_local, + .sndbuf_size = conn->sndbuf_size, + .rmbe_size = conn->rmbe_size, + .peer_rmbe_size = conn->peer_rmbe_size, + + .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, + .rx_prod.count = conn->local_rx_ctrl.prod.count, + .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, + .rx_cons.count = conn->local_rx_ctrl.cons.count, + + .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, + .tx_prod.count = conn->local_tx_ctrl.prod.count, + .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, + .tx_cons.count = conn->local_tx_ctrl.cons.count, + + .tx_prod_flags = + *(u8 *)&conn->local_tx_ctrl.prod_flags, + .tx_conn_state_flags = + *(u8 *)&conn->local_tx_ctrl.conn_state_flags, + .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, + .rx_conn_state_flags = + *(u8 *)&conn->local_rx_ctrl.conn_state_flags, + + .tx_prep.wrap = conn->tx_curs_prep.wrap, + .tx_prep.count = conn->tx_curs_prep.count, + .tx_sent.wrap = conn->tx_curs_sent.wrap, + .tx_sent.count = 
conn->tx_curs_sent.count, + .tx_fin.wrap = conn->tx_curs_fin.wrap, + .tx_fin.count = conn->tx_curs_fin.count, + }; + + if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) + goto errout; + } + + if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) { + struct smc_diag_lgrinfo linfo = { + .role = smc->conn.lgr->role, + .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, + .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id, + }; + + memcpy(linfo.lnk[0].ibname, + smc->conn.lgr->lnk[0].smcibdev->ibdev->name, + sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, + smc->conn.lgr->lnk[0].gid.raw); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, + smc->conn.lgr->lnk[0].peer_gid); + + if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) + goto errout; + } + + nlmsg_end(skb, nlh); + return 0; + +errout: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *bc = NULL; + struct hlist_head *head; + struct sock *sk; + int rc = 0; + + read_lock(&smc_proto.h.smc_hash->lock); + head = &smc_proto.h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc) + break; + } + +out: + read_unlock(&smc_proto.h.smc_hash->lock); + return rc; +} + +static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + struct net *net = sock_net(skb->sk); + + if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && + h->nlmsg_flags & NLM_F_DUMP) { + { + struct netlink_dump_control c = { + .dump = smc_diag_dump, + .min_dump_alloc = SKB_WITH_OVERHEAD(32768), + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + } + return 0; +} + +static const struct sock_diag_handler smc_diag_handler = { + .family = AF_SMC, + .dump = smc_diag_handler_dump, +}; + +static int __init smc_diag_init(void) +{ + return sock_diag_register(&smc_diag_handler); +} + +static void __exit smc_diag_exit(void) +{ + sock_diag_unregister(&smc_diag_handler); +} + +module_init(smc_diag_init); +module_exit(smc_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c new file mode 100644 index 000000000000..e6743c008ac5 --- /dev/null +++ b/net/smc/smc_ib.c @@ -0,0 +1,466 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * IB infrastructure: + * Establish SMC-R as an Infiniband Client to be notified about added and + * removed IB devices of type RDMA. + * Determine device and port characteristics for these IB devices. + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/random.h> +#include <linux/workqueue.h> +#include <rdma/ib_verbs.h> + +#include "smc_pnet.h" +#include "smc_ib.h" +#include "smc_core.h" +#include "smc_wr.h" +#include "smc.h" + +#define SMC_QP_MIN_RNR_TIMER 5 +#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */ +#define SMC_QP_RETRY_CNT 7 /* 7: infinite */ +#define SMC_QP_RNR_RETRY 7 /* 7: infinite */ + +struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ + .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock), + .list = LIST_HEAD_INIT(smc_ib_devices.list), +}; + +#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%" + +u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system + * identifier + */ + +int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, + struct ib_mr **mr) +{ + int rc; + + if (*mr) + return 0; /* already done */ + + /* obtain unique key - + * next invocation of get_dma_mr returns a different key! + */ + *mr = pd->device->get_dma_mr(pd, access_flags); + rc = PTR_ERR_OR_ZERO(*mr); + if (IS_ERR(*mr)) + *mr = NULL; + return rc; +} + +static int smc_ib_modify_qp_init(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.pkey_index = 0; + qp_attr.port_num = lnk->ibport; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE + | IB_ACCESS_REMOTE_WRITE; + return ib_modify_qp(lnk->roce_qp, &qp_attr, + IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_ACCESS_FLAGS | IB_QP_PORT); +} + +static int smc_ib_modify_qp_rtr(struct smc_link *lnk) +{ + enum ib_qp_attr_mask qp_attr_mask = + IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | + IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTR; + qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); + qp_attr.ah_attr.port_num = lnk->ibport; + qp_attr.ah_attr.ah_flags = IB_AH_GRH; + qp_attr.ah_attr.grh.hop_limit = 1; + memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid, + sizeof(lnk->peer_gid)); + memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac, + sizeof(lnk->peer_mac)); + qp_attr.dest_qp_num = lnk->peer_qpn; + qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ + qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming + * requests + */ + qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER; + + return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask); +} + +int smc_ib_modify_qp_rts(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */ + qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */ + qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */ + qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */ + qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and + * atomic ops allowed + */ + return ib_modify_qp(lnk->roce_qp, &qp_attr, + IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | + IB_QP_SQ_PSN | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC); +} + +int smc_ib_modify_qp_reset(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RESET; + return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); +} + +int smc_ib_ready_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = + container_of(lnk, struct smc_link_group, 
lnk[0]); + int rc = 0; + + rc = smc_ib_modify_qp_init(lnk); + if (rc) + goto out; + + rc = smc_ib_modify_qp_rtr(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + IB_CQ_SOLICITED_MASK); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + + if (lgr->role == SMC_SERV) { + rc = smc_ib_modify_qp_rts(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + } +out: + return rc; +} + +/* process context wrapper for might_sleep smc_ib_remember_port_attr */ +static void smc_ib_port_event_work(struct work_struct *work) +{ + struct smc_ib_device *smcibdev = container_of( + work, struct smc_ib_device, port_event_work); + u8 port_idx; + + for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { + smc_ib_remember_port_attr(smcibdev, port_idx + 1); + clear_bit(port_idx, &smcibdev->port_event_mask); + } +} + +/* can be called in IRQ context */ +static void smc_ib_global_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + struct smc_ib_device *smcibdev; + u8 port_idx; + + smcibdev = container_of(handler, struct smc_ib_device, event_handler); + if (!smc_pnet_find_ib(smcibdev->ibdev->name)) + return; + + switch (ibevent->event) { + case IB_EVENT_PORT_ERR: + port_idx = ibevent->element.port_num - 1; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); + /* fall through */ + case IB_EVENT_DEVICE_FATAL: + /* tbd in follow-on patch: + * abnormal close of corresponding connections + */ + break; + case IB_EVENT_PORT_ACTIVE: + port_idx = ibevent->element.port_num - 1; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); + break; + default: + break; + } +} + +void smc_ib_dealloc_protection_domain(struct smc_link *lnk) +{ + ib_dealloc_pd(lnk->roce_pd); + lnk->roce_pd = NULL; +} + +int smc_ib_create_protection_domain(struct smc_link *lnk) +{ + int rc; + + lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); + rc = PTR_ERR_OR_ZERO(lnk->roce_pd); + if (IS_ERR(lnk->roce_pd)) + lnk->roce_pd = NULL; + return rc; +} + +static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) +{ + switch (ibevent->event) { + case IB_EVENT_DEVICE_FATAL: + case IB_EVENT_GID_CHANGE: + case IB_EVENT_PORT_ERR: + case IB_EVENT_QP_ACCESS_ERR: + /* tbd in follow-on patch: + * abnormal close of corresponding connections + */ + break; + default: + break; + } +} + +void smc_ib_destroy_queue_pair(struct smc_link *lnk) +{ + ib_destroy_qp(lnk->roce_qp); + lnk->roce_qp = NULL; +} + +/* create a queue pair within the protection domain for a link */ +int smc_ib_create_queue_pair(struct smc_link *lnk) +{ + struct ib_qp_init_attr qp_attr = { + .event_handler = smc_ib_qp_event_handler, + .qp_context = lnk, + .send_cq = lnk->smcibdev->roce_cq_send, + .recv_cq = lnk->smcibdev->roce_cq_recv, + .srq = NULL, + .cap = { + .max_send_wr = SMC_WR_BUF_CNT, + /* include unsolicited rdma_writes as well, + * there are max. 
2 RDMA_WRITE per 1 WR_SEND + */ + .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_send_sge = SMC_IB_MAX_SEND_SGE, + .max_recv_sge = 1, + .max_inline_data = SMC_WR_TX_SIZE, + }, + .sq_sig_type = IB_SIGNAL_REQ_WR, + .qp_type = IB_QPT_RC, + }; + int rc; + + lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); + rc = PTR_ERR_OR_ZERO(lnk->roce_qp); + if (IS_ERR(lnk->roce_qp)) + lnk->roce_qp = NULL; + else + smc_wr_remember_qp_attr(lnk); + return rc; +} + +/* map a new TX or RX buffer to DMA */ +int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + int rc = 0; + + if (buf_slot->dma_addr[SMC_SINGLE_LINK]) + return rc; /* already mapped */ + buf_slot->dma_addr[SMC_SINGLE_LINK] = + ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr, + buf_size, data_direction); + if (ib_dma_mapping_error(smcibdev->ibdev, + buf_slot->dma_addr[SMC_SINGLE_LINK])) + rc = -EIO; + return rc; +} + +void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + if (!buf_slot->dma_addr[SMC_SINGLE_LINK]) + return; /* already unmapped */ + ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size, + data_direction); + buf_slot->dma_addr[SMC_SINGLE_LINK] = 0; +} + +static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct net_device *ndev; + int rc; + + rc = ib_query_gid(smcibdev->ibdev, ibport, 0, + &smcibdev->gid[ibport - 1], NULL); + /* the SMC protocol requires specification of the roce MAC address; + * if net_device cannot be determined, it can be derived from gid 0 + */ + ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); + if (ndev) { + memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); + } else if (!rc) { + memcpy(&smcibdev->mac[ibport - 1][0], + &smcibdev->gid[ibport - 1].raw[8], 3); + memcpy(&smcibdev->mac[ibport - 1][3], + &smcibdev->gid[ibport - 1].raw[13], 3); + smcibdev->mac[ibport - 1][0] &= ~0x02; + } + return rc; +} + +/* Create an identifier unique for this instance of SMC-R. + * The MAC-address of the first active registered IB device + * plus a random 2-byte number is used to create this identifier. + * This name is delivered to the peer during connection initialization. 
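Editorial note: a RoCE GID embeds the adapter's EUI-64, so when no net_device is available smc_ib_fill_gid_and_mac() above rebuilds the 6-byte MAC from GID bytes 8-10 and 13-15 and clears the 0x02 bit of the first octet, and smc_ib_define_local_systemid() then forms the identifier sent to peers as two random bytes followed by that MAC. A standalone sketch of the same byte manipulation (helper names chosen here for illustration):

#include <stdint.h>
#include <string.h>

/* 'gid' is the 16-byte raw RoCE GID. */
static void mac_from_gid(const uint8_t gid[16], uint8_t mac[6])
{
        memcpy(&mac[0], &gid[8], 3);    /* upper half of the embedded EUI-64 */
        memcpy(&mac[3], &gid[13], 3);   /* lower half, skipping the ff:fe filler */
        mac[0] &= ~0x02;                /* same bit manipulation as the kernel code */
}

static void build_systemid(const uint8_t mac[6], const uint8_t rnd[2],
                           uint8_t systemid[8])
{
        systemid[0] = rnd[0];           /* two random bytes ... */
        systemid[1] = rnd[1];
        memcpy(&systemid[2], mac, 6);   /* ... followed by the 6-byte MAC */
}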
+ */ +static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, + u8 ibport) +{ + memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], + sizeof(smcibdev->mac[ibport - 1])); + get_random_bytes(&local_systemid[0], 2); +} + +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) +{ + return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; +} + +int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) +{ + int rc; + + memset(&smcibdev->pattr[ibport - 1], 0, + sizeof(smcibdev->pattr[ibport - 1])); + rc = ib_query_port(smcibdev->ibdev, ibport, + &smcibdev->pattr[ibport - 1]); + if (rc) + goto out; + rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); + if (rc) + goto out; + if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, + sizeof(local_systemid)) && + smc_ib_port_active(smcibdev, ibport)) + /* create unique system identifier */ + smc_ib_define_local_systemid(smcibdev, ibport); +out: + return rc; +} + +long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) +{ + struct ib_cq_init_attr cqattr = { + .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 }; + long rc; + + smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); + if (IS_ERR(smcibdev->roce_cq_send)) { + smcibdev->roce_cq_send = NULL; + return rc; + } + smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); + if (IS_ERR(smcibdev->roce_cq_recv)) { + smcibdev->roce_cq_recv = NULL; + goto err; + } + INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, + smc_ib_global_event_handler); + ib_register_event_handler(&smcibdev->event_handler); + smc_wr_add_dev(smcibdev); + smcibdev->initialized = 1; + return rc; + +err: + ib_destroy_cq(smcibdev->roce_cq_send); + return rc; +} + +static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) +{ + if (!smcibdev->initialized) + return; + smc_wr_remove_dev(smcibdev); + ib_unregister_event_handler(&smcibdev->event_handler); + ib_destroy_cq(smcibdev->roce_cq_recv); + ib_destroy_cq(smcibdev->roce_cq_send); +} + +static struct ib_client smc_ib_client; + +/* callback function for ib_register_client() */ +static void smc_ib_add_dev(struct ib_device *ibdev) +{ + struct smc_ib_device *smcibdev; + + if (ibdev->node_type != RDMA_NODE_IB_CA) + return; + + smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); + if (!smcibdev) + return; + + smcibdev->ibdev = ibdev; + INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); + + spin_lock(&smc_ib_devices.lock); + list_add_tail(&smcibdev->list, &smc_ib_devices.list); + spin_unlock(&smc_ib_devices.lock); + ib_set_client_data(ibdev, &smc_ib_client, smcibdev); +} + +/* callback function for ib_register_client() */ +static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) +{ + struct smc_ib_device *smcibdev; + + smcibdev = ib_get_client_data(ibdev, &smc_ib_client); + ib_set_client_data(ibdev, &smc_ib_client, NULL); + spin_lock(&smc_ib_devices.lock); + list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ + spin_unlock(&smc_ib_devices.lock); + smc_pnet_remove_by_ibdev(smcibdev); + smc_ib_cleanup_per_ibdev(smcibdev); + kfree(smcibdev); +} + +static struct ib_client smc_ib_client = { + .name = "smc_ib", + .add = smc_ib_add_dev, + .remove = smc_ib_remove_dev, +}; + +int __init smc_ib_register_client(void) +{ + return ib_register_client(&smc_ib_client); +} + +void 
smc_ib_unregister_client(void) +{ + ib_unregister_client(&smc_ib_client); +} diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h new file mode 100644 index 000000000000..a95f74bb5569 --- /dev/null +++ b/net/smc/smc_ib.h @@ -0,0 +1,71 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for IB environment + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <Ursula Braun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_IB_H +#define _SMC_IB_H + +#include <linux/if_ether.h> +#include <rdma/ib_verbs.h> + +#define SMC_MAX_PORTS 2 /* Max # of ports */ +#define SMC_GID_SIZE sizeof(union ib_gid) + +#define SMC_IB_MAX_SEND_SGE 2 + +struct smc_ib_devices { /* list of smc ib devices definition */ + struct list_head list; + spinlock_t lock; /* protects list of smc ib devices */ +}; + +extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ + +struct smc_ib_device { /* ib-device infos for smc */ + struct list_head list; + struct ib_device *ibdev; + struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ + struct ib_event_handler event_handler; /* global ib_event handler */ + struct ib_cq *roce_cq_send; /* send completion queue */ + struct ib_cq *roce_cq_recv; /* recv completion queue */ + struct tasklet_struct send_tasklet; /* called by send cq handler */ + struct tasklet_struct recv_tasklet; /* called by recv cq handler */ + char mac[SMC_MAX_PORTS][ETH_ALEN]; + /* mac address per port*/ + union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */ + u8 initialized : 1; /* ib dev CQ, evthdl done */ + struct work_struct port_event_work; + unsigned long port_event_mask; +}; + +struct smc_buf_desc; +struct smc_link; + +int smc_ib_register_client(void) __init; +void smc_ib_unregister_client(void); +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); +int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport); +int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +void smc_ib_dealloc_protection_domain(struct smc_link *lnk); +int smc_ib_create_protection_domain(struct smc_link *lnk); +void smc_ib_destroy_queue_pair(struct smc_link *lnk); +int smc_ib_create_queue_pair(struct smc_link *lnk); +int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, + struct ib_mr **mr); +int smc_ib_ready_link(struct smc_link *lnk); +int smc_ib_modify_qp_rts(struct smc_link *lnk); +int smc_ib_modify_qp_reset(struct smc_link *lnk); +long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); + + +#endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c new file mode 100644 index 000000000000..c2f9165d13ef --- /dev/null +++ b/net/smc/smc_llc.c @@ -0,0 +1,158 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Link Layer Control (LLC) + * + * For now, we only support the necessary "confirm link" functionality + * which happens for the first RoCE link after successful CLC handshake. + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com> + * Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <net/tcp.h> +#include <rdma/ib_verbs.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_clc.h" +#include "smc_llc.h" + +/********************************** send *************************************/ + +struct smc_llc_tx_pend { +}; + +/* handler for send/transmission completion of an LLC msg */ +static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend, + struct smc_link *link, + enum ib_wc_status wc_status) +{ + /* future work: handle wc_status error for recovery and failover */ +} + +/** + * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits + * @link: Pointer to SMC link used for sending LLC control message. + * @wr_buf: Out variable returning pointer to work request payload buffer. + * @pend: Out variable returning pointer to private pending WR tracking. + * It's the context the transmit complete handler will get. + * + * Reserves and pre-fills an entry for a pending work request send/tx. + * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx. + * Can sleep due to smc_get_ctrl_buf (if not in softirq context). + * + * Return: 0 on success, otherwise an error value. + */ +static int smc_llc_add_pending_send(struct smc_link *link, + struct smc_wr_buf **wr_buf, + struct smc_wr_tx_pend_priv **pend) +{ + int rc; + + rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend); + if (rc < 0) + return rc; + BUILD_BUG_ON_MSG( + sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE, + "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)"); + BUILD_BUG_ON_MSG( + sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE, + "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); + BUILD_BUG_ON_MSG( + sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, + "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)"); + return 0; +} + +/* high-level API to send LLC confirm link */ +int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], + union ib_gid *gid, + enum smc_llc_reqresp reqresp) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + struct smc_llc_msg_confirm_link *confllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + confllc = (struct smc_llc_msg_confirm_link *)wr_buf; + memset(confllc, 0, sizeof(*confllc)); + confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; + confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link); + if (reqresp == SMC_LLC_RESP) + confllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(confllc->sender_mac, mac, ETH_ALEN); + memcpy(confllc->sender_gid, gid, SMC_GID_SIZE); + hton24(confllc->sender_qp_num, link->roce_qp->qp_num); + /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */ + memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); + confllc->max_links = SMC_LINKS_PER_LGR_MAX; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/********************************* receive ***********************************/ + +static void smc_llc_rx_confirm_link(struct smc_link *link, + struct smc_llc_msg_confirm_link *llc) +{ + struct smc_link_group *lgr; + + lgr = container_of(link, struct 
smc_link_group, lnk[SMC_SINGLE_LINK]); + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + if (lgr->role == SMC_SERV) + complete(&link->llc_confirm_resp); + } else { + if (lgr->role == SMC_CLNT) { + link->link_id = llc->link_num; + complete(&link->llc_confirm); + } + } +} + +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK) + smc_llc_rx_confirm_link(link, &llc->confirm_link); +} + +/***************************** init, exit, misc ******************************/ + +static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_LINK + }, + { + .handler = NULL, + } +}; + +int __init smc_llc_init(void) +{ + struct smc_wr_rx_handler *handler; + int rc = 0; + + for (handler = smc_llc_rx_handlers; handler->handler; handler++) { + INIT_HLIST_NODE(&handler->list); + rc = smc_wr_rx_register_handler(handler); + if (rc) + break; + } + return rc; +} diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h new file mode 100644 index 000000000000..b472f853953a --- /dev/null +++ b/net/smc/smc_llc.h @@ -0,0 +1,63 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for LLC (link layer control) message handling + * + * Copyright IBM Corp. 2016 + * + * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com> + * Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_LLC_H +#define SMC_LLC_H + +#include "smc_wr.h" + +#define SMC_LLC_FLAG_RESP 0x80 + +#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) + +enum smc_llc_reqresp { + SMC_LLC_REQ, + SMC_LLC_RESP +}; + +enum smc_llc_msg_type { + SMC_LLC_CONFIRM_LINK = 0x01, +}; + +#define SMC_LLC_DATA_LEN 40 + +struct smc_llc_hdr { + struct smc_wr_rx_hdr common; + u8 length; /* 44 */ + u8 reserved; + u8 flags; +}; + +struct smc_llc_msg_confirm_link { /* type 0x01 */ + struct smc_llc_hdr hd; + u8 sender_mac[ETH_ALEN]; + u8 sender_gid[SMC_GID_SIZE]; + u8 sender_qp_num[3]; + u8 link_num; + u8 link_uid[SMC_LGR_ID_SIZE]; + u8 max_links; + u8 reserved[9]; +}; + +union smc_llc_msg { + struct smc_llc_msg_confirm_link confirm_link; + struct { + struct smc_llc_hdr hdr; + u8 data[SMC_LLC_DATA_LEN]; + } raw; +}; + +/* transmit */ +int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid, + enum smc_llc_reqresp reqresp); +int smc_llc_init(void) __init; + +#endif /* SMC_LLC_H */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c new file mode 100644 index 000000000000..9d3e7fb8348d --- /dev/null +++ b/net/smc/smc_pnet.c @@ -0,0 +1,534 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic netlink support functions to configure an SMC-R PNET table + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/if.h> +#include <uapi/linux/smc.h> + +#include <rdma/ib_verbs.h> + +#include "smc_pnet.h" +#include "smc_ib.h" + +#define SMC_MAX_PNET_ID_LEN 16 /* Max. 
length of PNET id */ + +static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { + [SMC_PNETID_NAME] = { + .type = NLA_NUL_STRING, + .len = SMC_MAX_PNET_ID_LEN - 1 + }, + [SMC_PNETID_ETHNAME] = { + .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 + }, + [SMC_PNETID_IBNAME] = { + .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1 + }, + [SMC_PNETID_IBPORT] = { .type = NLA_U8 } +}; + +static struct genl_family smc_pnet_nl_family; + +/** + * struct smc_pnettable - SMC PNET table anchor + * @lock: Lock for list action + * @pnetlist: List of PNETIDs + */ +static struct smc_pnettable { + rwlock_t lock; + struct list_head pnetlist; +} smc_pnettable = { + .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist), + .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock) +}; + +/** + * struct smc_pnetentry - pnet identifier name entry + * @list: List node. + * @pnet_name: Pnet identifier name + * @ndev: pointer to network device. + * @smcibdev: Pointer to IB device. + */ +struct smc_pnetentry { + struct list_head list; + char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; + struct net_device *ndev; + struct smc_ib_device *smcibdev; + u8 ib_port; +}; + +/* Check if two RDMA device entries are identical. Use device name and port + * number for comparison. + */ +static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname, + u8 ibport) +{ + return pnetelem->ib_port == ibport && + !strncmp(pnetelem->smcibdev->ibdev->name, ibname, + sizeof(pnetelem->smcibdev->ibdev->name)); +} + +/* Find a pnetid in the pnet table. + */ +static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *found_pnetelem = NULL; + + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (!strncmp(pnetelem->pnet_name, pnet_name, + sizeof(pnetelem->pnet_name))) { + found_pnetelem = pnetelem; + break; + } + } + read_unlock(&smc_pnettable.lock); + return found_pnetelem; +} + +/* Remove a pnetid from the pnet table. + */ +static int smc_pnet_remove_by_pnetid(char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (!strncmp(pnetelem->pnet_name, pnet_name, + sizeof(pnetelem->pnet_name))) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Remove a pnet entry mentioning a given network device from the pnet table. + */ +static int smc_pnet_remove_by_ndev(struct net_device *ndev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (pnetelem->ndev == ndev) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Remove a pnet entry mentioning a given ib device from the pnet table. 
+ */ +int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (pnetelem->smcibdev == ibdev) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. + */ +static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) +{ + struct smc_pnetentry *pnetelem; + int rc = -EEXIST; + + write_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name, + sizeof(new_pnetelem->pnet_name)) || + !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name, + sizeof(new_pnetelem->ndev->name)) || + smc_pnet_same_ibname(pnetelem, + new_pnetelem->smcibdev->ibdev->name, + new_pnetelem->ib_port)) + goto found; + } + list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); + rc = 0; +found: + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* The limit for pnetid is 16 characters. + * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. + * Lower case letters are converted to upper case. + * Interior blanks should not be used. + */ +static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) +{ + char *bf = skip_spaces(pnet_name); + size_t len = strlen(bf); + char *end = bf + len; + + if (!len) + return false; + while (--end >= bf && isspace(*end)) + ; + if (end - bf >= SMC_MAX_PNET_ID_LEN) + return false; + while (bf <= end) { + if (!isalnum(*bf)) + return false; + *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; + bf++; + } + *pnetid = '\0'; + return true; +} + +/* Find an infiniband device by a given name. The device might not exist. */ +struct smc_ib_device *smc_pnet_find_ib(char *ib_name) +{ + struct smc_ib_device *ibdev; + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (!strncmp(ibdev->ibdev->name, ib_name, + sizeof(ibdev->ibdev->name))) { + goto out; + } + } + ibdev = NULL; +out: + spin_unlock(&smc_ib_devices.lock); + return ibdev; +} + +/* Parse the supplied netlink attributes and fill a pnetentry structure. + * For ethernet and infiniband device names verify that the devices exist. 
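Editorial note: smc_pnetid_valid() above enforces the PNET id syntax — at most 16 single-byte characters, alphanumeric only, surrounding blanks stripped, lower case folded to upper case. The same normalization expressed as ordinary user-space C (the function name is illustrative):

#include <ctype.h>
#include <stdbool.h>
#include <string.h>

#define PNET_ID_MAX 16

/* Returns true and writes the canonical upper-case id into 'out';
 * false if the candidate is empty, too long or not purely alphanumeric.
 */
static bool pnetid_normalize(const char *in, char out[PNET_ID_MAX + 1])
{
        size_t len;

        while (isspace((unsigned char)*in))
                in++;                                   /* skip leading blanks */
        len = strlen(in);
        while (len && isspace((unsigned char)in[len - 1]))
                len--;                                  /* ignore trailing blanks */
        if (!len || len > PNET_ID_MAX)
                return false;
        for (size_t i = 0; i < len; i++) {
                if (!isalnum((unsigned char)in[i]))
                        return false;
                out[i] = toupper((unsigned char)in[i]);
        }
        out[len] = '\0';
        return true;
}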
+ */ +static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem, + struct nlattr *tb[]) +{ + char *string, *ibname = NULL; + int rc = 0; + + memset(pnetelem, 0, sizeof(*pnetelem)); + INIT_LIST_HEAD(&pnetelem->list); + if (tb[SMC_PNETID_NAME]) { + string = (char *)nla_data(tb[SMC_PNETID_NAME]); + if (!smc_pnetid_valid(string, pnetelem->pnet_name)) { + rc = -EINVAL; + goto error; + } + } + if (tb[SMC_PNETID_ETHNAME]) { + string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); + pnetelem->ndev = dev_get_by_name(net, string); + if (!pnetelem->ndev) + return -ENOENT; + } + if (tb[SMC_PNETID_IBNAME]) { + ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + ibname = strim(ibname); + pnetelem->smcibdev = smc_pnet_find_ib(ibname); + if (!pnetelem->smcibdev) { + rc = -ENOENT; + goto error; + } + } + if (tb[SMC_PNETID_IBPORT]) { + pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (pnetelem->ib_port > SMC_MAX_PORTS) { + rc = -EINVAL; + goto error; + } + } + return 0; + +error: + if (pnetelem->ndev) + dev_put(pnetelem->ndev); + return rc; +} + +/* Convert an smc_pnetentry to a netlink attribute sequence */ +static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) +{ + if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) || + nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) || + nla_put_string(msg, SMC_PNETID_IBNAME, + pnetelem->smcibdev->ibdev->name) || + nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) + return -1; + return 0; +} + +/* Retrieve one PNETID entry */ +static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) +{ + struct smc_pnetentry *pnetelem; + struct sk_buff *msg; + void *hdr; + int rc; + + pnetelem = smc_pnet_find_pnetid( + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); + if (!pnetelem) + return -ENOENT; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &smc_pnet_nl_family, 0, SMC_PNETID_GET); + if (!hdr) { + rc = -EMSGSIZE; + goto err_out; + } + + if (smc_pnet_set_nla(msg, pnetelem)) { + rc = -ENOBUFS; + goto err_out; + } + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +err_out: + nlmsg_free(msg); + return rc; +} + +static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct smc_pnetentry *pnetelem; + int rc; + + pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); + if (!pnetelem) + return -ENOMEM; + rc = smc_pnet_fill_entry(net, pnetelem, info->attrs); + if (!rc) + rc = smc_pnet_enter(pnetelem); + if (rc) { + kfree(pnetelem); + return rc; + } + rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port); + if (rc) + smc_pnet_remove_by_pnetid(pnetelem->pnet_name); + return rc; +} + +static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) +{ + return smc_pnet_remove_by_pnetid( + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); +} + +static int smc_pnet_dump_start(struct netlink_callback *cb) +{ + cb->args[0] = 0; + return 0; +} + +static int smc_pnet_dumpinfo(struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, + struct smc_pnetentry *pnetelem) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, + flags, SMC_PNETID_GET); + if (!hdr) + return -ENOMEM; + if (smc_pnet_set_nla(skb, pnetelem) < 0) { + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; + } + genlmsg_end(skb, hdr); + return 0; +} + +static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct 
smc_pnetentry *pnetelem; + int idx = 0; + + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (idx++ < cb->args[0]) + continue; + if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + pnetelem)) { + --idx; + break; + } + } + cb->args[0] = idx; + read_unlock(&smc_pnettable.lock); + return skb->len; +} + +/* Remove and delete all pnetids from pnet table. + */ +static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + } + write_unlock(&smc_pnettable.lock); + return 0; +} + +/* SMC_PNETID generic netlink operation definition */ +static const struct genl_ops smc_pnet_ops[] = { + { + .cmd = SMC_PNETID_GET, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_get, + .dumpit = smc_pnet_dump, + .start = smc_pnet_dump_start + }, + { + .cmd = SMC_PNETID_ADD, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_add + }, + { + .cmd = SMC_PNETID_DEL, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_del + }, + { + .cmd = SMC_PNETID_FLUSH, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_flush + } +}; + +/* SMC_PNETID family definition */ +static struct genl_family smc_pnet_nl_family = { + .hdrsize = 0, + .name = SMCR_GENL_FAMILY_NAME, + .version = SMCR_GENL_FAMILY_VERSION, + .maxattr = SMC_PNETID_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_pnet_ops, + .n_ops = ARRAY_SIZE(smc_pnet_ops) +}; + +static int smc_pnet_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REBOOT: + case NETDEV_UNREGISTER: + smc_pnet_remove_by_ndev(event_dev); + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block smc_netdev_notifier = { + .notifier_call = smc_pnet_netdev_event +}; + +int __init smc_pnet_init(void) +{ + int rc; + + rc = genl_register_family(&smc_pnet_nl_family); + if (rc) + return rc; + rc = register_netdevice_notifier(&smc_netdev_notifier); + if (rc) + genl_unregister_family(&smc_pnet_nl_family); + return rc; +} + +void smc_pnet_exit(void) +{ + smc_pnet_flush(NULL, NULL); + unregister_netdevice_notifier(&smc_netdev_notifier); + genl_unregister_family(&smc_pnet_nl_family); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. 
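Editorial note: smc_pnet_find_roce_resource() below walks the table with the egress net_device taken from the TCP socket's cached route and hands back the paired RoCE device and port. Reduced to its essence (entry layout and names simplified for illustration; the kernel matches on net_device pointers, not name strings):

#include <stddef.h>
#include <string.h>

struct pnet_entry {
        const char *pnet_name;          /* administrator-chosen PNET id */
        const char *eth_name;           /* Ethernet interface, e.g. "eth0" */
        const char *ib_name;            /* RoCE device, e.g. "mlx5_0" */
        int ib_port;
};

/* Return the entry pairing 'egress_dev' with a RoCE port, or NULL if the
 * interface was never added to the PNET table (SMC then falls back to TCP).
 */
static const struct pnet_entry *
find_roce_for_netdev(const struct pnet_entry *tbl, size_t n,
                     const char *egress_dev)
{
        for (size_t i = 0; i < n; i++)
                if (!strcmp(tbl[i].eth_name, egress_dev))
                        return &tbl[i];
        return NULL;
}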
+ */ +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport) +{ + struct dst_entry *dst = sk_dst_get(sk); + struct smc_pnetentry *pnetelem; + + *smcibdev = NULL; + *ibport = 0; + + if (!dst) + return; + if (!dst->dev) + goto out_rel; + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (dst->dev == pnetelem->ndev) { + *smcibdev = pnetelem->smcibdev; + *ibport = pnetelem->ib_port; + break; + } + } + read_unlock(&smc_pnettable.lock); +out_rel: + dst_release(dst); +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h new file mode 100644 index 000000000000..32ab3df928ca --- /dev/null +++ b/net/smc/smc_pnet.h @@ -0,0 +1,23 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * PNET table queries + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> + */ + +#ifndef _SMC_PNET_H +#define _SMC_PNET_H + +struct smc_ib_device; + +int smc_pnet_init(void) __init; +void smc_pnet_exit(void); +int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); +struct smc_ib_device *smc_pnet_find_ib(char *ib_name); +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport); + +#endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c new file mode 100644 index 000000000000..5d1878732f46 --- /dev/null +++ b/net/smc/smc_rx.c @@ -0,0 +1,217 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage RMBE + * copy new RMBE data into user space + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <net/sock.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_cdc.h" +#include "smc_tx.h" /* smc_tx_consumer_update() */ +#include "smc_rx.h" + +/* callback implementation for sk.sk_data_ready() + * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data(). + * indirectly called by smc_cdc_msg_recv_action(). + */ +static void smc_rx_data_ready(struct sock *sk) +{ + struct socket_wq *wq; + + /* derived from sock_def_readable() */ + /* called already in smc_listen_work() */ + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | + POLLRDNORM | POLLRDBAND); + if ((sk->sk_shutdown == SHUTDOWN_MASK) || + (sk->sk_state == SMC_CLOSED)) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); +} + +/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted + * @smc smc socket + * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * Returns: + * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. + * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). 
+ */ +static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + int rc; + + if (atomic_read(&conn->bytes_to_rcv)) + return 1; + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + add_wait_queue(sk_sleep(sk), &wait); + rc = sk_wait_event(sk, timeo, + sk->sk_err || + sk->sk_shutdown & RCV_SHUTDOWN || + sock_flag(sk, SOCK_DONE) || + atomic_read(&conn->bytes_to_rcv) || + smc_cdc_rxed_any_close_or_senddone(conn), + &wait); + remove_wait_queue(sk_sleep(sk), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + return rc; +} + +/* rcvbuf consumer: main API called by socket layer. + * called under sk lock. + */ +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, + int flags) +{ + size_t copylen, read_done = 0, read_remaining = len; + size_t chunk_len, chunk_off, chunk_len_sum; + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + int readable, chunk; + char *rcvbuf_base; + struct sock *sk; + long timeo; + int target; /* Read at least these many bytes */ + int rc; + + if (unlikely(flags & MSG_ERRQUEUE)) + return -EINVAL; /* future work for sk.sk_family == AF_SMC */ + if (flags & MSG_OOB) + return -EINVAL; /* future work */ + + sk = &smc->sk; + if (sk->sk_state == SMC_LISTEN) + return -ENOTCONN; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + msg->msg_namelen = 0; + /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ + rcvbuf_base = conn->rmb_desc->cpu_addr; + + do { /* while (read_remaining) */ + if (read_done >= target) + break; + + if (atomic_read(&conn->bytes_to_rcv)) + goto copy; + + if (read_done) { + if (sk->sk_err || + sk->sk_state == SMC_CLOSED || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current) || + smc_cdc_rxed_any_close_or_senddone(conn) || + conn->local_tx_ctrl.conn_state_flags. + peer_conn_abort) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + read_done = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN || + smc_cdc_rxed_any_close_or_senddone(conn) || + conn->local_tx_ctrl.conn_state_flags. + peer_conn_abort) + break; + if (sk->sk_state == SMC_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. 
+ */ + read_done = -ENOTCONN; + break; + } + break; + } + if (signal_pending(current)) { + read_done = sock_intr_errno(timeo); + break; + } + } + + if (!atomic_read(&conn->bytes_to_rcv)) { + smc_rx_wait_data(smc, &timeo); + continue; + } + +copy: + /* initialize variables for 1st iteration of subsequent loop */ + /* could be just 1 byte, even after smc_rx_wait_data above */ + readable = atomic_read(&conn->bytes_to_rcv); + /* not more than what user space asked for */ + copylen = min_t(size_t, read_remaining, readable); + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + /* determine chunks where to read from rcvbuf */ + /* either unwrapped case, or 1st chunk of wrapped case */ + chunk_len = min_t(size_t, + copylen, conn->rmbe_size - cons.count); + chunk_len_sum = chunk_len; + chunk_off = cons.count; + for (chunk = 0; chunk < 2; chunk++) { + if (!(flags & MSG_TRUNC)) { + rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off, + chunk_len); + if (rc) { + if (!read_done) + read_done = -EFAULT; + goto out; + } + } + read_remaining -= chunk_len; + read_done += chunk_len; + + if (chunk_len_sum == copylen) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + chunk_len = copylen - chunk_len; /* remainder */ + chunk_len_sum += chunk_len; + chunk_off = 0; /* modulo offset in recv ring buffer */ + } + + /* update cursors */ + if (!(flags & MSG_PEEK)) { + smc_curs_add(conn->rmbe_size, &cons, copylen); + /* increased in recv tasklet smc_cdc_msg_rcv() */ + smp_mb__before_atomic(); + atomic_sub(copylen, &conn->bytes_to_rcv); + /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + smp_mb__after_atomic(); + smc_curs_write(&conn->local_tx_ctrl.cons, + smc_curs_read(&cons, conn), + conn); + /* send consumer cursor update if required */ + /* similar to advertising new TCP rcv_wnd if required */ + smc_tx_consumer_update(conn); + } + } while (read_remaining); +out: + return read_done; +} + +/* Initialize receive properties on connection establishment. NB: not __init! */ +void smc_rx_init(struct smc_sock *smc) +{ + smc->sk.sk_data_ready = smc_rx_data_ready; +} diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h new file mode 100644 index 000000000000..b5b80e1f8b0f --- /dev/null +++ b/net/smc/smc_rx.h @@ -0,0 +1,23 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage RMBE + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_RX_H +#define SMC_RX_H + +#include <linux/socket.h> +#include <linux/types.h> + +#include "smc.h" + +void smc_rx_init(struct smc_sock *smc); +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, + int flags); + +#endif /* SMC_RX_H */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c new file mode 100644 index 000000000000..6e73b28915ea --- /dev/null +++ b/net/smc/smc_tx.c @@ -0,0 +1,483 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage send buffer. + * Producer: + * Copy user space data into send buffer, if send buffer space available. + * Consumer: + * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available. + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <linux/workqueue.h> +#include <net/sock.h> + +#include "smc.h" +#include "smc_wr.h" +#include "smc_cdc.h" +#include "smc_tx.h" + +/***************************** sndbuf producer *******************************/ + +/* callback implementation for sk.sk_write_space() + * to wakeup sndbuf producers that blocked with smc_tx_wait_memory(). + * called under sk_socket lock. + */ +static void smc_tx_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + struct smc_sock *smc = smc_sk(sk); + struct socket_wq *wq; + + /* similar to sk_stream_write_space */ + if (atomic_read(&smc->conn.sndbuf_space) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, + POLLOUT | POLLWRNORM | + POLLWRBAND); + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); + } +} + +/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory(). + * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). + */ +void smc_tx_sndbuf_nonfull(struct smc_sock *smc) +{ + if (smc->sk.sk_socket && + test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags)) + smc->sk.sk_write_space(&smc->sk); +} + +/* blocks sndbuf producer until at least one byte of free space available */ +static int smc_tx_wait_memory(struct smc_sock *smc, int flags) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + bool noblock; + long timeo; + int rc = 0; + + /* similar to sk_stream_wait_memory */ + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + noblock = timeo ? false : true; + add_wait_queue(sk_sleep(sk), &wait); + while (1) { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + if (sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { + rc = -EPIPE; + break; + } + if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -ECONNRESET; + break; + } + if (!timeo) { + if (noblock) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + rc = -EAGAIN; + break; + } + if (signal_pending(current)) { + rc = sock_intr_errno(timeo); + break; + } + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + if (atomic_read(&conn->sndbuf_space)) + break; /* at least 1 byte of free space available */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + sk_wait_event(sk, &timeo, + sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + smc_cdc_rxed_any_close_or_senddone(conn) || + atomic_read(&conn->sndbuf_space), + &wait); + sk->sk_write_pending--; + } + remove_wait_queue(sk_sleep(sk), &wait); + return rc; +} + +/* sndbuf producer: main API called by socket layer. + * called under sock lock. 
+ */ +int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) +{ + size_t copylen, send_done = 0, send_remaining = len; + size_t chunk_len, chunk_off, chunk_len_sum; + struct smc_connection *conn = &smc->conn; + union smc_host_cursor prep; + struct sock *sk = &smc->sk; + char *sndbuf_base; + int tx_cnt_prep; + int writespace; + int rc, chunk; + + /* This should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + rc = -EPIPE; + goto out_err; + } + + while (msg_data_left(msg)) { + if (sk->sk_state == SMC_INIT) + return -ENOTCONN; + if (smc->sk.sk_shutdown & SEND_SHUTDOWN || + (smc->sk.sk_err == ECONNABORTED) || + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; + if (smc_cdc_rxed_any_close(conn)) + return send_done ?: -ECONNRESET; + + if (!atomic_read(&conn->sndbuf_space)) { + rc = smc_tx_wait_memory(smc, msg->msg_flags); + if (rc) { + if (send_done) + return send_done; + goto out_err; + } + continue; + } + + /* initialize variables for 1st iteration of subsequent loop */ + /* could be just 1 byte, even after smc_tx_wait_memory above */ + writespace = atomic_read(&conn->sndbuf_space); + /* not more than what user space asked for */ + copylen = min_t(size_t, send_remaining, writespace); + /* determine start of sndbuf */ + sndbuf_base = conn->sndbuf_desc->cpu_addr; + smc_curs_write(&prep, + smc_curs_read(&conn->tx_curs_prep, conn), + conn); + tx_cnt_prep = prep.count; + /* determine chunks where to write into sndbuf */ + /* either unwrapped case, or 1st chunk of wrapped case */ + chunk_len = min_t(size_t, + copylen, conn->sndbuf_size - tx_cnt_prep); + chunk_len_sum = chunk_len; + chunk_off = tx_cnt_prep; + for (chunk = 0; chunk < 2; chunk++) { + rc = memcpy_from_msg(sndbuf_base + chunk_off, + msg, chunk_len); + if (rc) { + if (send_done) + return send_done; + goto out_err; + } + send_done += chunk_len; + send_remaining -= chunk_len; + + if (chunk_len_sum == copylen) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + chunk_len = copylen - chunk_len; /* remainder */ + chunk_len_sum += chunk_len; + chunk_off = 0; /* modulo offset in send ring buffer */ + } + /* update cursors */ + smc_curs_add(conn->sndbuf_size, &prep, copylen); + smc_curs_write(&conn->tx_curs_prep, + smc_curs_read(&prep, conn), + conn); + /* increased in send tasklet smc_cdc_tx_handler() */ + smp_mb__before_atomic(); + atomic_sub(copylen, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + smp_mb__after_atomic(); + /* since we just produced more new data into sndbuf, + * trigger sndbuf consumer: RDMA write into peer RMBE and CDC + */ + smc_tx_sndbuf_nonempty(conn); + } /* while (msg_data_left(msg)) */ + + return send_done; + +out_err: + rc = sk_stream_error(sk, msg->msg_flags, rc); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(rc == -EAGAIN)) + sk->sk_write_space(sk); + return rc; +} + +/***************************** sndbuf consumer *******************************/ + +/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ +static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, + int num_sges, struct ib_sge sges[]) +{ + struct smc_link_group *lgr = conn->lgr; + struct ib_send_wr *failed_wr = NULL; + struct ib_rdma_wr rdma_wr; + struct smc_link *link; + int rc; + + memset(&rdma_wr, 0, sizeof(rdma_wr)); + link = &lgr->lnk[SMC_SINGLE_LINK]; + rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link); + 
rdma_wr.wr.sg_list = sges; + rdma_wr.wr.num_sge = num_sges; + rdma_wr.wr.opcode = IB_WR_RDMA_WRITE; + rdma_wr.remote_addr = + lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + /* RMBE within RMB */ + ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) + + /* offset within RMBE */ + peer_rmbe_offset; + rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr); + if (rc) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + return rc; +} + +/* sndbuf consumer */ +static inline void smc_tx_advance_cursors(struct smc_connection *conn, + union smc_host_cursor *prod, + union smc_host_cursor *sent, + size_t len) +{ + smc_curs_add(conn->peer_rmbe_size, prod, len); + /* increased in recv tasklet smc_cdc_msg_rcv() */ + smp_mb__before_atomic(); + /* data in flight reduces usable snd_wnd */ + atomic_sub(len, &conn->peer_rmbe_space); + /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ + smp_mb__after_atomic(); + smc_curs_add(conn->sndbuf_size, sent, len); +} + +/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; + * usable snd_wnd as max transmit + */ +static int smc_tx_rdma_writes(struct smc_connection *conn) +{ + size_t src_off, src_len, dst_off, dst_len; /* current chunk values */ + size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk; + union smc_host_cursor sent, prep, prod, cons; + struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; + struct smc_link_group *lgr = conn->lgr; + int to_send, rmbespace; + struct smc_link *link; + int num_sges; + int rc; + + /* source: sndbuf */ + smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); + smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); + /* cf. wmem_alloc - (snd_max - snd_una) */ + to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep); + if (to_send <= 0) + return 0; + + /* destination: RMBE */ + /* cf. snd_wnd */ + rmbespace = atomic_read(&conn->peer_rmbe_space); + if (rmbespace <= 0) + return 0; + smc_curs_write(&prod, + smc_curs_read(&conn->local_tx_ctrl.prod, conn), + conn); + smc_curs_write(&cons, + smc_curs_read(&conn->local_rx_ctrl.cons, conn), + conn); + + /* if usable snd_wnd closes ask peer to advertise once it opens again */ + conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace); + /* cf. 
usable snd_wnd */ + len = min(to_send, rmbespace); + + /* initialize variables for first iteration of subsequent nested loop */ + link = &lgr->lnk[SMC_SINGLE_LINK]; + dst_off = prod.count; + if (prod.wrap == cons.wrap) { + /* the filled destination area is unwrapped, + * hence the available free destination space is wrapped + * and we need 2 destination chunks of sum len; start with 1st + * which is limited by what's available in sndbuf + */ + dst_len = min_t(size_t, + conn->peer_rmbe_size - prod.count, len); + } else { + /* the filled destination area is wrapped, + * hence the available free destination space is unwrapped + * and we need a single destination chunk of entire len + */ + dst_len = len; + } + dst_len_sum = dst_len; + src_off = sent.count; + /* dst_len determines the maximum src_len */ + if (sent.count + dst_len <= conn->sndbuf_size) { + /* unwrapped src case: single chunk of entire dst_len */ + src_len = dst_len; + } else { + /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ + src_len = conn->sndbuf_size - sent.count; + } + src_len_sum = src_len; + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + num_sges = 0; + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + sges[srcchunk].addr = + conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] + + src_off; + sges[srcchunk].length = src_len; + sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; + num_sges++; + src_off += src_len; + if (src_off >= conn->sndbuf_size) + src_off -= conn->sndbuf_size; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); + if (rc) + return rc; + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, + dst_len, conn->sndbuf_size - sent.count); + src_len_sum = src_len; + } + + smc_tx_advance_cursors(conn, &prod, &sent, len); + /* update connection's cursors with advanced local cursors */ + smc_curs_write(&conn->local_tx_ctrl.prod, + smc_curs_read(&prod, conn), + conn); + /* dst: peer RMBE */ + smc_curs_write(&conn->tx_curs_sent, + smc_curs_read(&sent, conn), + conn); + /* src: local sndbuf */ + + return 0; +} + +/* Wakeup sndbuf consumers from any context (IRQ or process) + * since there is more data to transmit; usable snd_wnd as max transmit + */ +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + int rc; + + spin_lock_bh(&conn->send_lock); + rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, + &pend); + if (rc < 0) { + if (rc == -EBUSY) { + struct smc_sock *smc = + container_of(conn, struct smc_sock, conn); + + if (smc->sk.sk_err == ECONNABORTED) { + rc = sock_error(&smc->sk); + goto out_unlock; + } + rc = 0; + schedule_work(&conn->tx_work); + } + goto out_unlock; + } + + rc = smc_tx_rdma_writes(conn); + if (rc) { + smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + (struct smc_wr_tx_pend_priv *)pend); + goto out_unlock; + } + + rc = smc_cdc_msg_send(conn, wr_buf, pend); + +out_unlock: + spin_unlock_bh(&conn->send_lock); + return rc; +} + +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit + */ +static void smc_tx_work(struct work_struct *work) +{ + 
struct smc_connection *conn = container_of(work, + struct smc_connection, + tx_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + lock_sock(&smc->sk); + smc_tx_sndbuf_nonempty(conn); + release_sock(&smc->sk); +} + +void smc_tx_consumer_update(struct smc_connection *conn) +{ + union smc_host_cursor cfed, cons; + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + int to_confirm, rc; + + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + smc_curs_write(&cfed, + smc_curs_read(&conn->rx_curs_confirmed, conn), + conn); + to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons); + + if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || + ((to_confirm > conn->rmbe_update_limit) && + ((to_confirm > (conn->rmbe_size / 2)) || + conn->local_rx_ctrl.prod_flags.write_blocked))) { + rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + &wr_buf, &pend); + if (!rc) + rc = smc_cdc_msg_send(conn, wr_buf, pend); + if (rc < 0) { + schedule_work(&conn->tx_work); + return; + } + smc_curs_write(&conn->rx_curs_confirmed, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + } + if (conn->local_rx_ctrl.prod_flags.write_blocked && + !atomic_read(&conn->bytes_to_rcv)) + conn->local_rx_ctrl.prod_flags.write_blocked = 0; +} + +/***************************** send initialize *******************************/ + +/* Initialize send properties on connection establishment. NB: not __init! */ +void smc_tx_init(struct smc_sock *smc) +{ + smc->sk.sk_write_space = smc_tx_write_space; + INIT_WORK(&smc->conn.tx_work, smc_tx_work); + spin_lock_init(&smc->conn.send_lock); +} diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h new file mode 100644 index 000000000000..1d6a0dcdcfe6 --- /dev/null +++ b/net/smc/smc_tx.h @@ -0,0 +1,35 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage send buffer + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_TX_H +#define SMC_TX_H + +#include <linux/socket.h> +#include <linux/types.h> + +#include "smc.h" +#include "smc_cdc.h" + +static inline int smc_tx_prepared_sends(struct smc_connection *conn) +{ + union smc_host_cursor sent, prep; + + smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); + smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); + return smc_curs_diff(conn->sndbuf_size, &sent, &prep); +} + +void smc_tx_init(struct smc_sock *smc); +int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); +int smc_tx_sndbuf_nonempty(struct smc_connection *conn); +void smc_tx_sndbuf_nonfull(struct smc_sock *smc); +void smc_tx_consumer_update(struct smc_connection *conn); + +#endif /* SMC_TX_H */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c new file mode 100644 index 000000000000..eadf157418dc --- /dev/null +++ b/net/smc/smc_wr.c @@ -0,0 +1,614 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Work Requests exploiting Infiniband API + * + * Work requests (WR) of type ib_post_send or ib_post_recv respectively + * are submitted to either RC SQ or RC RQ respectively + * (reliably connected send/receive queue) + * and become work queue entries (WQEs). + * While an SQ WR/WQE is pending, we track it until transmission completion. + * Through a send or receive completion queue (CQ) respectively, + * we get completion queue entries (CQEs) [aka work completions (WCs)]. 
+ * Since the CQ callback is called from IRQ context, we split work by using + * bottom halves implemented by tasklets. + * + * SMC uses this to exchange LLC (link layer control) + * and CDC (connection data control) messages. + * + * Copyright IBM Corp. 2016 + * + * Author(s): Steffen Maier <maier@linux.vnet.ibm.com> + */ + +#include <linux/atomic.h> +#include <linux/hashtable.h> +#include <linux/wait.h> +#include <rdma/ib_verbs.h> +#include <asm/div64.h> + +#include "smc.h" +#include "smc_wr.h" + +#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */ + +#define SMC_WR_RX_HASH_BITS 4 +static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS); +static DEFINE_SPINLOCK(smc_wr_rx_hash_lock); + +struct smc_wr_tx_pend { /* control data for a pending send request */ + u64 wr_id; /* work request id sent */ + smc_wr_tx_handler handler; + enum ib_wc_status wc_status; /* CQE status */ + struct smc_link *link; + u32 idx; + struct smc_wr_tx_pend_priv priv; +}; + +/******************************** send queue *********************************/ + +/*------------------------------- completion --------------------------------*/ + +static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) +{ + u32 i; + + for (i = 0; i < link->wr_tx_cnt; i++) { + if (link->wr_tx_pends[i].wr_id == wr_id) + return i; + } + return link->wr_tx_cnt; +} + +static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) +{ + struct smc_wr_tx_pend pnd_snd; + struct smc_link *link; + u32 pnd_snd_idx; + int i; + + link = wc->qp->qp_context; + pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); + if (pnd_snd_idx == link->wr_tx_cnt) + return; + link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[pnd_snd_idx], 0, + sizeof(link->wr_tx_pends[pnd_snd_idx])); + memset(&link->wr_tx_bufs[pnd_snd_idx], 0, + sizeof(link->wr_tx_bufs[pnd_snd_idx])); + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) + return; + if (wc->status) { + struct smc_link_group *lgr; + + for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { + /* clear full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[i], 0, + sizeof(link->wr_tx_pends[i])); + memset(&link->wr_tx_bufs[i], 0, + sizeof(link->wr_tx_bufs[i])); + clear_bit(i, link->wr_tx_mask); + } + /* terminate connections of this link group abnormally */ + lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + smc_lgr_terminate(lgr); + } + if (pnd_snd.handler) + pnd_snd.handler(&pnd_snd.priv, link, wc->status); + wake_up(&link->wr_tx_wait); +} + +static void smc_wr_tx_tasklet_fn(unsigned long data) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int i = 0, rc; + int polled = 0; + +again: + polled++; + do { + rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_send, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; + for (i = 0; i < rc; i++) + smc_wr_tx_process_cqe(&wc[i]); + } while (rc > 0); + if (polled == 1) + goto again; +} + +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->send_tasklet); +} + +/*---------------------------- request submission ---------------------------*/ + +static inline int 
smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) +{ + *idx = link->wr_tx_cnt; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { + if (!test_and_set_bit(*idx, link->wr_tx_mask)) + return 0; + } + *idx = link->wr_tx_cnt; + return -EBUSY; +} + +/** + * smc_wr_tx_get_free_slot() - returns buffer for message assembly, + * and sets info for pending transmit tracking + * @link: Pointer to smc_link used to later send the message. + * @handler: Send completion handler function pointer. + * @wr_buf: Out value returns pointer to message buffer. + * @wr_pend_priv: Out value returns pointer serving as handler context. + * + * Return: 0 on success, or -errno on error. + */ +int smc_wr_tx_get_free_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv) +{ + struct smc_wr_tx_pend *wr_pend; + struct ib_send_wr *wr_ib; + u64 wr_id; + u32 idx; + int rc; + + *wr_buf = NULL; + *wr_pend_priv = NULL; + if (in_softirq()) { + rc = smc_wr_tx_get_free_slot_index(link, &idx); + if (rc) + return rc; + } else { + rc = wait_event_interruptible_timeout( + link->wr_tx_wait, + (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), + SMC_WR_TX_WAIT_FREE_SLOT_TIME); + if (!rc) { + /* timeout - terminate connections */ + struct smc_link_group *lgr; + + lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + smc_lgr_terminate(lgr); + return -EPIPE; + } + if (rc == -ERESTARTSYS) + return -EINTR; + if (idx == link->wr_tx_cnt) + return -EPIPE; + } + wr_id = smc_wr_tx_get_next_wr_id(link); + wr_pend = &link->wr_tx_pends[idx]; + wr_pend->wr_id = wr_id; + wr_pend->handler = handler; + wr_pend->link = link; + wr_pend->idx = idx; + wr_ib = &link->wr_tx_ibs[idx]; + wr_ib->wr_id = wr_id; + *wr_buf = &link->wr_tx_bufs[idx]; + *wr_pend_priv = &wr_pend->priv; + return 0; +} + +int smc_wr_tx_put_slot(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv) +{ + struct smc_wr_tx_pend *pend; + + pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv); + if (pend->idx < link->wr_tx_cnt) { + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[pend->idx], 0, + sizeof(link->wr_tx_pends[pend->idx])); + memset(&link->wr_tx_bufs[pend->idx], 0, + sizeof(link->wr_tx_bufs[pend->idx])); + test_and_clear_bit(pend->idx, link->wr_tx_mask); + return 1; + } + + return 0; +} + +/* Send prepared WR slot via ib_post_send. 
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) +{ + struct ib_send_wr *failed_wr = NULL; + struct smc_wr_tx_pend *pend; + int rc; + + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); + pend = container_of(priv, struct smc_wr_tx_pend, priv); + rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], + &failed_wr); + if (rc) + smc_wr_tx_put_slot(link, priv); + return rc; +} + +void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, + smc_wr_tx_dismisser dismisser, + unsigned long data) +{ + struct smc_wr_tx_pend_priv *tx_pend; + struct smc_wr_rx_hdr *wr_rx; + int i; + + for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { + wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i]; + if (wr_rx->type != wr_rx_hdr_type) + continue; + tx_pend = &link->wr_tx_pends[i].priv; + if (filter(tx_pend, data)) + dismisser(tx_pend); + } +} + +bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, unsigned long data) +{ + struct smc_wr_tx_pend_priv *tx_pend; + struct smc_wr_rx_hdr *wr_rx; + int i; + + for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { + wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i]; + if (wr_rx->type != wr_rx_hdr_type) + continue; + tx_pend = &link->wr_tx_pends[i].priv; + if (filter(tx_pend, data)) + return true; + } + return false; +} + +/****************************** receive queue ********************************/ + +int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) +{ + struct smc_wr_rx_handler *h_iter; + int rc = 0; + + spin_lock(&smc_wr_rx_hash_lock); + hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) { + if (h_iter->type == handler->type) { + rc = -EEXIST; + goto out_unlock; + } + } + hash_add(smc_wr_rx_hash, &handler->list, handler->type); +out_unlock: + spin_unlock(&smc_wr_rx_hash_lock); + return rc; +} + +/* Demultiplex a received work request based on the message type to its handler. + * Relies on smc_wr_rx_hash having been completely filled before any IB WRs, + * and not being modified any more afterwards so we don't need to lock it. 
+ */ +static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_wr_rx_handler *handler; + struct smc_wr_rx_hdr *wr_rx; + u64 temp_wr_id; + u32 index; + + if (wc->byte_len < sizeof(*wr_rx)) + return; /* short message */ + temp_wr_id = wc->wr_id; + index = do_div(temp_wr_id, link->wr_rx_cnt); + wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; + hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { + if (handler->type == wr_rx->type) + handler->handler(wc, wr_rx); + } +} + +static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) +{ + struct smc_link *link; + int i; + + for (i = 0; i < num; i++) { + link = wc[i].qp->qp_context; + if (wc[i].status == IB_WC_SUCCESS) { + smc_wr_rx_demultiplex(&wc[i]); + smc_wr_rx_post(link); /* refill WR RX */ + } else { + struct smc_link_group *lgr; + + /* handle status errors */ + switch (wc[i].status) { + case IB_WC_RETRY_EXC_ERR: + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + /* terminate connections of this link group + * abnormally + */ + lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + smc_lgr_terminate(lgr); + break; + default: + smc_wr_rx_post(link); /* refill WR RX */ + break; + } + } + } +} + +static void smc_wr_rx_tasklet_fn(unsigned long data) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int polled = 0; + int rc; + +again: + polled++; + do { + memset(&wc, 0, sizeof(wc)); + rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_recv, + IB_CQ_SOLICITED_MASK + | IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; + smc_wr_rx_process_cqes(&wc[0], rc); + } while (rc > 0); + if (polled == 1) + goto again; +} + +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->recv_tasklet); +} + +int smc_wr_rx_post_init(struct smc_link *link) +{ + u32 i; + int rc = 0; + + for (i = 0; i < link->wr_rx_cnt; i++) + rc = smc_wr_rx_post(link); + return rc; +} + +/***************************** init, exit, misc ******************************/ + +void smc_wr_remember_qp_attr(struct smc_link *lnk) +{ + struct ib_qp_attr *attr = &lnk->qp_attr; + struct ib_qp_init_attr init_attr; + + memset(attr, 0, sizeof(*attr)); + memset(&init_attr, 0, sizeof(init_attr)); + ib_query_qp(lnk->roce_qp, attr, + IB_QP_STATE | + IB_QP_CUR_STATE | + IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_RQ_PSN | + IB_QP_ALT_PATH | + IB_QP_MIN_RNR_TIMER | + IB_QP_SQ_PSN | + IB_QP_PATH_MIG_STATE | + IB_QP_CAP | + IB_QP_DEST_QPN, + &init_attr); + + lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->qp_attr.cap.max_send_wr); + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->qp_attr.cap.max_recv_wr); +} + +static void smc_wr_init_sge(struct smc_link *lnk) +{ + u32 i; + + for (i = 0; i < lnk->wr_tx_cnt; i++) { + lnk->wr_tx_sges[i].addr = + lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; + lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; + lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_ibs[i].next = NULL; + lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; + lnk->wr_tx_ibs[i].num_sge = 1; + lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; + lnk->wr_tx_ibs[i].send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED | 
IB_SEND_INLINE; + } + for (i = 0; i < lnk->wr_rx_cnt; i++) { + lnk->wr_rx_sges[i].addr = + lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; + lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE; + lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_rx_ibs[i].next = NULL; + lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i]; + lnk->wr_rx_ibs[i].num_sge = 1; + } +} + +void smc_wr_free_link(struct smc_link *lnk) +{ + struct ib_device *ibdev; + + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + + if (!lnk->smcibdev) + return; + ibdev = lnk->smcibdev->ibdev; + + if (lnk->wr_rx_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + lnk->wr_rx_dma_addr = 0; + } + if (lnk->wr_tx_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, + DMA_TO_DEVICE); + lnk->wr_tx_dma_addr = 0; + } +} + +void smc_wr_free_link_mem(struct smc_link *lnk) +{ + kfree(lnk->wr_tx_pends); + lnk->wr_tx_pends = NULL; + kfree(lnk->wr_tx_mask); + lnk->wr_tx_mask = NULL; + kfree(lnk->wr_tx_sges); + lnk->wr_tx_sges = NULL; + kfree(lnk->wr_rx_sges); + lnk->wr_rx_sges = NULL; + kfree(lnk->wr_rx_ibs); + lnk->wr_rx_ibs = NULL; + kfree(lnk->wr_tx_ibs); + lnk->wr_tx_ibs = NULL; + kfree(lnk->wr_tx_bufs); + lnk->wr_tx_bufs = NULL; + kfree(lnk->wr_rx_bufs); + lnk->wr_rx_bufs = NULL; +} + +int smc_wr_alloc_link_mem(struct smc_link *link) +{ + /* allocate link related memory */ + link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); + if (!link->wr_tx_bufs) + goto no_mem; + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + GFP_KERNEL); + if (!link->wr_rx_bufs) + goto no_mem_wr_tx_bufs; + link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]), + GFP_KERNEL); + if (!link->wr_tx_ibs) + goto no_mem_wr_rx_bufs; + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + sizeof(link->wr_rx_ibs[0]), + GFP_KERNEL); + if (!link->wr_rx_ibs) + goto no_mem_wr_tx_ibs; + link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), + GFP_KERNEL); + if (!link->wr_tx_sges) + goto no_mem_wr_rx_ibs; + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + sizeof(link->wr_rx_sges[0]), + GFP_KERNEL); + if (!link->wr_rx_sges) + goto no_mem_wr_tx_sges; + link->wr_tx_mask = kzalloc( + BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask), + GFP_KERNEL); + if (!link->wr_tx_mask) + goto no_mem_wr_rx_sges; + link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_pends[0]), + GFP_KERNEL); + if (!link->wr_tx_pends) + goto no_mem_wr_tx_mask; + return 0; + +no_mem_wr_tx_mask: + kfree(link->wr_tx_mask); +no_mem_wr_rx_sges: + kfree(link->wr_rx_sges); +no_mem_wr_tx_sges: + kfree(link->wr_tx_sges); +no_mem_wr_rx_ibs: + kfree(link->wr_rx_ibs); +no_mem_wr_tx_ibs: + kfree(link->wr_tx_ibs); +no_mem_wr_rx_bufs: + kfree(link->wr_rx_bufs); +no_mem_wr_tx_bufs: + kfree(link->wr_tx_bufs); +no_mem: + return -ENOMEM; +} + +void smc_wr_remove_dev(struct smc_ib_device *smcibdev) +{ + tasklet_kill(&smcibdev->recv_tasklet); + tasklet_kill(&smcibdev->send_tasklet); +} + +void smc_wr_add_dev(struct smc_ib_device *smcibdev) +{ + tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn, + (unsigned long)smcibdev); + tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn, + (unsigned long)smcibdev); +} + +int smc_wr_create_link(struct smc_link *lnk) +{ + struct ib_device *ibdev = lnk->smcibdev->ibdev; + int rc = 0; + + smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); + lnk->wr_rx_id = 0; + 
lnk->wr_rx_dma_addr = ib_dma_map_single( + ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { + lnk->wr_rx_dma_addr = 0; + rc = -EIO; + goto out; + } + lnk->wr_tx_dma_addr = ib_dma_map_single( + ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) { + rc = -EIO; + goto dma_unmap; + } + smc_wr_init_sge(lnk); + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + return rc; + +dma_unmap: + ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + lnk->wr_rx_dma_addr = 0; +out: + return rc; +} diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h new file mode 100644 index 000000000000..0b9beeda6053 --- /dev/null +++ b/net/smc/smc_wr.h @@ -0,0 +1,106 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Work Requests exploiting Infiniband API + * + * Copyright IBM Corp. 2016 + * + * Author(s): Steffen Maier <maier@linux.vnet.ibm.com> + */ + +#ifndef SMC_WR_H +#define SMC_WR_H + +#include <linux/atomic.h> +#include <rdma/ib_verbs.h> +#include <asm/div64.h> + +#include "smc.h" +#include "smc_core.h" + +#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */ +#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ + +#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) +#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ) + +#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ + +#define SMC_WR_TX_PEND_PRIV_SIZE 32 + +struct smc_wr_tx_pend_priv { + u8 priv[SMC_WR_TX_PEND_PRIV_SIZE]; +}; + +typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *, + struct smc_link *, + enum ib_wc_status); + +typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *, + unsigned long); + +typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *); + +struct smc_wr_rx_handler { + struct hlist_node list; /* hash table collision resolution */ + void (*handler)(struct ib_wc *, void *); + u8 type; +}; + +/* Only used by RDMA write WRs. 
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly + */ +static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link) +{ + return atomic_long_inc_return(&link->wr_tx_id); +} + +static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) +{ + atomic_long_set(wr_tx_id, val); +} + +/* post a new receive work request to fill a completed old work request entry */ +static inline int smc_wr_rx_post(struct smc_link *link) +{ + struct ib_recv_wr *bad_recv_wr = NULL; + int rc; + u64 wr_id, temp_wr_id; + u32 index; + + wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */ + temp_wr_id = wr_id; + index = do_div(temp_wr_id, link->wr_rx_cnt); + link->wr_rx_ibs[index].wr_id = wr_id; + rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr); + return rc; +} + +int smc_wr_create_link(struct smc_link *lnk); +int smc_wr_alloc_link_mem(struct smc_link *lnk); +void smc_wr_free_link(struct smc_link *lnk); +void smc_wr_free_link_mem(struct smc_link *lnk); +void smc_wr_remember_qp_attr(struct smc_link *lnk); +void smc_wr_remove_dev(struct smc_ib_device *smcibdev); +void smc_wr_add_dev(struct smc_ib_device *smcibdev); + +int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler, + struct smc_wr_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv); +int smc_wr_tx_put_slot(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_send(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv); +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, unsigned long data); +void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, + smc_wr_tx_dismisser dismisser, + unsigned long data); + +int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); +int smc_wr_rx_post_init(struct smc_link *link); +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); + +#endif /* SMC_WR_H */ diff --git a/net/socket.c b/net/socket.c index 8487bf136e5c..2c1e8677ff2d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -287,7 +287,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static void init_inodecache(void) { sock_inode_cachep = kmem_cache_create("sock_inode_cache", sizeof(struct socket_alloc), @@ -296,9 +296,7 @@ static int init_inodecache(void) SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); - if (sock_inode_cachep == NULL) - return -ENOMEM; - return 0; + BUG_ON(sock_inode_cachep == NULL); } static const struct super_operations sockfs_ops = { @@ -533,11 +531,11 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, return used; } -int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) +static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); - if (!err) { + if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); sock->sk->sk_uid = iattr->ia_uid; @@ -1948,6 +1946,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { + BUILD_BUG_ON(sizeof(struct cmsghdr) != + CMSG_ALIGN(sizeof(struct cmsghdr))); if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) @@ -2228,8 +2228,10 @@ int __sys_recvmmsg(int 
fd, struct mmsghdr __user *mmsg, unsigned int vlen, return err; err = sock_error(sock->sk); - if (err) + if (err) { + datagrams = err; goto out_put; + } entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index cdeb1d814833..4f16953e4954 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -763,7 +763,7 @@ err_put_ctx: err: kfree(buf); out: - dprintk("RPC: %s returning %Zd\n", __func__, err); + dprintk("RPC: %s returning %zd\n", __func__, err); return err; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index dc6fb79a361f..25d9a9cf7b66 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -260,7 +260,7 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, if (!oa->data) return -ENOMEM; - creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL); + creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL); if (!creds) { kfree(oa->data); return -ENOMEM; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 886e9d381771..a54a7a3d28f5 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; - rsci->h.expiry_time = get_seconds(); - set_bit(CACHE_NEGATIVE, &rsci->h.flags); + /* Delete the entry from the cache_list and call cache_put */ + sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); if (resv->iov_len + 4 > PAGE_SIZE) goto drop; svc_putnl(resv, RPC_SUCCESS); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 8147e8d56eb2..d8639da06d9c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) cache_purge(cd); spin_lock(&cache_list_lock); write_lock(&cd->hash_lock); - if (cd->entries) { - write_unlock(&cd->hash_lock); - spin_unlock(&cache_list_lock); - goto out; - } if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); @@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) /* module must be being unloaded so its safe to kill the worker */ cancel_delayed_work_sync(&cache_cleaner); } - return; -out: - printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name); } EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); @@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { - time_t now = seconds_since_boot(); - if (detail->flush_time >= now) - now = detail->flush_time + 1; - /* 'now' is the maximum value any 'last_refresh' can have */ - detail->flush_time = now; - detail->nextcheck = seconds_since_boot(); - cache_flush(); + struct cache_head *ch = NULL; + struct hlist_head *head = NULL; + struct hlist_node *tmp = NULL; + int i = 0; + + write_lock(&detail->hash_lock); + if (!detail->entries) { + write_unlock(&detail->hash_lock); + return; + } + + dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name); + for (i = 0; i < detail->hash_size; i++) { + head = &detail->hash_table[i]; + hlist_for_each_entry_safe(ch, tmp, head, cache_list) { + hlist_del_init(&ch->cache_list); + detail->entries--; + + set_bit(CACHE_CLEANED, &ch->flags); + write_unlock(&detail->hash_lock); + cache_fresh_unlocked(ch, detail); + cache_put(ch, detail); + write_lock(&detail->hash_lock); + } + } + 
write_unlock(&detail->hash_lock); } EXPORT_SYMBOL_GPL(cache_purge); @@ -1358,7 +1369,7 @@ static int c_show(struct seq_file *m, void *p) ifdebug(CACHE) seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), - atomic_read(&cp->ref.refcount), cp->flags); + kref_read(&cp->ref), cp->flags); cache_get(cp); if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ @@ -1855,3 +1866,15 @@ void sunrpc_cache_unregister_pipefs(struct cache_detail *cd) } EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); +void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) +{ + write_lock(&cd->hash_lock); + if (!hlist_unhashed(&h->cache_list)){ + hlist_del_init(&h->cache_list); + cd->entries--; + write_unlock(&cd->hash_lock); + cache_put(h, cd); + } else + write_unlock(&cd->hash_lock); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_unhash); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 1efbe48e794f..1dc9f3bac099 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -336,6 +336,11 @@ out: static DEFINE_IDA(rpc_clids); +void rpc_cleanup_clids(void) +{ + ida_destroy(&rpc_clids); +} + static int rpc_alloc_clid(struct rpc_clnt *clnt) { int clid; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index d1c330a7953a..c73de181467a 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -119,6 +119,7 @@ out: static void __exit cleanup_sunrpc(void) { + rpc_cleanup_clids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 75f290bddca1..b94efd93d3e4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv) for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; - if (progp->pg_vers[i]->vs_hidden == 0) + if (!progp->pg_vers[i]->vs_hidden) return 1; } } @@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net, if (vers->vs_hidden) continue; + /* + * Don't register a UDP port if we need congestion + * control. + */ + if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP) + continue; + error = __svc_register(net, progp->pg_name, progp->pg_prog, i, family, proto, port); @@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) !(versp = progp->pg_vers[vers])) goto err_bad_vers; + /* + * Some protocol versions (namely NFSv4) require some form of + * congestion control. (See RFC 7530 section 3.1 paragraph 2) + * In other words, UDP is not allowed. We mark those when setting + * up the svc_xprt, and verify that here. + * + * The spec is not very clear about what error should be returned + * when someone tries to access a server that is listening on UDP + * for lower versions. RPC_PROG_MISMATCH seems to be the closest + * fit. 
+ */ + if (versp->vs_need_cong_ctrl && + !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags)) + goto err_bad_vers; + procp = versp->vs_proc + proc; if (proc >= versp->vs_nproc || !procp->pc_func) goto err_bad_proc; @@ -1260,7 +1282,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) return 0; err_short_len: - svc_printk(rqstp, "short len %Zd, dropping request\n", + svc_printk(rqstp, "short len %zd, dropping request\n", argv->iov_len); goto close; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 3bc1d61694cb..7bfe1fb42add 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -490,7 +490,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) svc_xprt_get(xprt); dprintk("svc: transport %p dequeued, inuse=%d\n", - xprt, atomic_read(&xprt->xpt_ref.refcount)); + xprt, kref_read(&xprt->xpt_ref)); } spin_unlock_bh(&pool->sp_lock); out: @@ -799,6 +799,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { dprintk("svc_recv: found XPT_CLOSE\n"); + if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags)) + xprt->xpt_ops->xpo_kill_temp_xprt(xprt); svc_delete_xprt(xprt); /* Leave XPT_BUSY set on the dead xprt: */ goto out; @@ -820,7 +822,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", rqstp, rqstp->rq_pool->sp_id, xprt, - atomic_read(&xprt->xpt_ref.refcount)); + kref_read(&xprt->xpt_ref)); rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) len = svc_deferred_recv(rqstp); @@ -978,7 +980,7 @@ static void svc_age_temp_xprts(unsigned long closure) * through, close it. */ if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) continue; - if (atomic_read(&xprt->xpt_ref.refcount) > 1 || + if (kref_read(&xprt->xpt_ref) > 1 || test_bit(XPT_BUSY, &xprt->xpt_flags)) continue; list_del_init(le); @@ -1020,9 +1022,11 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) le = to_be_closed.next; list_del_init(le); xprt = list_entry(le, struct svc_xprt, xpt_list); - dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); - xprt->xpt_ops->xpo_kill_temp_xprt(xprt); - svc_close_xprt(xprt); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_KILL_TEMP, &xprt->xpt_flags); + dprintk("svc_age_temp_xprts_now: queuing xprt %p for closing\n", + xprt); + svc_xprt_enqueue(xprt); } } EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index e112da8005b5..bb8db3cb8032 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -126,13 +126,18 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister); static struct hlist_head auth_domain_table[DN_HASHMAX]; static DEFINE_SPINLOCK(auth_domain_lock); +static void auth_domain_release(struct kref *kref) +{ + struct auth_domain *dom = container_of(kref, struct auth_domain, ref); + + hlist_del(&dom->hash); + dom->flavour->domain_release(dom); + spin_unlock(&auth_domain_lock); +} + void auth_domain_put(struct auth_domain *dom) { - if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) { - hlist_del(&dom->hash); - dom->flavour->domain_release(dom); - spin_unlock(&auth_domain_lock); - } + kref_put_lock(&dom->ref, auth_domain_release, &auth_domain_lock); } EXPORT_SYMBOL_GPL(auth_domain_put); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index de066acdb34e..8931e33b6541 100644 --- a/net/sunrpc/svcsock.c +++ 
b/net/sunrpc/svcsock.c @@ -278,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) rqstp->rq_respages[0], tailoff); out: - dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", + dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n", svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); @@ -346,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", + dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n", svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class, &svsk->sk_xprt, serv); set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 288e35c2d8f4..ff1df40f0d26 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -4,6 +4,7 @@ * Support for backward direction RPCs on RPC/RDMA (server-side). */ +#include <linux/module.h> #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" @@ -200,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + __be32 *p; int rc; /* Space in the send buffer for an RPC/RDMA header is reserved * via xprt->tsh_size. */ - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); - headerp->rm_type = rdma_msg; - headerp->rm_body.rm_chunks[0] = xdr_zero; - headerp->rm_body.rm_chunks[1] = xdr_zero; - headerp->rm_body.rm_chunks[2] = xdr_zero; + p = rqst->rq_buffer; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 0ba9887f3e22..1c4aabf0f657 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016 Oracle. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -47,102 +48,43 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* - * Decodes a read chunk list. The expected format is as follows: - * descrim : xdr_one - * position : __be32 offset into XDR stream - * handle : __be32 RKEY - * . . . 
- * end-of-list: xdr_zero - */ -static __be32 *decode_read_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_read_list(__be32 *p, __be32 *end) { - struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + __be32 *next; - while (ch->rc_discrim != xdr_zero) { - if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > - (unsigned long)vaend) { - dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + while (*p++ != xdr_zero) { + next = p + rpcrdma_readchunk_maxsz - 1; + if (next > end) return NULL; - } - ch++; + p = next; } - return &ch->rc_position; + return p; } -/* - * Decodes a write chunk list. The expected format is as follows: - * descrim : xdr_one - * nchunks : <count> - * handle : __be32 RKEY ---+ - * length : __be32 <len of segment> | - * offset : remove va + <count> - * . . . | - * ---+ - */ -static __be32 *decode_write_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_write_list(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; + __be32 *next; - /* Check for not write-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + while (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - /* - * rs_length is the 2nd 4B field in wc_target and taking its - * address skips the list terminator - */ - return &ary->wc_array[nchunks].wc_target.rs_length; + return p; } -static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; - - /* Check for no reply-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + __be32 *next; + + if (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - return (__be32 *)&ary->wc_array[nchunks]; + return p; } /** @@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) */ int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { - struct rpcrdma_msg *rmsgp; - __be32 *va, *vaend; - unsigned int len; - u32 hdr_len; + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { - dprintk("svcrdma: header too short = %d\n", - 
rq_arg->len); - return -EINVAL; - } + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; - rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; - if (rmsgp->rm_vers != rpcrdma_version) { - dprintk("%s: bad version %u\n", __func__, - be32_to_cpu(rmsgp->rm_vers)); - return -EPROTONOSUPPORT; - } + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; - switch (be32_to_cpu(rmsgp->rm_type)) { - case RDMA_MSG: - case RDMA_NOMSG: + switch (*(rdma_argp + 3)) { + case rdma_msg: + case rdma_nomsg: break; - case RDMA_DONE: - /* Just drop it */ - dprintk("svcrdma: dropping RDMA_DONE message\n"); - return 0; - - case RDMA_ERROR: - /* Possible if this is a backchannel reply. - * XXX: We should cancel this XID, though. - */ - dprintk("svcrdma: dropping RDMA_ERROR message\n"); - return 0; - - case RDMA_MSGP: - /* Pull in the extra for the padded case, bump our pointer */ - rmsgp->rm_body.rm_padded.rm_align = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align); - rmsgp->rm_body.rm_padded.rm_thresh = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); - - va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rq_arg->head[0].iov_base = va; - len = (u32)((unsigned long)va - (unsigned long)rmsgp); - rq_arg->head[0].iov_len -= len; - if (len > rq_arg->len) - return -EINVAL; - return len; - default: - dprintk("svcrdma: bad rdma procedure (%u)\n", - be32_to_cpu(rmsgp->rm_type)); - return -EINVAL; - } + case rdma_done: + goto out_drop; - /* The chunk list may contain either a read chunk list or a write - * chunk list and a reply chunk list. - */ - va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); - va = decode_read_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode read list\n"); - return -EINVAL; - } - va = decode_write_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode write list\n"); - return -EINVAL; - } - va = decode_reply_array(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode reply chunk\n"); - return -EINVAL; + case rdma_error: + goto out_drop; + + default: + goto out_proc; } - rq_arg->head[0].iov_base = va; - hdr_len = (unsigned long)va - (unsigned long)rmsgp; + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; rq_arg->head[0].iov_len -= hdr_len; return hdr_len; + +out_short: + dprintk("svcrdma: header too short = %d\n", rq_arg->len); + return -EINVAL; + +out_version: + dprintk("svcrdma: bad xprt version: %u\n", + be32_to_cpup(rdma_argp + 1)); + return -EPROTONOSUPPORT; + +out_drop: + dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + return 0; + +out_proc: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpup(rdma_argp + 3)); + return -EINVAL; + +out_inval: + dprintk("svcrdma: failed to parse transport header\n"); + return -EINVAL; } int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, @@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, *va++ = rmsgp->rm_xid; *va++ = rmsgp->rm_vers; - *va++ = cpu_to_be32(xprt->sc_max_requests); + *va++ = xprt->sc_fc_credits; *va++ = rdma_error; *va++ = cpu_to_be32(err); if (err == ERR_VERS) { @@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma 
*xprt, return (int)((unsigned long)va - (unsigned long)startp); } -int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +/** + * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header + * @rdma_resp: buffer containing Reply transport header + * + * Returns length of transport header, in bytes. + */ +unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) { - struct rpcrdma_write_array *wr_ary; + unsigned int nsegs; + __be32 *p; - /* There is no read-list in a reply */ + p = rdma_resp; - /* skip write list */ - wr_ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. - wc_target.rs_length; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - /* skip reply array */ - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - return (unsigned long) wr_ary - (unsigned long) rmsgp; + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; } void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) @@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, seg->rs_offset = rs_offset; seg->rs_length = cpu_to_be32(write_len); } - -void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rdma_argp, - struct rpcrdma_msg *rdma_resp, - enum rpcrdma_proc rdma_type) -{ - rdma_resp->rm_xid = rdma_argp->rm_xid; - rdma_resp->rm_vers = rdma_argp->rm_vers; - rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests); - rdma_resp->rm_type = cpu_to_be32(rdma_type); - - /* Encode <nul> chunks lists */ - rdma_resp->rm_body.rm_chunks[0] = xdr_zero; - rdma_resp->rm_body.rm_chunks[1] = xdr_zero; - rdma_resp->rm_body.rm_chunks[2] = xdr_zero; -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 57d35fbb1c28..f7b2daf72a86 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -347,8 +347,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, atomic_inc(&rdma_stat_read); return ret; err: - ib_dma_unmap_sg(xprt->sc_cm_id->device, - frmr->sg, frmr->sg_nents, frmr->direction); svc_rdma_put_context(ctxt, 0); svc_rdma_put_frmr(xprt, frmr); return ret; @@ -608,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dprintk("svcrdma: rqstp=%p\n", rqstp); - spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_lock(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { - ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); rdma_read_complete(rqstp, ctxt); goto complete; } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { - ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - 
dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); } else { atomic_inc(&rdma_stat_rq_starve); clear_bit(XPT_DATA, &xprt->xpt_flags); ctxt = NULL; } - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); if (!ctxt) { /* This is the EAGAIN path. The svc_recv routine will * return -EAGAIN, the nfsd thread will go to call into diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index ad4d286a83c5..515221b16d09 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].length = + svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, ctxt->sge[0].length, DMA_TO_DEVICE); @@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct rpcrdma_msg *rdma_argp; struct rpcrdma_msg *rdma_resp; struct rpcrdma_write_array *wr_ary, *rp_ary; - enum rpcrdma_proc reply_type; int ret; int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; u32 inv_rkey; + __be32 *p; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!res_page) goto err0; rdma_resp = page_address(res_page); - if (rp_ary) - reply_type = RDMA_NOMSG; - else - reply_type = RDMA_MSG; - svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, - rdma_resp, reply_type); + + p = &rdma_resp->rm_xid; + *p++ = rdma_argp->rm_xid; + *p++ = rdma_argp->rm_vers; + *p++ = rdma->sc_fc_credits; + *p++ = rp_ary ? rdma_nomsg : rdma_msg; + + /* Start with empty chunks */ + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; /* Send any write-chunk data and build resp write-list */ if (wr_ary) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index ca2799af05a6..c13a5c35ce14 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, ctxt = kmalloc(sizeof(*ctxt), flags); if (ctxt) { ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->free); - INIT_LIST_HEAD(&ctxt->dto_q); + INIT_LIST_HEAD(&ctxt->list); } return ctxt; } @@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) dprintk("svcrdma: No memory for RDMA ctxt\n"); return false; } - list_add(&ctxt->free, &xprt->sc_ctxts); + list_add(&ctxt->list, &xprt->sc_ctxts); } return true; } @@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used++; if (list_empty(&xprt->sc_ctxts)) goto out_empty; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del_init(&ctxt->free); - spin_unlock_bh(&xprt->sc_ctxt_lock); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&xprt->sc_ctxt_lock); out: ctxt->count = 0; @@ -209,15 +208,15 @@ out_empty: /* Either pre-allocation missed the mark, or send * queue accounting is broken. 
*/ - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); ctxt = alloc_ctxt(xprt, GFP_NOIO); if (ctxt) goto out; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); return NULL; } @@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - list_add(&ctxt->free, &xprt->sc_ctxts); - spin_unlock_bh(&xprt->sc_ctxt_lock); + list_add(&ctxt->list, &xprt->sc_ctxts); + spin_unlock(&xprt->sc_ctxt_lock); } static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) @@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del(&ctxt->free); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); kfree(ctxt); } } @@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* All wc fields are now known to be valid */ ctxt->byte_len = wc->byte_len; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q); spin_unlock(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); @@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc) read_hdr = ctxt->read_hdr; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&read_hdr->dto_q, + list_add_tail(&read_hdr->list, &xprt->sc_read_complete_q); spin_unlock(&xprt->sc_rq_dto_lock); @@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return NULL; svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); - INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); @@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_map_lock); + /* + * Note that this implies that the underlying transport support + * has some form of congestion control (see RFC 7530 section 3.1 + * paragraph 2). For now, we assume that all supported RDMA + * transports are suitable here. 
+ */ + set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags); + if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) { struct svc_rdma_fastreg_mr *frmr = NULL; - spin_lock_bh(&rdma->sc_frmr_q_lock); + spin_lock(&rdma->sc_frmr_q_lock); if (!list_empty(&rdma->sc_frmr_q)) { frmr = list_entry(rdma->sc_frmr_q.next, struct svc_rdma_fastreg_mr, frmr_list); list_del_init(&frmr->frmr_list); frmr->sg_nents = 0; } - spin_unlock_bh(&rdma->sc_frmr_q_lock); + spin_unlock(&rdma->sc_frmr_q_lock); if (frmr) return frmr; @@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, if (frmr) { ib_dma_unmap_sg(rdma->sc_cm_id->device, frmr->sg, frmr->sg_nents, frmr->direction); - spin_lock_bh(&rdma->sc_frmr_q_lock); + spin_lock(&rdma->sc_frmr_q_lock); WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); - spin_unlock_bh(&rdma->sc_frmr_q_lock); + spin_unlock(&rdma->sc_frmr_q_lock); } } @@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_requests); + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_bc_requests); newxprt->sc_rq_depth = newxprt->sc_max_requests + @@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_sq_cq)) { dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); goto errout; @@ -1201,9 +1208,9 @@ static void __svc_rdma_free(struct work_struct *work) ib_drain_qp(rdma->sc_qp); /* We should only be called from kref_put */ - if (atomic_read(&xprt->xpt_ref.refcount) != 0) + if (kref_read(&xprt->xpt_ref) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&xprt->xpt_ref.refcount)); + kref_read(&xprt->xpt_ref)); /* * Destroy queued, but not processed read completions. 
Note @@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work) */ while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(rdma->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } /* Destroy queued, but not processed recv completions */ while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(rdma->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index af392d9b9cec..956c7bce80d1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1188,7 +1188,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r char *p; len = sizeof(transport->tcp_xid) - transport->tcp_offset; - dprintk("RPC: reading XID (%Zu bytes)\n", len); + dprintk("RPC: reading XID (%zu bytes)\n", len); p = ((char *) &transport->tcp_xid) + transport->tcp_offset; used = xdr_skb_read_bits(desc, p, len); transport->tcp_offset += used; @@ -1219,7 +1219,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport, */ offset = transport->tcp_offset - sizeof(transport->tcp_xid); len = sizeof(transport->tcp_calldir) - offset; - dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len); + dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len); p = ((char *) &transport->tcp_calldir) + offset; used = xdr_skb_read_bits(desc, p, len); transport->tcp_offset += used; @@ -1310,7 +1310,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt, return; } - dprintk("RPC: XID %08x read %Zd bytes\n", + dprintk("RPC: XID %08x read %zd bytes\n", ntohl(transport->tcp_xid), r); dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " "tcp_reclen = %u\n", xprt, transport->tcp_copied, @@ -1456,7 +1456,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_s desc->count -= len; desc->offset += len; transport->tcp_offset += len; - dprintk("RPC: discarded %Zu bytes\n", len); + dprintk("RPC: discarded %zu bytes\n", len); xs_tcp_check_fraghdr(transport); } diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index aa1babbea385..7d99029df342 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -1,7 +1,7 @@ /* * net/tipc/bcast.c: TIPC broadcast code * - * Copyright (c) 2004-2006, 2014-2015, Ericsson AB + * Copyright (c) 2004-2006, 2014-2016, Ericsson AB * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. 
@@ -39,9 +39,8 @@ #include "socket.h" #include "msg.h" #include "bcast.h" -#include "name_distr.h" #include "link.h" -#include "node.h" +#include "name_table.h" #define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ #define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ @@ -54,12 +53,20 @@ const char tipc_bclink_name[] = "broadcast-link"; * @inputq: data input queue; will only carry SOCK_WAKEUP messages * @dest: array keeping number of reachable destinations per bearer * @primary_bearer: a bearer having links to all broadcast destinations, if any + * @bcast_support: indicates if primary bearer, if any, supports broadcast + * @rcast_support: indicates if all peer nodes support replicast + * @rc_ratio: dest count as percentage of cluster size where send method changes + * @bc_threshold: calculated drom rc_ratio; if dests > threshold use broadcast */ struct tipc_bc_base { struct tipc_link *link; struct sk_buff_head inputq; int dests[MAX_BEARERS]; int primary_bearer; + bool bcast_support; + bool rcast_support; + int rc_ratio; + int bc_threshold; }; static struct tipc_bc_base *tipc_bc_base(struct net *net) @@ -69,7 +76,20 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net) int tipc_bcast_get_mtu(struct net *net) { - return tipc_link_mtu(tipc_bc_sndlink(net)); + return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE; +} + +void tipc_bcast_disable_rcast(struct net *net) +{ + tipc_bc_base(net)->rcast_support = false; +} + +static void tipc_bcbase_calc_bc_threshold(struct net *net) +{ + struct tipc_bc_base *bb = tipc_bc_base(net); + int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net)); + + bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100); } /* tipc_bcbase_select_primary(): find a bearer with links to all destinations, @@ -79,9 +99,10 @@ static void tipc_bcbase_select_primary(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); int all_dests = tipc_link_bc_peers(bb->link); - int i, mtu; + int i, mtu, prim; bb->primary_bearer = INVALID_BEARER_ID; + bb->bcast_support = true; if (!all_dests) return; @@ -93,7 +114,7 @@ static void tipc_bcbase_select_primary(struct net *net) mtu = tipc_bearer_mtu(net, i); if (mtu < tipc_link_mtu(bb->link)) tipc_link_set_mtu(bb->link, mtu); - + bb->bcast_support &= tipc_bearer_bcast_support(net, i); if (bb->dests[i] < all_dests) continue; @@ -103,6 +124,9 @@ static void tipc_bcbase_select_primary(struct net *net) if ((i ^ tipc_own_addr(net)) & 1) break; } + prim = bb->primary_bearer; + if (prim != INVALID_BEARER_ID) + bb->bcast_support = tipc_bearer_bcast_support(net, prim); } void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id) @@ -170,45 +194,131 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq) __skb_queue_purge(&_xmitq); } -/* tipc_bcast_xmit - deliver buffer chain to all nodes in cluster - * and to identified node local sockets +static void tipc_bcast_select_xmit_method(struct net *net, int dests, + struct tipc_mc_method *method) +{ + struct tipc_bc_base *bb = tipc_bc_base(net); + unsigned long exp = method->expires; + + /* Broadcast supported by used bearer/bearers? */ + if (!bb->bcast_support) { + method->rcast = true; + return; + } + /* Any destinations which don't support replicast ? */ + if (!bb->rcast_support) { + method->rcast = false; + return; + } + /* Can current method be changed ? 
*/ + method->expires = jiffies + TIPC_METHOD_EXPIRE; + if (method->mandatory || time_before(jiffies, exp)) + return; + + /* Determine method to use now */ + method->rcast = dests <= bb->bc_threshold; +} + +/* tipc_bcast_xmit - broadcast the buffer chain to all external nodes * @net: the applicable net namespace - * @list: chain of buffers containing message - * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + * @pkts: chain of buffers containing message + * @cong_link_cnt: set to 1 if broadcast link is congested, otherwise 0 + * Consumes the buffer chain. + * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE */ -int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) +static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, + u16 *cong_link_cnt) { struct tipc_link *l = tipc_bc_sndlink(net); - struct sk_buff_head xmitq, inputq, rcvq; + struct sk_buff_head xmitq; int rc = 0; - __skb_queue_head_init(&rcvq); __skb_queue_head_init(&xmitq); - skb_queue_head_init(&inputq); - - /* Prepare message clone for local node */ - if (unlikely(!tipc_msg_reassemble(list, &rcvq))) - return -EHOSTUNREACH; - tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) - rc = tipc_link_xmit(l, list, &xmitq); + rc = tipc_link_xmit(l, pkts, &xmitq); tipc_bcast_unlock(net); - - /* Don't send to local node if adding to link failed */ - if (unlikely(rc)) { - __skb_queue_purge(&rcvq); - return rc; + tipc_bcbase_xmit(net, &xmitq); + __skb_queue_purge(pkts); + if (rc == -ELINKCONG) { + *cong_link_cnt = 1; + rc = 0; } + return rc; +} - /* Broadcast to all nodes, inluding local node */ - tipc_bcbase_xmit(net, &xmitq); - tipc_sk_mcast_rcv(net, &rcvq, &inputq); - __skb_queue_purge(list); +/* tipc_rcast_xmit - replicate and send a message to given destination nodes + * @net: the applicable net namespace + * @pkts: chain of buffers containing message + * @dests: list of destination nodes + * @cong_link_cnt: returns number of congested links + * @cong_links: returns identities of congested links + * Returns 0 if success, otherwise errno + */ +static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, + struct tipc_nlist *dests, u16 *cong_link_cnt) +{ + struct sk_buff_head _pkts; + struct u32_item *n, *tmp; + u32 dst, selector; + + selector = msg_link_selector(buf_msg(skb_peek(pkts))); + __skb_queue_head_init(&_pkts); + + list_for_each_entry_safe(n, tmp, &dests->list, list) { + dst = n->value; + if (!tipc_msg_pskb_copy(dst, pkts, &_pkts)) + return -ENOMEM; + + /* Any other return value than -ELINKCONG is ignored */ + if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG) + (*cong_link_cnt)++; + } return 0; } +/* tipc_mcast_xmit - deliver message to indicated destination nodes + * and to identified node local sockets + * @net: the applicable net namespace + * @pkts: chain of buffers containing message + * @method: send method to be used + * @dests: destination nodes for message. + * @cong_link_cnt: returns number of encountered congested destination links + * Consumes buffer chain. 
+ * Returns 0 if success, otherwise errno + */ +int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, + struct tipc_mc_method *method, struct tipc_nlist *dests, + u16 *cong_link_cnt) +{ + struct sk_buff_head inputq, localq; + int rc = 0; + + skb_queue_head_init(&inputq); + skb_queue_head_init(&localq); + + /* Clone packets before they are consumed by next call */ + if (dests->local && !tipc_msg_reassemble(pkts, &localq)) { + rc = -ENOMEM; + goto exit; + } + /* Send according to determined transmit method */ + if (dests->remote) { + tipc_bcast_select_xmit_method(net, dests->remote, method); + if (method->rcast) + rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); + else + rc = tipc_bcast_xmit(net, pkts, cong_link_cnt); + } + + if (dests->local) + tipc_sk_mcast_rcv(net, &localq, &inputq); +exit: + /* This queue should normally be empty by now */ + __skb_queue_purge(pkts); + return rc; +} + /* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link * * RCU is locked, no other locks set @@ -313,6 +423,7 @@ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l, tipc_bcast_lock(net); tipc_link_add_bc_peer(snd_l, uc_l, xmitq); tipc_bcbase_select_primary(net); + tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); } @@ -331,6 +442,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) tipc_bcast_lock(net); tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq); tipc_bcbase_select_primary(net); + tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); @@ -413,6 +525,8 @@ int tipc_bcast_init(struct net *net) goto enomem; bb->link = l; tn->bcl = l; + bb->rc_ratio = 25; + bb->rcast_support = true; return 0; enomem: kfree(bb); @@ -428,3 +542,33 @@ void tipc_bcast_stop(struct net *net) kfree(tn->bcbase); kfree(tn->bcl); } + +void tipc_nlist_init(struct tipc_nlist *nl, u32 self) +{ + memset(nl, 0, sizeof(*nl)); + INIT_LIST_HEAD(&nl->list); + nl->self = self; +} + +void tipc_nlist_add(struct tipc_nlist *nl, u32 node) +{ + if (node == nl->self) + nl->local = true; + else if (u32_push(&nl->list, node)) + nl->remote++; +} + +void tipc_nlist_del(struct tipc_nlist *nl, u32 node) +{ + if (node == nl->self) + nl->local = false; + else if (u32_del(&nl->list, node)) + nl->remote--; +} + +void tipc_nlist_purge(struct tipc_nlist *nl) +{ + u32_list_purge(&nl->list); + nl->remote = 0; + nl->local = 0; +} diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 855d53c64ab3..751530ab0c49 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -42,9 +42,35 @@ struct tipc_node; struct tipc_msg; struct tipc_nl_msg; -struct tipc_node_map; +struct tipc_nlist; +struct tipc_nitem; extern const char tipc_bclink_name[]; +#define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000) + +struct tipc_nlist { + struct list_head list; + u32 self; + u16 remote; + bool local; +}; + +void tipc_nlist_init(struct tipc_nlist *nl, u32 self); +void tipc_nlist_purge(struct tipc_nlist *nl); +void tipc_nlist_add(struct tipc_nlist *nl, u32 node); +void tipc_nlist_del(struct tipc_nlist *nl, u32 node); + +/* Cookie to be used between socket and broadcast layer + * @rcast: replicast (instead of broadcast) was used at previous xmit + * @mandatory: broadcast/replicast indication was set by user + * @expires: re-evaluate non-mandatory transmit method if we are past this + */ +struct tipc_mc_method { + bool rcast; + bool mandatory; + unsigned long expires; +}; + int tipc_bcast_init(struct net *net); void tipc_bcast_stop(struct net *net); void tipc_bcast_add_peer(struct net 
*net, struct tipc_link *l, @@ -53,7 +79,10 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); int tipc_bcast_get_mtu(struct net *net); -int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list); +void tipc_bcast_disable_rcast(struct net *net); +int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, + struct tipc_mc_method *method, struct tipc_nlist *dests, + u16 *cong_link_cnt); int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 52d74760fb68..33a5bdfbef76 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -431,7 +431,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len); b->bcast_addr.media_id = b->media->type_id; - b->bcast_addr.broadcast = 1; + b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; b->mtu = dev->mtu; b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr); rcu_assign_pointer(dev->tipc_ptr, b); @@ -482,6 +482,19 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, return 0; } +bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id) +{ + bool supp = false; + struct tipc_bearer *b; + + rcu_read_lock(); + b = bearer_get(net, bearer_id); + if (b) + supp = (b->bcast_addr.broadcast == TIPC_BROADCAST_SUPPORT); + rcu_read_unlock(); + return supp; +} + int tipc_bearer_mtu(struct net *net, u32 bearer_id) { int mtu = 0; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 278ff7f616f9..635c9086e19a 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -60,9 +60,14 @@ #define TIPC_MEDIA_TYPE_IB 2 #define TIPC_MEDIA_TYPE_UDP 3 -/* minimum bearer MTU */ +/* Minimum bearer MTU */ #define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE) +/* Identifiers for distinguishing between broadcast/multicast and replicast + */ +#define TIPC_BROADCAST_SUPPORT 1 +#define TIPC_REPLICAST_SUPPORT 2 + /** * struct tipc_media_addr - destination address used by TIPC bearers * @value: address info (format defined by media) @@ -210,6 +215,7 @@ int tipc_bearer_setup(void); void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); int tipc_bearer_mtu(struct net *net, u32 bearer_id); +bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id); void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct sk_buff *skb, struct tipc_media_addr *dest); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 6b109a808d4c..02462d67d191 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -169,7 +169,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, /* Send response, if necessary */ if (respond && (mtyp == DSC_REQ_MSG)) { - rskb = tipc_buf_acquire(MAX_H_SIZE); + rskb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); if (!rskb) return; tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer); @@ -278,7 +278,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b, req = kmalloc(sizeof(*req), GFP_ATOMIC); if (!req) return -ENOMEM; - req->buf = tipc_buf_acquire(MAX_H_SIZE); + req->buf = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); if (!req->buf) { kfree(req); return -ENOMEM; diff --git a/net/tipc/link.c b/net/tipc/link.c index bda89bf9f4ff..ddd2dd6f77aa 100644 --- a/net/tipc/link.c +++ 
b/net/tipc/link.c @@ -515,6 +515,10 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, if (link_is_bc_sndlink(l)) l->state = LINK_ESTABLISHED; + /* Disable replicast if even a single peer doesn't support it */ + if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST)) + tipc_bcast_disable_rcast(net); + return true; } @@ -776,60 +780,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) /** * link_schedule_user - schedule a message sender for wakeup after congestion - * @link: congested link - * @list: message that was attempted sent + * @l: congested link + * @hdr: header of message that is being sent * Create pseudo msg to send back to user when congestion abates - * Does not consume buffer list */ -static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) +static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) { - struct tipc_msg *msg = buf_msg(skb_peek(list)); - int imp = msg_importance(msg); - u32 oport = msg_origport(msg); - u32 addr = tipc_own_addr(link->net); + u32 dnode = tipc_own_addr(l->net); + u32 dport = msg_origport(hdr); struct sk_buff *skb; - /* This really cannot happen... */ - if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { - pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); - return -ENOBUFS; - } - /* Non-blocking sender: */ - if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending) - return -ELINKCONG; - /* Create and schedule wakeup pseudo message */ skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, - addr, addr, oport, 0, 0); + dnode, l->addr, dport, 0, 0); if (!skb) return -ENOBUFS; - TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); - TIPC_SKB_CB(skb)->chain_imp = imp; - skb_queue_tail(&link->wakeupq, skb); - link->stats.link_congs++; + msg_set_dest_droppable(buf_msg(skb), true); + TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr); + skb_queue_tail(&l->wakeupq, skb); + l->stats.link_congs++; return -ELINKCONG; } /** * link_prepare_wakeup - prepare users for wakeup after congestion - * @link: congested link - * Move a number of waiting users, as permitted by available space in - * the send queue, from link wait queue to node wait queue for wakeup + * @l: congested link + * Wake up a number of waiting users, as permitted by available space + * in the send queue */ void link_prepare_wakeup(struct tipc_link *l) { - int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,}; - int imp, lim; struct sk_buff *skb, *tmp; + int imp, i = 0; skb_queue_walk_safe(&l->wakeupq, skb, tmp) { imp = TIPC_SKB_CB(skb)->chain_imp; - lim = l->backlog[imp].limit; - pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; - if ((pnd[imp] + l->backlog[imp].len) >= lim) + if (l->backlog[imp].len < l->backlog[imp].limit) { + skb_unlink(skb, &l->wakeupq); + skb_queue_tail(l->inputq, skb); + } else if (i++ > 10) { break; - skb_unlink(skb, &l->wakeupq); - skb_queue_tail(l->inputq, skb); + } } } @@ -869,8 +860,7 @@ void tipc_link_reset(struct tipc_link *l) * @list: chain of buffers containing message * @xmitq: returned list of packets to be sent by caller * - * Consumes the buffer chain, except when returning -ELINKCONG, - * since the caller then may want to make more send attempts. + * Consumes the buffer chain. 
* Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted */ @@ -879,7 +869,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, { struct tipc_msg *hdr = buf_msg(skb_peek(list)); unsigned int maxwin = l->window; - unsigned int i, imp = msg_importance(hdr); + int imp = msg_importance(hdr); unsigned int mtu = l->mtu; u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; @@ -888,19 +878,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff *skb, *_skb, *bskb; int pkt_cnt = skb_queue_len(list); + int rc = 0; - /* Match msg importance against this and all higher backlog limits: */ - if (!skb_queue_empty(backlogq)) { - for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { - if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) - return link_schedule_user(l, list); - } - } if (unlikely(msg_size(hdr) > mtu)) { skb_queue_purge(list); return -EMSGSIZE; } + /* Allow oversubscription of one data msg per source at congestion */ + if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { + if (imp == TIPC_SYSTEM_IMPORTANCE) { + pr_warn("%s<%s>, link overflow", link_rst_msg, l->name); + return -ENOBUFS; + } + rc = link_schedule_user(l, hdr); + } + if (pkt_cnt > 1) { l->stats.sent_fragmented++; l->stats.sent_fragments += pkt_cnt; @@ -946,7 +939,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, skb_queue_splice_tail_init(list, backlogq); } l->snd_nxt = seqno; - return 0; + return rc; } void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) @@ -1043,11 +1036,17 @@ int tipc_link_retrans(struct tipc_link *l, u16 from, u16 to, static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq) { - switch (msg_user(buf_msg(skb))) { + struct tipc_msg *hdr = buf_msg(skb); + + switch (msg_user(hdr)) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: + if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) { + skb_queue_tail(l->bc_rcvlink->inputq, skb); + return true; + } case CONN_MANAGER: skb_queue_tail(inputq, skb); return true; @@ -1395,7 +1394,7 @@ tnl: msg_set_seqno(hdr, seqno++); pktlen = msg_size(hdr); msg_set_size(&tnlhdr, pktlen + INT_H_SIZE); - tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE); + tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC); if (!tnlskb) { pr_warn("%sunable to send packet\n", link_co_err); return; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index a22be502f1bd..312ef7de57d7 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -58,12 +58,12 @@ static unsigned int align(unsigned int i) * NOTE: Headroom is reserved to allow prepending of a data link header. * There may also be unrequested tailroom present at the buffer's end. 
*/ -struct sk_buff *tipc_buf_acquire(u32 size) +struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; - skb = alloc_skb_fclone(buf_size, GFP_ATOMIC); + skb = alloc_skb_fclone(buf_size, gfp); if (skb) { skb_reserve(skb, BUF_HEADROOM); skb_put(skb, size); @@ -95,7 +95,7 @@ struct sk_buff *tipc_msg_create(uint user, uint type, struct tipc_msg *msg; struct sk_buff *buf; - buf = tipc_buf_acquire(hdr_sz + data_sz); + buf = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC); if (unlikely(!buf)) return NULL; @@ -261,7 +261,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, /* No fragmentation needed? */ if (likely(msz <= pktmax)) { - skb = tipc_buf_acquire(msz); + skb = tipc_buf_acquire(msz, GFP_KERNEL); if (unlikely(!skb)) return -ENOMEM; skb_orphan(skb); @@ -282,7 +282,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, msg_set_importance(&pkthdr, msg_importance(mhdr)); /* Prepare first fragment */ - skb = tipc_buf_acquire(pktmax); + skb = tipc_buf_acquire(pktmax, GFP_KERNEL); if (!skb) return -ENOMEM; skb_orphan(skb); @@ -313,7 +313,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, pktsz = drem + INT_H_SIZE; else pktsz = pktmax; - skb = tipc_buf_acquire(pktsz); + skb = tipc_buf_acquire(pktsz, GFP_KERNEL); if (!skb) { rc = -ENOMEM; goto error; @@ -448,7 +448,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, if (msz > (max / 2)) return false; - _skb = tipc_buf_acquire(max); + _skb = tipc_buf_acquire(max, GFP_ATOMIC); if (!_skb) return false; @@ -496,7 +496,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) /* Never return SHORT header; expand by replacing buffer if necessary */ if (msg_short(hdr)) { - *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen); + *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC); if (!*skb) goto exit; memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen); @@ -607,6 +607,23 @@ error: return false; } +bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, + struct sk_buff_head *cpy) +{ + struct sk_buff *skb, *_skb; + + skb_queue_walk(msg, skb) { + _skb = pskb_copy(skb, GFP_ATOMIC); + if (!_skb) { + __skb_queue_purge(cpy); + return false; + } + msg_set_destnode(buf_msg(_skb), dst); + __skb_queue_tail(cpy, _skb); + } + return true; +} + /* tipc_skb_queue_sorted(); sort pkt into list according to sequence number * @list: list to be appended to * @seqno: sequence number of buffer to add diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 8d408612ffa4..c843fd2bc48d 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -98,8 +98,6 @@ struct tipc_skb_cb { u32 bytes_read; struct sk_buff *tail; bool validated; - bool wakeup_pending; - u16 chain_sz; u16 chain_imp; u16 ackers; }; @@ -633,14 +631,11 @@ static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) static inline u32 msg_link_selector(struct tipc_msg *m) { + if (msg_user(m) == MSG_FRAGMENTER) + m = (void *)msg_data(m); return msg_bits(m, 4, 0, 1); } -static inline void msg_set_link_selector(struct tipc_msg *m, u32 n) -{ - msg_set_bits(m, 4, 0, 1, n); -} - /* * Word 5 */ @@ -820,7 +815,7 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } -struct sk_buff *tipc_buf_acquire(u32 size); +struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_msg_init(u32 
own_addr, struct tipc_msg *m, u32 user, u32 type, @@ -837,6 +832,8 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); +bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, + struct sk_buff_head *cpy); void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index c1cfd92de17a..23f8899e0f8c 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -69,7 +69,7 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size); + struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); struct tipc_msg *msg; if (buf != NULL) { diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index e190460fe0d3..9be6592e4a6f 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -608,7 +608,7 @@ not_found: * Returns non-zero if any off-node ports overlap */ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct tipc_plist *dports) + u32 limit, struct list_head *dports) { struct name_seq *seq; struct sub_seq *sseq; @@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, info = sseq->info; list_for_each_entry(publ, &info->node_list, node_list) { if (publ->scope <= limit) - tipc_plist_push(dports, publ->ref); + u32_push(dports, publ->ref); } if (info->cluster_list_size != info->node_list_size) @@ -645,6 +645,39 @@ exit: return res; } +/* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes + * - Creates list of nodes that overlap the given multicast address + * - Determines if any node local ports overlap + */ +void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, + u32 upper, u32 domain, + struct tipc_nlist *nodes) +{ + struct sub_seq *sseq, *stop; + struct publication *publ; + struct name_info *info; + struct name_seq *seq; + + rcu_read_lock(); + seq = nametbl_find_seq(net, type); + if (!seq) + goto exit; + + spin_lock_bh(&seq->lock); + sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); + stop = seq->sseqs + seq->first_free; + for (; sseq->lower <= upper && sseq != stop; sseq++) { + info = sseq->info; + list_for_each_entry(publ, &info->zone_list, zone_list) { + if (tipc_in_scope(domain, publ->node)) + tipc_nlist_add(nodes, publ->node); + } + } + spin_unlock_bh(&seq->lock); +exit: + rcu_read_unlock(); +} + /* * tipc_nametbl_publish - add name publication to network name tables */ @@ -1022,40 +1055,79 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void tipc_plist_push(struct tipc_plist *pl, u32 port) +bool u32_find(struct list_head *l, u32 value) { - struct tipc_plist *nl; + struct u32_item *item; - if (likely(!pl->port)) { - pl->port = port; - return; + list_for_each_entry(item, l, list) { + if (item->value == value) + return true; } - if (pl->port == port) - return; - list_for_each_entry(nl, &pl->list, list) { - if (nl->port == port) - return; + return false; +} + +bool u32_push(struct list_head *l, u32 value) +{ + struct u32_item *item; + + list_for_each_entry(item, l, list) { + if (item->value == value) + return false; + } + item = 
kmalloc(sizeof(*item), GFP_ATOMIC); + if (unlikely(!item)) + return false; + + item->value = value; + list_add(&item->list, l); + return true; +} + +u32 u32_pop(struct list_head *l) +{ + struct u32_item *item; + u32 value = 0; + + if (list_empty(l)) + return 0; + item = list_first_entry(l, typeof(*item), list); + value = item->value; + list_del(&item->list); + kfree(item); + return value; +} + +bool u32_del(struct list_head *l, u32 value) +{ + struct u32_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, l, list) { + if (item->value != value) + continue; + list_del(&item->list); + kfree(item); + return true; } - nl = kmalloc(sizeof(*nl), GFP_ATOMIC); - if (nl) { - nl->port = port; - list_add(&nl->list, &pl->list); + return false; +} + +void u32_list_purge(struct list_head *l) +{ + struct u32_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, l, list) { + list_del(&item->list); + kfree(item); } } -u32 tipc_plist_pop(struct tipc_plist *pl) +int u32_list_len(struct list_head *l) { - struct tipc_plist *nl; - u32 port = 0; + struct u32_item *item; + int i = 0; - if (likely(list_empty(&pl->list))) { - port = pl->port; - pl->port = 0; - return port; + list_for_each_entry(item, l, list) { + i++; } - nl = list_first_entry(&pl->list, typeof(*nl), list); - port = nl->port; - list_del(&nl->list); - kfree(nl); - return port; + return i; } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 1524a73830f7..6ebdeb1d84a5 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -39,6 +39,7 @@ struct tipc_subscription; struct tipc_plist; +struct tipc_nlist; /* * TIPC name types reserved for internal TIPC use (both current and planned) @@ -99,7 +100,10 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct tipc_plist *dports); + u32 limit, struct list_head *dports); +void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, + u32 upper, u32 domain, + struct tipc_nlist *nodes); struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, u32 upper, u32 scope, u32 port_ref, u32 key); @@ -116,18 +120,16 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); -struct tipc_plist { +struct u32_item { struct list_head list; - u32 port; + u32 value; }; -static inline void tipc_plist_init(struct tipc_plist *pl) -{ - INIT_LIST_HEAD(&pl->list); - pl->port = 0; -} - -void tipc_plist_push(struct tipc_plist *pl, u32 port); -u32 tipc_plist_pop(struct tipc_plist *pl); +bool u32_push(struct list_head *l, u32 value); +u32 u32_pop(struct list_head *l); +bool u32_find(struct list_head *l, u32 value); +bool u32_del(struct list_head *l, u32 value); +void u32_list_purge(struct list_head *l); +int u32_list_len(struct list_head *l); #endif diff --git a/net/tipc/net.c b/net/tipc/net.c index 28bf4feeb81c..ab8a2d5d1e32 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -110,6 +110,10 @@ int tipc_net_start(struct net *net, u32 addr) char addr_string[16]; tn->own_addr = addr; + + /* Ensure that the new address is visible before we reinit. 
*/ + smp_mb(); + tipc_named_reinit(net); tipc_sk_reinit(net); diff --git a/net/tipc/node.c b/net/tipc/node.c index 9d2f4c2b08ab..4512e83652b1 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -263,6 +263,11 @@ static void tipc_node_write_lock(struct tipc_node *n) write_lock_bh(&n->lock); } +static void tipc_node_write_unlock_fast(struct tipc_node *n) +{ + write_unlock_bh(&n->lock); +} + static void tipc_node_write_unlock(struct tipc_node *n) { struct net *net = n->net; @@ -417,7 +422,7 @@ void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); - tipc_node_write_unlock(n); + tipc_node_write_unlock_fast(n); tipc_node_put(n); } @@ -435,7 +440,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) } tipc_node_write_lock(n); list_del_init(subscr); - tipc_node_write_unlock(n); + tipc_node_write_unlock_fast(n); tipc_node_put(n); } @@ -1167,7 +1172,7 @@ msg_full: * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection - * Consumes the buffer chain, except when returning -ELINKCONG + * Consumes the buffer chain. * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, @@ -1206,10 +1211,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); - if (likely(rc == 0)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); - else if (rc == -ENOBUFS) + if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); + else + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); tipc_node_put(n); @@ -1221,20 +1226,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, * messages, which will not be rejected * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. - * TODO: Return real return value, and let callers use - * tipc_wait_for_sendpkt() where applicable */ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; - int rc; skb_queue_head_init(&head); __skb_queue_tail(&head, skb); - rc = tipc_node_xmit(net, &head, dnode, selector); - if (rc == -ELINKCONG) - kfree_skb(skb); + tipc_node_xmit(net, &head, dnode, selector); return 0; } @@ -1262,6 +1262,19 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb) kfree_skb(skb); } +static void tipc_node_mcast_rcv(struct tipc_node *n) +{ + struct tipc_bclink_entry *be = &n->bc_entry; + + /* 'arrvq' is under inputq2's lock protection */ + spin_lock_bh(&be->inputq2.lock); + spin_lock_bh(&be->inputq1.lock); + skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); + spin_unlock_bh(&be->inputq1.lock); + spin_unlock_bh(&be->inputq2.lock); + tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2); +} + static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, int bearer_id, struct sk_buff_head *xmitq) { @@ -1335,15 +1348,8 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); - /* Deliver. 
'arrvq' is under inputq2's lock protection */ - if (!skb_queue_empty(&be->inputq1)) { - spin_lock_bh(&be->inputq2.lock); - spin_lock_bh(&be->inputq1.lock); - skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); - spin_unlock_bh(&be->inputq1.lock); - spin_unlock_bh(&be->inputq2.lock); - tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2); - } + if (!skb_queue_empty(&be->inputq1)) + tipc_node_mcast_rcv(n); if (rc & TIPC_LINK_DOWN_EVT) { /* Reception reassembly failure => reset all links to peer */ @@ -1499,19 +1505,21 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_node *n; - struct tipc_msg *hdr = buf_msg(skb); - int usr = msg_user(hdr); + struct tipc_msg *hdr; int bearer_id = b->identity; struct tipc_link_entry *le; - u16 bc_ack = msg_bcast_ack(hdr); u32 self = tipc_own_addr(net); - int rc = 0; + int usr, rc = 0; + u16 bc_ack; __skb_queue_head_init(&xmitq); - /* Ensure message is well-formed */ + /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(skb))) goto discard; + hdr = buf_msg(skb); + usr = msg_user(hdr); + bc_ack = msg_bcast_ack(hdr); /* Handle arrival of discovery or broadcast packet */ if (unlikely(msg_non_seq(hdr))) { @@ -1570,6 +1578,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) tipc_named_rcv(net, &n->bc_entry.namedq); + if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) + tipc_node_mcast_rcv(n); + if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); diff --git a/net/tipc/node.h b/net/tipc/node.h index 39ef54c1f2ad..898c22916984 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -47,11 +47,13 @@ enum { TIPC_BCAST_SYNCH = (1 << 1), TIPC_BCAST_STATE_NACK = (1 << 2), - TIPC_BLOCK_FLOWCTL = (1 << 3) + TIPC_BLOCK_FLOWCTL = (1 << 3), + TIPC_BCAST_RCAST = (1 << 4) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ TIPC_BCAST_STATE_NACK | \ + TIPC_BCAST_RCAST | \ TIPC_BLOCK_FLOWCTL) #define INVALID_BEARER_ID -1 diff --git a/net/tipc/server.c b/net/tipc/server.c index 215849ce453d..3cd6402e812c 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -86,12 +86,12 @@ struct outqueue_entry { static void tipc_recv_work(struct work_struct *work); static void tipc_send_work(struct work_struct *work); static void tipc_clean_outqueues(struct tipc_conn *con); -static void tipc_sock_release(struct tipc_conn *con); static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); - struct sockaddr_tipc *saddr = con->server->saddr; + struct tipc_server *s = con->server; + struct sockaddr_tipc *saddr = s->saddr; struct socket *sock = con->sock; struct sock *sk; @@ -103,9 +103,13 @@ static void tipc_conn_kref_release(struct kref *kref) } saddr->scope = -TIPC_NODE_SCOPE; kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); - tipc_sock_release(con); sock_release(sock); con->sock = NULL; + + spin_lock_bh(&s->idr_lock); + idr_remove(&s->conn_idr, con->conid); + s->idr_in_use--; + spin_unlock_bh(&s->idr_lock); } tipc_clean_outqueues(con); @@ -128,8 +132,10 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) spin_lock_bh(&s->idr_lock); con = idr_find(&s->conn_idr, conid); - if (con) + if (con && test_bit(CF_CONNECTED, &con->flags)) conn_get(con); + else + con = NULL; spin_unlock_bh(&s->idr_lock); return con; } @@ -186,26 +192,15 @@ static void tipc_unregister_callbacks(struct tipc_conn *con) 
write_unlock_bh(&sk->sk_callback_lock); } -static void tipc_sock_release(struct tipc_conn *con) -{ - struct tipc_server *s = con->server; - - if (con->conid) - s->tipc_conn_release(con->conid, con->usr_data); - - tipc_unregister_callbacks(con); -} - static void tipc_close_conn(struct tipc_conn *con) { struct tipc_server *s = con->server; if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { + tipc_unregister_callbacks(con); - spin_lock_bh(&s->idr_lock); - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - spin_unlock_bh(&s->idr_lock); + if (con->conid) + s->tipc_conn_release(con->conid, con->usr_data); /* We shouldn't flush pending works as we may be in the * thread. In fact the races with pending rx/tx work structs @@ -458,6 +453,11 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, if (!con) return -EINVAL; + if (!test_bit(CF_CONNECTED, &con->flags)) { + conn_put(con); + return 0; + } + e = tipc_alloc_entry(data, len); if (!e) { conn_put(con); @@ -471,12 +471,8 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, list_add_tail(&e->list, &con->outqueue); spin_unlock_bh(&con->outqueue_lock); - if (test_bit(CF_CONNECTED, &con->flags)) { - if (!queue_work(s->send_wq, &con->swork)) - conn_put(con); - } else { + if (!queue_work(s->send_wq, &con->swork)) conn_put(con); - } return 0; } @@ -500,7 +496,7 @@ static void tipc_send_to_sock(struct tipc_conn *con) int ret; spin_lock_bh(&con->outqueue_lock); - while (1) { + while (test_bit(CF_CONNECTED, &con->flags)) { e = list_entry(con->outqueue.next, struct outqueue_entry, list); if ((struct list_head *) e == &con->outqueue) @@ -623,14 +619,12 @@ int tipc_server_start(struct tipc_server *s) void tipc_server_stop(struct tipc_server *s) { struct tipc_conn *con; - int total = 0; int id; spin_lock_bh(&s->idr_lock); - for (id = 0; total < s->idr_in_use; id++) { + for (id = 0; s->idr_in_use; id++) { con = idr_find(&s->conn_idr, id); if (con) { - total++; spin_unlock_bh(&s->idr_lock); tipc_close_conn(con); spin_lock_bh(&s->idr_lock); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 333c5dae0072..6b09a778cc71 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -67,16 +67,19 @@ enum { * @max_pkt: maximum packet size "hint" used when building messages sent by port * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages + * #cong_links: list of congested links * @publications: list of publications for port + * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime * @probing_state: * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue - * @link_cong: non-zero if owner must sleep because of link congestion + * @cong_link_cnt: number of congested links * @sent_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm * @node: hash table node + * @mc_method: cookie for use between socket and broadcast layer * @rcu: rcu struct for tipc_sock */ struct tipc_sock { @@ -87,13 +90,13 @@ struct tipc_sock { u32 max_pkt; u32 portid; struct tipc_msg phdr; - struct list_head sock_list; + struct list_head cong_links; struct list_head publications; u32 pub_count; uint conn_timeout; atomic_t dupl_rcvcnt; bool probe_unacked; - bool link_cong; + u16 cong_link_cnt; u16 snt_unacked; u16 
snd_win; u16 peer_caps; @@ -101,6 +104,7 @@ struct tipc_sock { u16 rcv_win; struct sockaddr_tipc peer; struct rhash_head node; + struct tipc_mc_method mc_method; struct rcu_head rcu; }; @@ -110,7 +114,6 @@ static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); -static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); static void tipc_sk_timeout(unsigned long data); static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); @@ -119,8 +122,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); -static int __tipc_send_stream(struct socket *sock, struct msghdr *m, - size_t dsz); +static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); static const struct proto_ops packet_ops; @@ -334,6 +336,49 @@ static int tipc_set_sk_state(struct sock *sk, int state) return res; } +static int tipc_sk_sock_err(struct socket *sock, long *timeout) +{ + struct sock *sk = sock->sk; + int err = sock_error(sk); + int typ = sock->type; + + if (err) + return err; + if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) { + if (sk->sk_state == TIPC_DISCONNECTING) + return -EPIPE; + else if (!tipc_sk_connected(sk)) + return -ENOTCONN; + } + if (!*timeout) + return -EAGAIN; + if (signal_pending(current)) + return sock_intr_errno(*timeout); + + return 0; +} + +#define tipc_wait_for_cond(sock_, timeout_, condition_) \ +({ \ + int rc_ = 0; \ + int done_ = 0; \ + \ + while (!(condition_) && !done_) { \ + struct sock *sk_ = sock->sk; \ + DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ + \ + rc_ = tipc_sk_sock_err(sock_, timeout_); \ + if (rc_) \ + break; \ + prepare_to_wait(sk_sleep(sk_), &wait_, \ + TASK_INTERRUPTIBLE); \ + done_ = sk_wait_event(sk_, timeout_, \ + (condition_), &wait_); \ + remove_wait_queue(sk_sleep(sk_), &wait_); \ + } \ + rc_; \ +}) + /** * tipc_sk_create - create a TIPC socket * @net: network namespace (must be default network) @@ -382,10 +427,9 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; INIT_LIST_HEAD(&tsk->publications); + INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; tn = net_generic(sock_net(sk), tipc_net_id); - tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, - NAMED_H_SIZE, 0); /* Finish initializing socket data structures */ sock->ops = ops; @@ -395,6 +439,13 @@ static int tipc_sk_create(struct net *net, struct socket *sock, pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } + + /* Ensure tsk is visible before we read own_addr. 
*/ + smp_mb(); + + tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, + NAMED_H_SIZE, 0); + msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; @@ -432,24 +483,33 @@ static void __tipc_shutdown(struct socket *sock, int error) struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); + long timeout = CONN_TIMEOUT_DEFAULT; u32 dnode = tsk_peer_node(tsk); struct sk_buff *skb; + /* Avoid that hi-prio shutdown msgs bypass msgs in link wakeup queue */ + tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && + !tsk_conn_cong(tsk))); + /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (TIPC_SKB_CB(skb)->bytes_read) { kfree_skb(skb); - } else { - if (!tipc_sk_type_connectionless(sk) && - sk->sk_state != TIPC_DISCONNECTING) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - tipc_node_remove_conn(net, dnode, tsk->portid); - } - tipc_sk_respond(sk, skb, error); + continue; } + if (!tipc_sk_type_connectionless(sk) && + sk->sk_state != TIPC_DISCONNECTING) { + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(net, dnode, tsk->portid); + } + tipc_sk_respond(sk, skb, error); } + + if (tipc_sk_type_connectionless(sk)) + return; + if (sk->sk_state != TIPC_DISCONNECTING) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, @@ -457,10 +517,8 @@ static void __tipc_shutdown(struct socket *sock, int error) tsk->portid, error); if (skb) tipc_node_xmit_skb(net, skb, dnode, tsk->portid); - if (!tipc_sk_type_connectionless(sk)) { - tipc_node_remove_conn(net, dnode, tsk->portid); - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - } + tipc_node_remove_conn(net, dnode, tsk->portid); + tipc_set_sk_state(sk, TIPC_DISCONNECTING); } } @@ -503,7 +561,8 @@ static int tipc_release(struct socket *sock) /* Reject any messages that accumulated in backlog queue */ release_sock(sk); - + u32_list_purge(&tsk->cong_links); + tsk->cong_link_cnt = 0; call_rcu(&tsk->rcu, tipc_sk_callback); sock->sk = NULL; @@ -646,7 +705,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, switch (sk->sk_state) { case TIPC_ESTABLISHED: - if (!tsk->link_cong && !tsk_conn_cong(tsk)) + if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) mask |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: @@ -655,7 +714,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, mask |= (POLLIN | POLLRDNORM); break; case TIPC_OPEN: - if (!tsk->link_cong) + if (!tsk->cong_link_cnt) mask |= POLLOUT; if (tipc_sk_type_connectionless(sk) && (!skb_queue_empty(&sk->sk_receive_queue))) @@ -674,63 +733,60 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, * @sock: socket structure * @seq: destination address * @msg: message to send - * @dsz: total length of message data - * @timeo: timeout to wait for wakeup + * @dlen: length of data to send + * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks * Returns the number of bytes sent on success, or errno */ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, - struct msghdr *msg, size_t dsz, long timeo) + struct msghdr *msg, size_t dlen, long timeout) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - 
struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head pktchain; - struct iov_iter save = msg->msg_iter; - uint mtu; + int mtu = tipc_bcast_get_mtu(net); + struct tipc_mc_method *method = &tsk->mc_method; + u32 domain = addr_domain(net, TIPC_CLUSTER_SCOPE); + struct sk_buff_head pkts; + struct tipc_nlist dsts; int rc; - if (!timeo && tsk->link_cong) - return -ELINKCONG; + /* Block or return if any destination link is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; - msg_set_type(mhdr, TIPC_MCAST_MSG); - msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE); - msg_set_destport(mhdr, 0); - msg_set_destnode(mhdr, 0); - msg_set_nametype(mhdr, seq->type); - msg_set_namelower(mhdr, seq->lower); - msg_set_nameupper(mhdr, seq->upper); - msg_set_hdr_sz(mhdr, MCAST_H_SIZE); + /* Lookup destination nodes */ + tipc_nlist_init(&dsts, tipc_own_addr(net)); + tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower, + seq->upper, domain, &dsts); + if (!dsts.local && !dsts.remote) + return -EHOSTUNREACH; - skb_queue_head_init(&pktchain); + /* Build message header */ + msg_set_type(hdr, TIPC_MCAST_MSG); + msg_set_hdr_sz(hdr, MCAST_H_SIZE); + msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE); + msg_set_destport(hdr, 0); + msg_set_destnode(hdr, 0); + msg_set_nametype(hdr, seq->type); + msg_set_namelower(hdr, seq->lower); + msg_set_nameupper(hdr, seq->upper); -new_mtu: - mtu = tipc_bcast_get_mtu(net); - rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain); - if (unlikely(rc < 0)) - return rc; + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts); - do { - rc = tipc_bcast_xmit(net, &pktchain); - if (likely(!rc)) - return dsz; - - if (rc == -ELINKCONG) { - tsk->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (!rc) - continue; - } - __skb_queue_purge(&pktchain); - if (rc == -EMSGSIZE) { - msg->msg_iter = save; - goto new_mtu; - } - break; - } while (1); - return rc; + /* Send message if build was successful */ + if (unlikely(rc == dlen)) + rc = tipc_mcast_xmit(net, &pkts, method, &dsts, + &tsk->cong_link_cnt); + + tipc_nlist_purge(&dsts); + + return rc ? 
rc : dlen; } /** @@ -744,7 +800,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { struct tipc_msg *msg; - struct tipc_plist dports; + struct list_head dports; u32 portid; u32 scope = TIPC_CLUSTER_SCOPE; struct sk_buff_head tmpq; @@ -752,7 +808,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff *skb, *_skb; __skb_queue_head_init(&tmpq); - tipc_plist_init(&dports); + INIT_LIST_HEAD(&dports); skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { @@ -766,8 +822,8 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_nametbl_mc_translate(net, msg_nametype(msg), msg_namelower(msg), msg_nameupper(msg), scope, &dports); - portid = tipc_plist_pop(&dports); - for (; portid; portid = tipc_plist_pop(&dports)) { + portid = u32_pop(&dports); + for (; portid; portid = u32_pop(&dports)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); @@ -828,31 +884,6 @@ exit: kfree_skb(skb); } -static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - int done; - - do { - int err = sock_error(sk); - if (err) - return err; - if (sk->sk_shutdown & SEND_SHUTDOWN) - return -EPIPE; - if (!*timeo_p) - return -EAGAIN; - if (signal_pending(current)) - return sock_intr_errno(*timeo_p); - - add_wait_queue(sk_sleep(sk), &wait); - done = sk_wait_event(sk, timeo_p, !tsk->link_cong, &wait); - remove_wait_queue(sk_sleep(sk), &wait); - } while (!done); - return 0; -} - /** * tipc_sendmsg - send message in connectionless manner * @sock: socket structure @@ -879,35 +910,38 @@ static int tipc_sendmsg(struct socket *sock, return ret; } -static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) +static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) { - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); - struct tipc_msg *mhdr = &tsk->phdr; - u32 dnode, dport; - struct sk_buff_head pktchain; - bool is_connectionless = tipc_sk_type_connectionless(sk); - struct sk_buff *skb; + struct tipc_sock *tsk = tipc_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct list_head *clinks = &tsk->cong_links; + bool syn = !tipc_sk_type_connectionless(sk); + struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; - struct iov_iter save; - u32 mtu; - long timeo; - int rc; + struct sk_buff_head pkts; + u32 type, inst, domain; + u32 dnode, dport; + int mtu, rc; - if (dsz > TIPC_MAX_USER_MSG_SIZE) + if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; + if (unlikely(!dest)) { - if (is_connectionless && tsk->peer.family == AF_TIPC) - dest = &tsk->peer; - else + dest = &tsk->peer; + if (!syn || dest->family != AF_TIPC) return -EDESTADDRREQ; - } else if (unlikely(m->msg_namelen < sizeof(*dest)) || - dest->family != AF_TIPC) { - return -EINVAL; } - if (!is_connectionless) { + + if (unlikely(m->msg_namelen < sizeof(*dest))) + return -EINVAL; + + if (unlikely(dest->family != AF_TIPC)) + return -EINVAL; + + if (unlikely(syn)) { if (sk->sk_state == TIPC_LISTEN) return -EPIPE; if (sk->sk_state != TIPC_OPEN) @@ -919,102 +953,62 @@ static int __tipc_sendmsg(struct socket *sock, struct 
msghdr *m, size_t dsz) tsk->conn_instance = dest->addr.name.name.instance; } } - seq = &dest->addr.nameseq; - timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - if (dest->addrtype == TIPC_ADDR_MCAST) { - return tipc_sendmcast(sock, seq, m, dsz, timeo); - } else if (dest->addrtype == TIPC_ADDR_NAME) { - u32 type = dest->addr.name.name.type; - u32 inst = dest->addr.name.name.instance; - u32 domain = dest->addr.name.domain; + seq = &dest->addr.nameseq; + if (dest->addrtype == TIPC_ADDR_MCAST) + return tipc_sendmcast(sock, seq, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_NAME) { + type = dest->addr.name.name.type; + inst = dest->addr.name.name.instance; + domain = dest->addr.name.domain; dnode = domain; - msg_set_type(mhdr, TIPC_NAMED_MSG); - msg_set_hdr_sz(mhdr, NAMED_H_SIZE); - msg_set_nametype(mhdr, type); - msg_set_nameinst(mhdr, inst); - msg_set_lookup_scope(mhdr, tipc_addr_scope(domain)); + msg_set_type(hdr, TIPC_NAMED_MSG); + msg_set_hdr_sz(hdr, NAMED_H_SIZE); + msg_set_nametype(hdr, type); + msg_set_nameinst(hdr, inst); + msg_set_lookup_scope(hdr, tipc_addr_scope(domain)); dport = tipc_nametbl_translate(net, type, inst, &dnode); - msg_set_destnode(mhdr, dnode); - msg_set_destport(mhdr, dport); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; + } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; - msg_set_type(mhdr, TIPC_DIRECT_MSG); - msg_set_lookup_scope(mhdr, 0); - msg_set_destnode(mhdr, dnode); - msg_set_destport(mhdr, dest->addr.id.ref); - msg_set_hdr_sz(mhdr, BASIC_H_SIZE); + msg_set_type(hdr, TIPC_DIRECT_MSG); + msg_set_lookup_scope(hdr, 0); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dest->addr.id.ref); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); } - skb_queue_head_init(&pktchain); - save = m->msg_iter; -new_mtu: - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); - rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain); - if (rc < 0) + /* Block or return if destination link is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode)); + if (unlikely(rc)) return rc; - do { - skb = skb_peek(&pktchain); - TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; - rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid); - if (likely(!rc)) { - if (!is_connectionless) - tipc_set_sk_state(sk, TIPC_CONNECTING); - return dsz; - } - if (rc == -ELINKCONG) { - tsk->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (!rc) - continue; - } - __skb_queue_purge(&pktchain); - if (rc == -EMSGSIZE) { - m->msg_iter = save; - goto new_mtu; - } - break; - } while (1); - - return rc; -} + skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; -static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - int done; + rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + if (unlikely(rc == -ELINKCONG)) { + u32_push(clinks, dnode); + tsk->cong_link_cnt++; + rc = 0; + } - do { - int err = sock_error(sk); - if (err) - return err; - if (sk->sk_state == TIPC_DISCONNECTING) - return -EPIPE; - else if (!tipc_sk_connected(sk)) - return -ENOTCONN; - if (!*timeo_p) - return -EAGAIN; - if (signal_pending(current)) - return sock_intr_errno(*timeo_p); + if (unlikely(syn && !rc)) + tipc_set_sk_state(sk, TIPC_CONNECTING); - 
add_wait_queue(sk_sleep(sk), &wait); - done = sk_wait_event(sk, timeo_p, - (!tsk->link_cong && - !tsk_conn_cong(tsk)) || - !tipc_sk_connected(sk), &wait); - remove_wait_queue(sk_sleep(sk), &wait); - } while (!done); - return 0; + return rc ? rc : dlen; } /** - * tipc_send_stream - send stream-oriented data + * tipc_sendstream - send stream-oriented data * @sock: socket structure * @m: data to send * @dsz: total length of data to be transmitted @@ -1024,94 +1018,69 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) * Returns the number of bytes sent on success (or partial success), * or errno if no data sent */ -static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) +static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz) { struct sock *sk = sock->sk; int ret; lock_sock(sk); - ret = __tipc_send_stream(sock, m, dsz); + ret = __tipc_sendstream(sock, m, dsz); release_sock(sk); return ret; } -static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) +static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) { struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head pktchain; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - u32 portid = tsk->portid; - int rc = -EINVAL; - long timeo; - u32 dnode; - uint mtu, send, sent = 0; - struct iov_iter save; - int hlen = MIN_H_SIZE; - - /* Handle implied connection establishment */ - if (unlikely(dest)) { - rc = __tipc_sendmsg(sock, m, dsz); - hlen = msg_hdr_sz(mhdr); - if (dsz && (dsz == rc)) - tsk->snt_unacked = tsk_inc(tsk, dsz + hlen); - return rc; - } - if (dsz > (uint)INT_MAX) - return -EMSGSIZE; - - if (unlikely(!tipc_sk_connected(sk))) { - if (sk->sk_state == TIPC_DISCONNECTING) - return -EPIPE; - else - return -ENOTCONN; - } + long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = &tsk->phdr; + struct net *net = sock_net(sk); + struct sk_buff_head pkts; + u32 dnode = tsk_peer_node(tsk); + int send, sent = 0; + int rc = 0; - timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - if (!timeo && tsk->link_cong) - return -ELINKCONG; + skb_queue_head_init(&pkts); - dnode = tsk_peer_node(tsk); - skb_queue_head_init(&pktchain); + if (unlikely(dlen > INT_MAX)) + return -EMSGSIZE; -next: - save = m->msg_iter; - mtu = tsk->max_pkt; - send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain); - if (unlikely(rc < 0)) + /* Handle implicit connection setup */ + if (unlikely(dest)) { + rc = __tipc_sendmsg(sock, m, dlen); + if (dlen && (dlen == rc)) + tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr)); return rc; + } do { - if (likely(!tsk_conn_cong(tsk))) { - rc = tipc_node_xmit(net, &pktchain, dnode, portid); - if (likely(!rc)) { - tsk->snt_unacked += tsk_inc(tsk, send + hlen); - sent += send; - if (sent == dsz) - return dsz; - goto next; - } - if (rc == -EMSGSIZE) { - __skb_queue_purge(&pktchain); - tsk->max_pkt = tipc_node_get_mtu(net, dnode, - portid); - m->msg_iter = save; - goto next; - } - if (rc != -ELINKCONG) - break; + rc = tipc_wait_for_cond(sock, &timeout, + (!tsk->cong_link_cnt && + !tsk_conn_cong(tsk) && + tipc_sk_connected(sk))); + if (unlikely(rc)) + break; + + send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); + rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts); + if 
(unlikely(rc != send)) + break; - tsk->link_cong = 1; + rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + if (unlikely(rc == -ELINKCONG)) { + tsk->cong_link_cnt = 1; + rc = 0; + } + if (likely(!rc)) { + tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE); + sent += send; } - rc = tipc_wait_for_sndpkt(sock, &timeo); - } while (!rc); + } while (sent < dlen && !rc); - __skb_queue_purge(&pktchain); - return sent ? sent : rc; + return rc ? rc : sent; } /** @@ -1129,7 +1098,7 @@ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz) if (dsz > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; - return tipc_send_stream(sock, m, dsz); + return tipc_sendstream(sock, m, dsz); } /* tipc_sk_finish_conn - complete the setup of a connection @@ -1696,6 +1665,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb, unsigned int limit = rcvbuf_limit(sk, skb); int err = TIPC_OK; int usr = msg_user(hdr); + u32 onode; if (unlikely(msg_user(hdr) == CONN_MANAGER)) { tipc_sk_proto_rcv(tsk, skb, xmitq); @@ -1703,8 +1673,10 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb, } if (unlikely(usr == SOCK_WAKEUP)) { + onode = msg_orignode(hdr); kfree_skb(skb); - tsk->link_cong = 0; + u32_del(&tsk->cong_links, onode); + tsk->cong_link_cnt--; sk->sk_write_space(sk); return false; } @@ -2112,7 +2084,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) struct msghdr m = {NULL,}; tsk_advance_rx_queue(sk); - __tipc_send_stream(new_sock, &m, 0); + __tipc_sendstream(new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); __skb_queue_head(&new_sk->sk_receive_queue, buf); @@ -2267,24 +2239,27 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, void tipc_sk_reinit(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - const struct bucket_table *tbl; - struct rhash_head *pos; + struct rhashtable_iter iter; struct tipc_sock *tsk; struct tipc_msg *msg; - int i; - rcu_read_lock(); - tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(tsk, pos, tbl, i, node) { + rhashtable_walk_enter(&tn->sk_rht, &iter); + + do { + tsk = ERR_PTR(rhashtable_walk_start(&iter)); + if (tsk) + continue; + + while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); msg = &tsk->phdr; msg_set_prevnode(msg, tn->own_addr); msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } - } - rcu_read_unlock(); + + rhashtable_walk_stop(&iter); + } while (tsk == ERR_PTR(-EAGAIN)); } static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) @@ -2380,18 +2355,29 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - u32 value; - int res; + u32 value = 0; + int res = 0; if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) return 0; if (lvl != SOL_TIPC) return -ENOPROTOOPT; - if (ol < sizeof(value)) - return -EINVAL; - res = get_user(value, (u32 __user *)ov); - if (res) - return res; + + switch (opt) { + case TIPC_IMPORTANCE: + case TIPC_SRC_DROPPABLE: + case TIPC_DEST_DROPPABLE: + case TIPC_CONN_TIMEOUT: + if (ol < sizeof(value)) + return -EINVAL; + res = get_user(value, (u32 __user *)ov); + if (res) + return res; + break; + default: + if (ov || ol) + return -EINVAL; + } lock_sock(sk); @@ -2410,7 +2396,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, break; case TIPC_CONN_TIMEOUT: tipc_sk(sk)->conn_timeout = 
value; - /* no need to set "res", since already 0 at this point */ + break; + case TIPC_MCAST_BROADCAST: + tsk->mc_method.rcast = false; + tsk->mc_method.mandatory = true; + break; + case TIPC_MCAST_REPLICAST: + tsk->mc_method.rcast = true; + tsk->mc_method.mandatory = true; break; default: res = -EINVAL; @@ -2573,7 +2566,7 @@ static const struct proto_ops stream_ops = { .shutdown = tipc_shutdown, .setsockopt = tipc_setsockopt, .getsockopt = tipc_getsockopt, - .sendmsg = tipc_send_stream, + .sendmsg = tipc_sendstream, .recvmsg = tipc_recv_stream, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 0dd02244e21d..9d94e65d0894 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -54,6 +54,8 @@ struct tipc_subscriber { static void tipc_subscrp_delete(struct tipc_subscription *sub); static void tipc_subscrb_put(struct tipc_subscriber *subscriber); +static void tipc_subscrp_put(struct tipc_subscription *subscription); +static void tipc_subscrp_get(struct tipc_subscription *subscription); /** * htohl - convert value to endianness used by destination @@ -123,6 +125,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, { struct tipc_name_seq seq; + tipc_subscrp_get(sub); tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; @@ -132,30 +135,23 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, node); + tipc_subscrp_put(sub); } static void tipc_subscrp_timeout(unsigned long data) { struct tipc_subscription *sub = (struct tipc_subscription *)data; - struct tipc_subscriber *subscriber = sub->subscriber; /* Notify subscriber of timeout */ tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, TIPC_SUBSCR_TIMEOUT, 0, 0); - spin_lock_bh(&subscriber->lock); - tipc_subscrp_delete(sub); - spin_unlock_bh(&subscriber->lock); - - tipc_subscrb_put(subscriber); + tipc_subscrp_put(sub); } static void tipc_subscrb_kref_release(struct kref *kref) { - struct tipc_subscriber *subcriber = container_of(kref, - struct tipc_subscriber, kref); - - kfree(subcriber); + kfree(container_of(kref,struct tipc_subscriber, kref)); } static void tipc_subscrb_put(struct tipc_subscriber *subscriber) @@ -168,6 +164,59 @@ static void tipc_subscrb_get(struct tipc_subscriber *subscriber) kref_get(&subscriber->kref); } +static void tipc_subscrp_kref_release(struct kref *kref) +{ + struct tipc_subscription *sub = container_of(kref, + struct tipc_subscription, + kref); + struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + struct tipc_subscriber *subscriber = sub->subscriber; + + spin_lock_bh(&subscriber->lock); + tipc_nametbl_unsubscribe(sub); + list_del(&sub->subscrp_list); + atomic_dec(&tn->subscription_count); + spin_unlock_bh(&subscriber->lock); + kfree(sub); + tipc_subscrb_put(subscriber); +} + +static void tipc_subscrp_put(struct tipc_subscription *subscription) +{ + kref_put(&subscription->kref, tipc_subscrp_kref_release); +} + +static void tipc_subscrp_get(struct tipc_subscription *subscription) +{ + kref_get(&subscription->kref); +} + +/* tipc_subscrb_subscrp_delete - delete a specific subscription or all + * subscriptions for a given subscriber. 
+ */ +static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, + struct tipc_subscr *s) +{ + struct list_head *subscription_list = &subscriber->subscrp_list; + struct tipc_subscription *sub, *temp; + + spin_lock_bh(&subscriber->lock); + list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) { + if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) + continue; + + tipc_subscrp_get(sub); + spin_unlock_bh(&subscriber->lock); + tipc_subscrp_delete(sub); + tipc_subscrp_put(sub); + spin_lock_bh(&subscriber->lock); + + if (s) + break; + } + spin_unlock_bh(&subscriber->lock); +} + static struct tipc_subscriber *tipc_subscrb_create(int conid) { struct tipc_subscriber *subscriber; @@ -177,8 +226,8 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) pr_warn("Subscriber rejected, no memory\n"); return NULL; } - kref_init(&subscriber->kref); INIT_LIST_HEAD(&subscriber->subscrp_list); + kref_init(&subscriber->kref); subscriber->conid = conid; spin_lock_init(&subscriber->lock); @@ -187,55 +236,22 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { - struct tipc_subscription *sub, *temp; - u32 timeout; - - spin_lock_bh(&subscriber->lock); - /* Destroy any existing subscriptions for subscriber */ - list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, - subscrp_list) { - timeout = htohl(sub->evt.s.timeout, sub->swap); - if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) { - tipc_subscrp_delete(sub); - tipc_subscrb_put(subscriber); - } - } - spin_unlock_bh(&subscriber->lock); - + tipc_subscrb_subscrp_delete(subscriber, NULL); tipc_subscrb_put(subscriber); } static void tipc_subscrp_delete(struct tipc_subscription *sub) { - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + u32 timeout = htohl(sub->evt.s.timeout, sub->swap); - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); - kfree(sub); - atomic_dec(&tn->subscription_count); + if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) + tipc_subscrp_put(sub); } static void tipc_subscrp_cancel(struct tipc_subscr *s, struct tipc_subscriber *subscriber) { - struct tipc_subscription *sub, *temp; - u32 timeout; - - spin_lock_bh(&subscriber->lock); - /* Find first matching subscription, exit if not found */ - list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, - subscrp_list) { - if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { - timeout = htohl(sub->evt.s.timeout, sub->swap); - if ((timeout == TIPC_WAIT_FOREVER) || - del_timer(&sub->timer)) { - tipc_subscrp_delete(sub); - tipc_subscrb_put(subscriber); - } - break; - } - } - spin_unlock_bh(&subscriber->lock); + tipc_subscrb_subscrp_delete(subscriber, s); } static struct tipc_subscription *tipc_subscrp_create(struct net *net, @@ -272,6 +288,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net, sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); + kref_init(&sub->kref); return sub; } @@ -288,17 +305,16 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, spin_lock_bh(&subscriber->lock); list_add(&sub->subscrp_list, &subscriber->subscrp_list); - tipc_subscrb_get(subscriber); sub->subscriber = subscriber; tipc_nametbl_subscribe(sub); + tipc_subscrb_get(subscriber); spin_unlock_bh(&subscriber->lock); + setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); timeout = htohl(sub->evt.s.timeout, swap); - if (timeout == 
TIPC_WAIT_FOREVER) - return; - setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); - mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); + if (timeout != TIPC_WAIT_FOREVER) + mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); } /* Handle one termination request for the subscriber */ diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index be60103082c9..ffdc214c117a 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -57,6 +57,7 @@ struct tipc_subscriber; * @evt: template for events generated by subscription */ struct tipc_subscription { + struct kref kref; struct tipc_subscriber *subscriber; struct net *net; struct timer_list timer; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index b58dc95f3d35..46061cf48cd1 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -113,7 +113,7 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, memcpy(addr->value, ua, sizeof(struct udp_media_addr)); if (tipc_udp_is_mcast_addr(ua)) - addr->broadcast = 1; + addr->broadcast = TIPC_BROADCAST_SUPPORT; } /* tipc_udp_addr2str - convert ip/udp address to string */ @@ -229,7 +229,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, goto out; } - if (!addr->broadcast || list_empty(&ub->rcast.list)) + if (addr->broadcast != TIPC_REPLICAST_SUPPORT) return tipc_udp_xmit(net, skb, ub, src, dst); /* Replicast, send an skb to each configured IP address */ @@ -296,7 +296,7 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b, else if (ntohs(addr->proto) == ETH_P_IPV6) pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6); #endif - + b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT; list_add_rcu(&rcast->list, &ub->rcast.list); return 0; } @@ -681,7 +681,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, goto err; b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; - b->bcast_addr.broadcast = 1; + b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; rcu_assign_pointer(b->media_ptr, ub); rcu_assign_pointer(ub->bearer, b); tipc_udp_media_addr_set(&b->addr, &local); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 127656ebe7be..e2d18b9f910f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,6 +117,7 @@ #include <net/checksum.h> #include <linux/security.h> #include <linux/freezer.h> +#include <linux/file.h> struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); @@ -995,6 +996,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) unsigned int hash; struct unix_address *addr; struct hlist_head *list; + struct path path = { NULL, NULL }; err = -EINVAL; if (sunaddr->sun_family != AF_UNIX) @@ -1010,9 +1012,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; + if (sun_path[0]) { + umode_t mode = S_IFSOCK | + (SOCK_INODE(sock)->i_mode & ~current_umask()); + err = unix_mknod(sun_path, mode, &path); + if (err) { + if (err == -EEXIST) + err = -EADDRINUSE; + goto out; + } + } + err = mutex_lock_interruptible(&u->bindlock); if (err) - goto out; + goto out_put; err = -EINVAL; if (u->addr) @@ -1029,16 +1042,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) atomic_set(&addr->refcnt, 1); if (sun_path[0]) { - struct path path; - umode_t mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); - if (err) { - if (err == -EEXIST) - err = -EADDRINUSE; - unix_release_addr(addr); - goto out_up; - } 
addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); @@ -1065,6 +1068,9 @@ out_unlock: spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->bindlock); +out_put: + if (err) + path_put(&path); out: return err; } @@ -2587,6 +2593,43 @@ long unix_outq_len(struct sock *sk) } EXPORT_SYMBOL_GPL(unix_outq_len); +static int unix_open_file(struct sock *sk) +{ + struct path path; + struct file *f; + int fd; + + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + unix_state_lock(sk); + path = unix_sk(sk)->path; + if (!path.dentry) { + unix_state_unlock(sk); + return -ENOENT; + } + + path_get(&path); + unix_state_unlock(sk); + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + goto out; + + f = dentry_open(&path, O_PATH, current_cred()); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + goto out; + } + + fd_install(fd, f); +out: + path_put(&path); + + return fd; +} + static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -2605,6 +2648,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) else err = put_user(amount, (int __user *)arg); break; + case SIOCUNIXFILE: + err = unix_open_file(sk); + break; default: err = -ENOIOCTLCMD; break; diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 816c9331c8d2..d06e5015751a 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o +cfg80211-$(CONFIG_OF) += of.o cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o diff --git a/net/wireless/core.c b/net/wireless/core.c index 158c59ecf90a..e55e05bc4805 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -626,7 +626,8 @@ int wiphy_register(struct wiphy *wiphy) if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && (!rdev->ops->start_nan || !rdev->ops->stop_nan || - !rdev->ops->add_nan_func || !rdev->ops->del_nan_func))) + !rdev->ops->add_nan_func || !rdev->ops->del_nan_func || + !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ))))) return -EINVAL; #ifndef CONFIG_WIRELESS_WDS @@ -1142,6 +1143,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) dev->priv_flags |= IFF_DONT_BRIDGE; + INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); break; case NETDEV_GOING_DOWN: @@ -1230,6 +1233,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, #ifdef CONFIG_CFG80211_WEXT kzfree(wdev->wext.keys); #endif + flush_work(&wdev->disconnect_wk); } /* * synchronise (so that we won't find this netdev diff --git a/net/wireless/core.h b/net/wireless/core.h index af6e023020b1..58ca206982fe 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -228,6 +228,7 @@ struct cfg80211_event { size_t resp_ie_len; struct cfg80211_bss *bss; int status; /* -1 = failed; 0..65535 = status code */ + enum nl80211_timeout_reason timeout_reason; } cr; struct { const u8 *req_ie; @@ -388,7 +389,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t 
resp_ie_len, int status, bool wextev, - struct cfg80211_bss *bss); + struct cfg80211_bss *bss, + enum nl80211_timeout_reason timeout_reason); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); int cfg80211_disconnect(struct cfg80211_registered_device *rdev, @@ -400,6 +402,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, const u8 *resp_ie, size_t resp_ie_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +void cfg80211_autodisconnect_wk(struct work_struct *work); /* SME implementation */ void cfg80211_conn_work(struct work_struct *work); @@ -430,6 +433,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); void cfg80211_process_wdev_events(struct wireless_dev *wdev); +bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, + u32 center_freq_khz, u32 bw_khz); + /** * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable * @wiphy: the wiphy to validate against diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 5d453916a417..30fc6eb352bc 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -17,7 +17,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ - struct wiphy *wiphy= file->private_data; \ + struct wiphy *wiphy = file->private_data; \ char buf[buflen]; \ int res; \ \ @@ -29,14 +29,14 @@ static const struct file_operations name## _ops = { \ .read = name## _read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ -}; +} DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", - wiphy->rts_threshold) + wiphy->rts_threshold); DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", wiphy->frag_threshold); DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", - wiphy->retry_short) + wiphy->retry_short); DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", wiphy->retry_long); @@ -103,7 +103,7 @@ static const struct file_operations ht40allow_map_ops = { }; #define DEBUGFS_ADD(name) \ - debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops); + debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops) void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) { diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 4646cf5695b9..22b3d9990065 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -48,7 +48,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, status_code, - status_code == WLAN_STATUS_SUCCESS, bss); + status_code == WLAN_STATUS_SUCCESS, bss, + NL80211_TIMEOUT_UNSPECIFIED); } EXPORT_SYMBOL(cfg80211_rx_assoc_resp); @@ -345,6 +346,11 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) return 0; + if (ether_addr_equal(wdev->disconnect_bssid, bssid) || + (wdev->current_bss && + ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + wdev->conn_owner_nlportid = 0; + return rdev_deauth(rdev, dev, &req); } @@ -657,8 +663,25 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return err; } - if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) - return -EINVAL; + if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) { + /* Allow random TA to be used with Public Action frames if 
the + * driver has indicated support for this. Otherwise, only allow + * the local address to be used. + */ + if (!ieee80211_is_action(mgmt->frame_control) || + mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) + return -EINVAL; + if (!wdev->current_bss && + !wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) + return -EINVAL; + if (wdev->current_bss && + !wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) + return -EINVAL; + } /* Transmit the Action frame as requested by user space */ return rdev_mgmt_tx(rdev, wdev, params, cookie); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3df85a751a85..d7f8be4e321a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3,7 +3,7 @@ * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015-2016 Intel Deutschland GmbH + * Copyright 2015-2017 Intel Deutschland GmbH */ #include <linux/if.h> @@ -398,13 +398,18 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { }, [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, - [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 }, + [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN }, [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, [NL80211_ATTR_BSSID] = { .len = ETH_ALEN }, + [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 }, + [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = { + .len = sizeof(struct nl80211_bss_select_rssi_adjust) + }, + [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 }, }; /* policy for the key attributes */ @@ -1881,6 +1886,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, } } + if (nla_put_u32(msg, NL80211_ATTR_BANDS, + rdev->wiphy.nan_supported_bands)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -3738,6 +3747,49 @@ static int nl80211_parse_beacon(struct nlattr *attrs[], return 0; } +static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, + const u8 *rates) +{ + int i; + + if (!rates) + return; + + for (i = 0; i < rates[1]; i++) { + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY) + params->ht_required = true; + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) + params->vht_required = true; + } +} + +/* + * Since the nl80211 API didn't include, from the beginning, attributes about + * HT/VHT requirements/capabilities, we parse them out of the IEs for the + * benefit of drivers that rebuild IEs in the firmware. 
+ */ +static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) +{ + const struct cfg80211_beacon_data *bcn = &params->beacon; + size_t ies_len = bcn->beacon_ies_len; + const u8 *ies = bcn->beacon_ies; + const u8 *rates; + const u8 *cap; + + rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len); + nl80211_check_ap_rate_selectors(params, rates); + + rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len); + nl80211_check_ap_rate_selectors(params, rates); + + cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len); + if (cap && cap[1] >= sizeof(*params->ht_cap)) + params->ht_cap = (void *)(cap + 2); + cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len); + if (cap && cap[1] >= sizeof(*params->vht_cap)) + params->vht_cap = (void *)(cap + 2); +} + static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, struct cfg80211_ap_settings *params) { @@ -3966,6 +4018,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + nl80211_calculate_ap_params(&params); + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, &params); if (!err) { @@ -4615,6 +4669,15 @@ int cfg80211_check_station_change(struct wiphy *wiphy, break; } + /* + * Older kernel versions ignored this attribute entirely, so don't + * reject attempts to update it but mark it as unused instead so the + * driver won't look at the data. + */ + if (statype != CFG80211_STA_AP_CLIENT_UNASSOC && + statype != CFG80211_STA_TDLS_PEER_SETUP) + params->opmode_notif_used = false; + return 0; } EXPORT_SYMBOL(cfg80211_check_station_change); @@ -4854,6 +4917,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) params.local_pm = pm; } + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { + params.opmode_notif_used = true; + params.opmode_notif = + nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); + } + /* Include parameters for TDLS peer (will check later) */ err = nl80211_set_station_tdls(info, &params); if (err) @@ -5901,6 +5970,7 @@ do { \ break; } cfg->ht_opmode = ht_opmode; + mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1)); } FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout, 1, 65535, mask, @@ -6775,13 +6845,10 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, /* * If scan plans are not specified, - * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this + * %NL80211_ATTR_SCHED_SCAN_INTERVAL will be specified. In this * case one scan plan will be set with the specified scan * interval and infinite number of iterations. 
*/ - if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]) - return -EINVAL; - interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]); if (!interval) return -EINVAL; @@ -6850,7 +6917,7 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, static struct cfg80211_sched_scan_request * nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, - struct nlattr **attrs) + struct nlattr **attrs, int max_match_sets) { struct cfg80211_sched_scan_request *request; struct nlattr *attr; @@ -6915,7 +6982,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF) n_match_sets = 1; - if (n_match_sets > wiphy->max_match_sets) + if (n_match_sets > max_match_sets) return ERR_PTR(-EINVAL); if (attrs[NL80211_ATTR_IE]) @@ -6953,6 +7020,12 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (!n_plans || n_plans > wiphy->max_sched_scan_plans) return ERR_PTR(-EINVAL); + if (!wiphy_ext_feature_isset( + wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) && + (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] || + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST])) + return ERR_PTR(-EINVAL); + request = kzalloc(sizeof(*request) + sizeof(*request->ssids) * n_ssids + sizeof(*request->match_sets) * n_match_sets @@ -7159,6 +7232,26 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request->delay = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]); + if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) { + request->relative_rssi = nla_get_s8( + attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]); + request->relative_rssi_set = true; + } + + if (request->relative_rssi_set && + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) { + struct nl80211_bss_select_rssi_adjust *rssi_adjust; + + rssi_adjust = nla_data( + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]); + request->rssi_adjust.band = rssi_adjust->band; + request->rssi_adjust.delta = rssi_adjust->delta; + if (!is_band_valid(wiphy, request->rssi_adjust.band)) { + err = -EINVAL; + goto out_free; + } + } + err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs); if (err) goto out_free; @@ -7189,7 +7282,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, return -EINPROGRESS; sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev, - info->attrs); + info->attrs, + rdev->wiphy.max_match_sets); err = PTR_ERR_OR_ZERO(sched_scan_req); if (err) @@ -8053,8 +8147,17 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); + err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, ssid, ssid_len, &req); + + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + dev->ieee80211_ptr->conn_owner_nlportid = + info->snd_portid; + memcpy(dev->ieee80211_ptr->disconnect_bssid, + bssid, ETH_ALEN); + } + wdev_unlock(dev->ieee80211_ptr); } @@ -8533,6 +8636,12 @@ static int nl80211_testmode_dump(struct sk_buff *skb, * so we need to offset by 1. 
*/ phy_idx = cb->args[0] - 1; + + rdev = cfg80211_rdev_by_wiphy_idx(phy_idx); + if (!rdev) { + err = -ENOENT; + goto out_err; + } } else { struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam); @@ -8547,7 +8656,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb, goto out_err; } phy_idx = rdev->wiphy_idx; - rdev = NULL; if (attrbuf[NL80211_ATTR_TESTDATA]) cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA]; @@ -8558,12 +8666,6 @@ static int nl80211_testmode_dump(struct sk_buff *skb, data_len = nla_len((void *)cb->args[1]); } - rdev = cfg80211_rdev_by_wiphy_idx(phy_idx); - if (!rdev) { - err = -ENOENT; - goto out_err; - } - if (!rdev->ops->testmode_dump) { err = -EOPNOTSUPP; goto out_err; @@ -8773,11 +8875,24 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } wdev_lock(dev->ieee80211_ptr); + err = cfg80211_connect(rdev, dev, &connect, connkeys, connect.prev_bssid); - wdev_unlock(dev->ieee80211_ptr); if (err) kzfree(connkeys); + + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + if (connect.bssid) + memcpy(dev->ieee80211_ptr->disconnect_bssid, + connect.bssid, ETH_ALEN); + else + memset(dev->ieee80211_ptr->disconnect_bssid, + 0, ETH_ALEN); + } + + wdev_unlock(dev->ieee80211_ptr); + return err; } @@ -9364,6 +9479,7 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = { [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 }, [NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 }, [NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 }, + [NL80211_ATTR_CQM_RSSI_LEVEL] = { .type = NLA_S32 }, }; static int nl80211_set_cqm_txe(struct genl_info *info, @@ -9673,6 +9789,20 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay)) return -ENOBUFS; + if (req->relative_rssi_set) { + struct nl80211_bss_select_rssi_adjust rssi_adjust; + + if (nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI, + req->relative_rssi)) + return -ENOBUFS; + + rssi_adjust.band = req->rssi_adjust.band; + rssi_adjust.delta = req->rssi_adjust.delta; + if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST, + sizeof(rssi_adjust), &rssi_adjust)) + return -ENOBUFS; + } + freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); if (!freqs) return -ENOBUFS; @@ -9966,7 +10096,8 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev, if (err) goto out; - trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb); + trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb, + wowlan->max_nd_match_sets); err = PTR_ERR_OR_ZERO(trig->nd_config); if (err) trig->nd_config = NULL; @@ -10651,15 +10782,22 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - if (!info->attrs[NL80211_ATTR_NAN_DUAL]) - return -EINVAL; - conf.master_pref = nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); if (!conf.master_pref) return -EINVAL; - conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wdev->wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf.bands = bands; + } err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -11024,9 +11162,17 @@ static int nl80211_nan_change_config(struct sk_buff *skb, changed |= CFG80211_NAN_CONF_CHANGED_PREF; } - if 
(info->attrs[NL80211_ATTR_NAN_DUAL]) { - conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); - changed |= CFG80211_NAN_CONF_CHANGED_DUAL; + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wdev->wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf.bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; } if (!changed) @@ -11807,9 +11953,6 @@ static int nl80211_set_multicast_to_unicast(struct sk_buff *skb, const struct nlattr *nla; bool enabled; - if (netif_running(dev)) - return -EBUSY; - if (!rdev->ops->set_multicast_to_unicast) return -EOPNOTSUPP; @@ -12810,7 +12953,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg, return -ENOBUFS; } -static int nl80211_send_scan_msg(struct sk_buff *msg, +static int nl80211_prep_scan_msg(struct sk_buff *msg, struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 portid, u32 seq, int flags, @@ -12841,7 +12984,7 @@ static int nl80211_send_scan_msg(struct sk_buff *msg, } static int -nl80211_send_sched_scan_msg(struct sk_buff *msg, +nl80211_prep_sched_scan_msg(struct sk_buff *msg, struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 portid, u32 seq, int flags, u32 cmd) @@ -12873,7 +13016,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, if (!msg) return; - if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, + if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0, NL80211_CMD_TRIGGER_SCAN) < 0) { nlmsg_free(msg); return; @@ -12892,7 +13035,7 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, if (!msg) return NULL; - if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, + if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0, aborted ? 
NL80211_CMD_SCAN_ABORTED : NL80211_CMD_NEW_SCAN_RESULTS) < 0) { nlmsg_free(msg); @@ -12902,8 +13045,9 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, return msg; } -void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, - struct sk_buff *msg) +/* send message created by nl80211_build_scan_msg() */ +void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) { if (!msg) return; @@ -12912,25 +13056,6 @@ void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, NL80211_MCGRP_SCAN, GFP_KERNEL); } -void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, - struct net_device *netdev) -{ - struct sk_buff *msg; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, - NL80211_CMD_SCHED_SCAN_RESULTS) < 0) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, - NL80211_MCGRP_SCAN, GFP_KERNEL); -} - void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 cmd) { @@ -12940,7 +13065,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, if (!msg) return; - if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) { + if (nl80211_prep_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) { nlmsg_free(msg); return; } @@ -13042,7 +13167,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + len, gfp); if (!msg) return; @@ -13189,12 +13314,14 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - int status, gfp_t gfp) + int status, + enum nl80211_timeout_reason timeout_reason, + gfp_t gfp) { struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp); if (!msg) return; @@ -13210,7 +13337,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, status < 0 ? 
WLAN_STATUS_UNSPECIFIED_FAILURE : status) || - (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) || + (status < 0 && + (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) || + nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) || (req_ie && nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || (resp_ie && @@ -13236,7 +13365,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp); if (!msg) return; @@ -13273,7 +13402,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + msg = nlmsg_new(100 + ie_len, GFP_KERNEL); if (!msg) return; @@ -13349,7 +13478,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, trace_cfg80211_notify_new_peer_candidate(dev, addr); - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + ie_len, gfp); if (!msg) return; @@ -13720,7 +13849,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + len, gfp); if (!msg) return -ENOMEM; @@ -13764,7 +13893,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + len, gfp); if (!msg) return; @@ -13851,11 +13980,11 @@ static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp) void cfg80211_cqm_rssi_notify(struct net_device *dev, enum nl80211_cqm_rssi_threshold_event rssi_event, - gfp_t gfp) + s32 rssi_level, gfp_t gfp) { struct sk_buff *msg; - trace_cfg80211_cqm_rssi_notify(dev, rssi_event); + trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level); if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW && rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH)) @@ -13869,6 +13998,10 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, rssi_event)) goto nla_put_failure; + if (rssi_level && nla_put_s32(msg, NL80211_ATTR_CQM_RSSI_LEVEL, + rssi_level)) + goto nla_put_failure; + cfg80211_send_cqm(msg, gfp); return; @@ -14502,19 +14635,25 @@ static int nl80211_netlink_notify(struct notifier_block * nb, list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { bool schedule_destroy_work = false; - bool schedule_scan_stop = false; struct cfg80211_sched_scan_request *sched_scan_req = rcu_dereference(rdev->sched_scan_req); if (sched_scan_req && notify->portid && - sched_scan_req->owner_nlportid == notify->portid) - schedule_scan_stop = true; + sched_scan_req->owner_nlportid == notify->portid) { + sched_scan_req->owner_nlportid = 0; + + if (rdev->ops->sched_scan_stop && + rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + schedule_work(&rdev->sched_scan_stop_wk); + } list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) { cfg80211_mlme_unregister_socket(wdev, notify->portid); if (wdev->owner_nlportid == notify->portid) schedule_destroy_work = true; + else if (wdev->conn_owner_nlportid == notify->portid) + schedule_work(&wdev->disconnect_wk); } spin_lock_bh(&rdev->beacon_registrations_lock); @@ -14539,12 +14678,6 @@ static int nl80211_netlink_notify(struct notifier_block * nb, spin_unlock(&rdev->destroy_list_lock); schedule_work(&rdev->destroy_work); } - } else if (schedule_scan_stop) { - sched_scan_req->owner_nlportid = 0; - - if (rdev->ops->sched_scan_stop && - 
rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) - schedule_work(&rdev->sched_scan_stop_wk); } } @@ -14575,7 +14708,7 @@ void cfg80211_ft_event(struct net_device *netdev, if (!ft_event->target_ap) return; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + msg = nlmsg_new(100 + ft_event->ric_ies_len, GFP_KERNEL); if (!msg) return; diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 7e3821d7fcc5..e488dca87423 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -14,12 +14,10 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, bool aborted); -void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, - struct sk_buff *msg); +void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev, + struct sk_buff *msg); void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 cmd); -void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, - struct net_device *netdev); void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, struct regulatory_request *request); @@ -58,7 +56,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - int status, gfp_t gfp); + int status, + enum nl80211_timeout_reason timeout_reason, + gfp_t gfp); void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, diff --git a/net/wireless/of.c b/net/wireless/of.c new file mode 100644 index 000000000000..de221f0edca5 --- /dev/null +++ b/net/wireless/of.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2017 Rafał Miłecki <rafal@milecki.pl> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */ + +#include <linux/of.h> +#include <net/cfg80211.h> +#include "core.h" + +static bool wiphy_freq_limits_valid_chan(struct wiphy *wiphy, + struct ieee80211_freq_range *freq_limits, + unsigned int n_freq_limits, + struct ieee80211_channel *chan) +{ + u32 bw = MHZ_TO_KHZ(20); + int i; + + for (i = 0; i < n_freq_limits; i++) { + struct ieee80211_freq_range *limit = &freq_limits[i]; + + if (cfg80211_does_bw_fit_range(limit, + MHZ_TO_KHZ(chan->center_freq), + bw)) + return true; + } + + return false; +} + +static void wiphy_freq_limits_apply(struct wiphy *wiphy, + struct ieee80211_freq_range *freq_limits, + unsigned int n_freq_limits) +{ + enum nl80211_band band; + int i; + + if (WARN_ON(!n_freq_limits)) + return; + + for (band = 0; band < NUM_NL80211_BANDS; band++) { + struct ieee80211_supported_band *sband = wiphy->bands[band]; + + if (!sband) + continue; + + for (i = 0; i < sband->n_channels; i++) { + struct ieee80211_channel *chan = &sband->channels[i]; + + if (chan->flags & IEEE80211_CHAN_DISABLED) + continue; + + if (!wiphy_freq_limits_valid_chan(wiphy, freq_limits, + n_freq_limits, + chan)) { + pr_debug("Disabling freq %d MHz as it's out of OF limits\n", + chan->center_freq); + chan->flags |= IEEE80211_CHAN_DISABLED; + } + } + } +} + +void wiphy_read_of_freq_limits(struct wiphy *wiphy) +{ + struct device *dev = wiphy_dev(wiphy); + struct device_node *np; + struct property *prop; + struct ieee80211_freq_range *freq_limits; + unsigned int n_freq_limits; + const __be32 *p; + int len, i; + int err = 0; + + if (!dev) + return; + np = dev_of_node(dev); + if (!np) + return; + + prop = of_find_property(np, "ieee80211-freq-limit", &len); + if (!prop) + return; + + if (!len || len % sizeof(u32) || len / sizeof(u32) % 2) { + dev_err(dev, "ieee80211-freq-limit wrong format"); + return; + } + n_freq_limits = len / sizeof(u32) / 2; + + freq_limits = kcalloc(n_freq_limits, sizeof(*freq_limits), GFP_KERNEL); + if (!freq_limits) { + err = -ENOMEM; + goto out_kfree; + } + + p = NULL; + for (i = 0; i < n_freq_limits; i++) { + struct ieee80211_freq_range *limit = &freq_limits[i]; + + p = of_prop_next_u32(prop, p, &limit->start_freq_khz); + if (!p) { + err = -EINVAL; + goto out_kfree; + } + + p = of_prop_next_u32(prop, p, &limit->end_freq_khz); + if (!p) { + err = -EINVAL; + goto out_kfree; + } + + if (!limit->start_freq_khz || + !limit->end_freq_khz || + limit->start_freq_khz >= limit->end_freq_khz) { + err = -EINVAL; + goto out_kfree; + } + } + + wiphy_freq_limits_apply(wiphy, freq_limits, n_freq_limits); + +out_kfree: + kfree(freq_limits); + if (err) + dev_err(dev, "Failed to get limits: %d\n", err); +} +EXPORT_SYMBOL(wiphy_read_of_freq_limits); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 5dbac3749738..753efcd51fa3 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -748,21 +748,6 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd) return true; } -static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range, - u32 center_freq_khz, u32 bw_khz) -{ - u32 start_freq_khz, end_freq_khz; - - start_freq_khz = center_freq_khz - (bw_khz/2); - end_freq_khz = center_freq_khz + (bw_khz/2); - - if (start_freq_khz >= freq_range->start_freq_khz && - end_freq_khz <= freq_range->end_freq_khz) - return true; - - return false; -} - /** * freq_in_rule_band - tells us if a frequency is in a frequency band * @freq_range: frequency rule we want to query @@ -1070,7 +1055,7 @@ freq_reg_info_regd(u32 center_freq, if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, 
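The new wiphy_read_of_freq_limits() above disables any channel that does not fit one of the "ieee80211-freq-limit" ranges found in the device's OF node. A sketch of where a driver might call it; the function name and the ordering (bands populated first, registration afterwards) are assumptions for illustration.

#include <net/cfg80211.h>

/* Hypothetical driver setup: apply DT frequency limits before registering. */
static int example_register_wiphy(struct wiphy *wiphy)
{
        /* wiphy->bands must already be filled in so that out-of-range
         * channels can be marked IEEE80211_CHAN_DISABLED */
        wiphy_read_of_freq_limits(wiphy);

        return wiphy_register(wiphy);
}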
center_freq); - bw_fits = reg_does_bw_fit(fr, center_freq, bw); + bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw); if (band_rule_found && bw_fits) return rr; @@ -1138,11 +1123,13 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); /* If we get a reg_rule we can assume that at least 5Mhz fit */ - if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), - MHZ_TO_KHZ(10))) + if (!cfg80211_does_bw_fit_range(freq_range, + MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(10))) bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), - MHZ_TO_KHZ(20))) + if (!cfg80211_does_bw_fit_range(freq_range, + MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(10)) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 35ad69fd0838..21be56b3128e 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -227,7 +227,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, ASSERT_RTNL(); if (rdev->scan_msg) { - nl80211_send_scan_result(rdev, rdev->scan_msg); + nl80211_send_scan_msg(rdev, rdev->scan_msg); rdev->scan_msg = NULL; return; } @@ -273,7 +273,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (!send_message) rdev->scan_msg = msg; else - nl80211_send_scan_result(rdev, msg); + nl80211_send_scan_msg(rdev, msg); } void __cfg80211_scan_done(struct work_struct *wk) @@ -321,7 +321,8 @@ void __cfg80211_sched_scan_results(struct work_struct *wk) spin_unlock_bh(&rdev->bss_lock); request->scan_start = jiffies; } - nl80211_send_sched_scan_results(rdev, request->dev); + nl80211_send_sched_scan(rdev, request->dev, + NL80211_CMD_SCHED_SCAN_RESULTS); } rtnl_unlock(); @@ -1147,7 +1148,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, else rcu_assign_pointer(tmp.pub.beacon_ies, ies); rcu_assign_pointer(tmp.pub.ies, ies); - + memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN); tmp.pub.channel = channel; tmp.pub.scan_width = data->scan_width; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 5e0d19380302..b347e63d7aaa 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -34,10 +34,11 @@ struct cfg80211_conn { CFG80211_CONN_SCAN_AGAIN, CFG80211_CONN_AUTHENTICATE_NEXT, CFG80211_CONN_AUTHENTICATING, - CFG80211_CONN_AUTH_FAILED, + CFG80211_CONN_AUTH_FAILED_TIMEOUT, CFG80211_CONN_ASSOCIATE_NEXT, CFG80211_CONN_ASSOCIATING, CFG80211_CONN_ASSOC_FAILED, + CFG80211_CONN_ASSOC_FAILED_TIMEOUT, CFG80211_CONN_DEAUTH, CFG80211_CONN_ABANDON, CFG80211_CONN_CONNECTED, @@ -140,7 +141,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) return err; } -static int cfg80211_conn_do_work(struct wireless_dev *wdev) +static int cfg80211_conn_do_work(struct wireless_dev *wdev, + enum nl80211_timeout_reason *treason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; @@ -171,7 +173,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) NULL, 0, params->key, params->key_len, params->key_idx, NULL, 0); - case CFG80211_CONN_AUTH_FAILED: + case CFG80211_CONN_AUTH_FAILED_TIMEOUT: + *treason = NL80211_TIMEOUT_AUTH; return -ENOTCONN; case CFG80211_CONN_ASSOCIATE_NEXT: if (WARN_ON(!rdev->ops->assoc)) @@ -198,6 +201,9 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) WLAN_REASON_DEAUTH_LEAVING, false); return err; + case CFG80211_CONN_ASSOC_FAILED_TIMEOUT: + *treason = 
NL80211_TIMEOUT_ASSOC; + /* fall through */ case CFG80211_CONN_ASSOC_FAILED: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, @@ -223,6 +229,7 @@ void cfg80211_conn_work(struct work_struct *work) container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; u8 bssid_buf[ETH_ALEN], *bssid = NULL; + enum nl80211_timeout_reason treason; rtnl_lock(); @@ -244,10 +251,12 @@ void cfg80211_conn_work(struct work_struct *work) memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); bssid = bssid_buf; } - if (cfg80211_conn_do_work(wdev)) { + treason = NL80211_TIMEOUT_UNSPECIFIED; + if (cfg80211_conn_do_work(wdev, &treason)) { __cfg80211_connect_result( wdev->netdev, bssid, - NULL, 0, NULL, 0, -1, false, NULL); + NULL, 0, NULL, 0, -1, false, NULL, + treason); } wdev_unlock(wdev); } @@ -352,7 +361,8 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) } else if (status_code != WLAN_STATUS_SUCCESS) { __cfg80211_connect_result(wdev->netdev, mgmt->bssid, NULL, 0, NULL, 0, - status_code, false, NULL); + status_code, false, NULL, + NL80211_TIMEOUT_UNSPECIFIED); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); @@ -400,7 +410,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) if (!wdev->conn) return; - wdev->conn->state = CFG80211_CONN_AUTH_FAILED; + wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } @@ -422,7 +432,7 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) if (!wdev->conn) return; - wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; + wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } @@ -564,7 +574,9 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, /* we're good if we have a matching bss struct */ if (bss) { - err = cfg80211_conn_do_work(wdev); + enum nl80211_timeout_reason treason; + + err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { /* otherwise we'll need to scan for the AP first */ @@ -661,7 +673,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, int status, bool wextev, - struct cfg80211_bss *bss) + struct cfg80211_bss *bss, + enum nl80211_timeout_reason timeout_reason) { struct wireless_dev *wdev = dev->ieee80211_ptr; const u8 *country_ie; @@ -680,7 +693,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, bssid, req_ie, req_ie_len, resp_ie, resp_ie_len, - status, GFP_KERNEL); + status, timeout_reason, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (wextev) { @@ -727,6 +740,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, kzfree(wdev->connect_keys); wdev->connect_keys = NULL; wdev->ssid_len = 0; + wdev->conn_owner_nlportid = 0; if (bss) { cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wdev->wiphy, bss); @@ -770,7 +784,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, struct cfg80211_bss *bss, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, int status, gfp_t gfp) + size_t resp_ie_len, int status, gfp_t gfp, + enum nl80211_timeout_reason timeout_reason) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct 
cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -810,6 +825,7 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, cfg80211_hold_bss(bss_from_pub(bss)); ev->cr.bss = bss; ev->cr.status = status; + ev->cr.timeout_reason = timeout_reason; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); @@ -955,6 +971,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, wdev->current_bss = NULL; wdev->ssid_len = 0; + wdev->conn_owner_nlportid = 0; nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); @@ -1098,6 +1115,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, kzfree(wdev->connect_keys); wdev->connect_keys = NULL; + wdev->conn_owner_nlportid = 0; + if (wdev->conn) err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) @@ -1107,3 +1126,32 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, return err; } + +/* + * Used to clean up after the connection / connection attempt owner socket + * disconnects + */ +void cfg80211_autodisconnect_wk(struct work_struct *work) +{ + struct wireless_dev *wdev = + container_of(work, struct wireless_dev, disconnect_wk); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + + wdev_lock(wdev); + + if (wdev->conn_owner_nlportid) { + /* + * Use disconnect_bssid if still connecting and ops->disconnect + * not implemented. Otherwise we can use cfg80211_disconnect. + */ + if (rdev->ops->disconnect || wdev->current_bss) + cfg80211_disconnect(rdev, wdev->netdev, + WLAN_REASON_DEAUTH_LEAVING, true); + else + cfg80211_mlme_deauth(rdev, wdev->netdev, + wdev->disconnect_bssid, NULL, 0, + WLAN_REASON_DEAUTH_LEAVING, false); + } + + wdev_unlock(wdev); +} diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 14b3f007826d..16b6b5988be9 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -39,9 +39,11 @@ SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); static ssize_t name_show(struct device *dev, struct device_attribute *attr, - char *buf) { + char *buf) +{ struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; - return sprintf(buf, "%s\n", dev_name(&wiphy->dev)); + + return sprintf(buf, "%s\n", wiphy_name(wiphy)); } static DEVICE_ATTR_RO(name); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ea1b47e04fa4..776e80cef9b4 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1915,18 +1915,18 @@ TRACE_EVENT(rdev_start_nan, WIPHY_ENTRY WDEV_ENTRY __field(u8, master_pref) - __field(u8, dual); + __field(u8, bands); ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->master_pref = conf->master_pref; - __entry->dual = conf->dual; + __entry->bands = conf->bands; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT - ", master preference: %u, dual: %d", + ", master preference: %u, bands: 0x%0x", WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, - __entry->dual) + __entry->bands) ); TRACE_EVENT(rdev_nan_change_conf, @@ -1937,20 +1937,20 @@ TRACE_EVENT(rdev_nan_change_conf, WIPHY_ENTRY WDEV_ENTRY __field(u8, master_pref) - __field(u8, dual); + __field(u8, bands); __field(u32, changes); ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->master_pref = conf->master_pref; - __entry->dual = conf->dual; + __entry->bands = conf->bands; __entry->changes = changes; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT - ", master preference: %u, dual: %d, changes: %x", + ", master preference: %u, bands: 0x%0x, changes: %x", WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, - __entry->dual, 
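cfg80211_connect_bss() above grows an nl80211_timeout_reason argument so userspace can tell an authentication timeout from an association timeout. A hedged sketch of a driver reporting a failed attempt; the helper name is hypothetical, and a status of -1 marks a timeout, matching the SME path above.

#include <net/cfg80211.h>

/* Illustrative: report that an association attempt to @bssid timed out. */
static void example_report_assoc_timeout(struct net_device *dev, const u8 *bssid)
{
        cfg80211_connect_bss(dev, bssid, NULL, NULL, 0, NULL, 0, -1,
                             GFP_KERNEL, NL80211_TIMEOUT_ASSOC);
}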
__entry->changes) + __entry->bands, __entry->changes) ); DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan, @@ -2490,18 +2490,21 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, TRACE_EVENT(cfg80211_cqm_rssi_notify, TP_PROTO(struct net_device *netdev, - enum nl80211_cqm_rssi_threshold_event rssi_event), - TP_ARGS(netdev, rssi_event), + enum nl80211_cqm_rssi_threshold_event rssi_event, + s32 rssi_level), + TP_ARGS(netdev, rssi_event, rssi_level), TP_STRUCT__entry( NETDEV_ENTRY __field(enum nl80211_cqm_rssi_threshold_event, rssi_event) + __field(s32, rssi_level) ), TP_fast_assign( NETDEV_ASSIGN; __entry->rssi_event = rssi_event; + __entry->rssi_level = rssi_level; ), - TP_printk(NETDEV_PR_FMT ", rssi event: %d", - NETDEV_PR_ARG, __entry->rssi_event) + TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d", + NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level) ); TRACE_EVENT(cfg80211_reg_can_beacon, diff --git a/net/wireless/util.c b/net/wireless/util.c index e9d040d29846..68e5f2ecee1a 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -114,8 +114,7 @@ int ieee80211_frequency_to_channel(int freq) } EXPORT_SYMBOL(ieee80211_frequency_to_channel); -struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, - int freq) +struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; @@ -135,14 +134,13 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, return NULL; } -EXPORT_SYMBOL(__ieee80211_get_channel); +EXPORT_SYMBOL(ieee80211_get_channel); -static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, - enum nl80211_band band) +static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { int i, want; - switch (band) { + switch (sband->band) { case NL80211_BAND_5GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { @@ -192,6 +190,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; case NUM_NL80211_BANDS: + default: WARN_ON(1); break; } @@ -203,7 +202,7 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy) for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) - set_mandatory_flags_band(wiphy->bands[band], band); + set_mandatory_flags_band(wiphy->bands[band]); } bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) @@ -619,8 +618,6 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) return -ENOMEM; - - skb->truesize += head_need; } if (encaps_data) { @@ -952,7 +949,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) ev->cr.resp_ie, ev->cr.resp_ie_len, ev->cr.status, ev->cr.status == WLAN_STATUS_SUCCESS, - ev->cr.bss); + ev->cr.bss, ev->cr.timeout_reason); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie, @@ -1848,6 +1845,21 @@ void cfg80211_free_nan_func(struct cfg80211_nan_func *f) } EXPORT_SYMBOL(cfg80211_free_nan_func); +bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, + u32 center_freq_khz, u32 bw_khz) +{ + u32 start_freq_khz, end_freq_khz; + + start_freq_khz = center_freq_khz - (bw_khz / 2); + end_freq_khz = center_freq_khz + (bw_khz / 2); + + if (start_freq_khz >= freq_range->start_freq_khz && + end_freq_khz <= freq_range->end_freq_khz) + return true; + + return false; +} + /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 
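cfg80211_does_bw_fit_range() above generalises the old reg_does_bw_fit() so the same check can be shared with of.c. A small worked example with assumed numbers: a 40 MHz channel centred on 2437 MHz tested against a 2400-2483.5 MHz rule.

#include <net/cfg80211.h>

static bool example_fits_2g_rule(void)
{
        const struct ieee80211_freq_range range = {
                .start_freq_khz = MHZ_TO_KHZ(2400),
                .end_freq_khz   = 2483500,
        };

        /* 2437 MHz +/- 20 MHz lies within 2400-2483.5 MHz, so this is true */
        return cfg80211_does_bw_fit_range(&range, MHZ_TO_KHZ(2437),
                                          MHZ_TO_KHZ(40));
}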
for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 6250b1cfcde5..1a4db6790e20 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -1119,3 +1119,70 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, return ret; } #endif + +char *iwe_stream_add_event(struct iw_request_info *info, char *stream, + char *ends, struct iw_event *iwe, int event_len) +{ + int lcp_len = iwe_stream_lcp_len(info); + + event_len = iwe_stream_event_len_adjust(info, event_len); + + /* Check if it's possible */ + if (likely((stream + event_len) < ends)) { + iwe->len = event_len; + /* Beware of alignement issues on 64 bits */ + memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); + memcpy(stream + lcp_len, &iwe->u, + event_len - lcp_len); + stream += event_len; + } + + return stream; +} +EXPORT_SYMBOL(iwe_stream_add_event); + +char *iwe_stream_add_point(struct iw_request_info *info, char *stream, + char *ends, struct iw_event *iwe, char *extra) +{ + int event_len = iwe_stream_point_len(info) + iwe->u.data.length; + int point_len = iwe_stream_point_len(info); + int lcp_len = iwe_stream_lcp_len(info); + + /* Check if it's possible */ + if (likely((stream + event_len) < ends)) { + iwe->len = event_len; + memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); + memcpy(stream + lcp_len, + ((char *) &iwe->u) + IW_EV_POINT_OFF, + IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); + if (iwe->u.data.length && extra) + memcpy(stream + point_len, extra, iwe->u.data.length); + stream += event_len; + } + + return stream; +} +EXPORT_SYMBOL(iwe_stream_add_point); + +char *iwe_stream_add_value(struct iw_request_info *info, char *event, + char *value, char *ends, struct iw_event *iwe, + int event_len) +{ + int lcp_len = iwe_stream_lcp_len(info); + + /* Don't duplicate LCP */ + event_len -= IW_EV_LCP_LEN; + + /* Check if it's possible */ + if (likely((value + event_len) < ends)) { + /* Add new value */ + memcpy(value, &iwe->u, event_len); + value += event_len; + /* Patch LCP */ + iwe->len = value - event; + memcpy(event, (char *) iwe, lcp_len); + } + + return value; +} +EXPORT_SYMBOL(iwe_stream_add_value); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 995163830a61..c434f193f39a 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -105,30 +105,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev, goto out; } - wdev->wext.connect.channel = chan; - - /* - * SSID is not set, we just want to switch monitor channel, - * this is really just backward compatibility, if the SSID - * is set then we use the channel to select the BSS to use - * to connect to instead. If we were connected on another - * channel we disconnected above and reconnect below. 
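The iwe_stream_add_*() helpers above are moved out of line into wext-core.c and exported. A sketch of the typical caller pattern in a wext scan handler; the function name and the frequency value are illustrative.

#include <linux/wireless.h>
#include <net/iw_handler.h>

/* Illustrative: append a SIOCGIWFREQ event (channel frequency in MHz) to a
 * wext event stream and return the advanced stream pointer. */
static char *example_add_freq_event(struct iw_request_info *info, char *stream,
                                    char *ends, int freq_mhz)
{
        struct iw_event iwe = {};

        iwe.cmd = SIOCGIWFREQ;
        iwe.u.freq.m = freq_mhz;
        iwe.u.freq.e = 6;       /* exponent: the value is given in MHz */

        return iwe_stream_add_event(info, stream, ends, &iwe, IW_EV_FREQ_LEN);
}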
- */ - if (chan && !wdev->wext.connect.ssid_len) { - struct cfg80211_chan_def chandef = { - .width = NL80211_CHAN_WIDTH_20_NOHT, - .center_freq1 = freq, - }; - - chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (chandef.chan) - err = cfg80211_set_monitor_channel(rdev, &chandef); - else - err = -EINVAL; - goto out; - } - err = cfg80211_mgd_wext_connect(rdev, wdev); out: wdev_unlock(wdev); diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index bda1a13628a8..286ed25c1a69 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -4,6 +4,11 @@ config XFRM bool depends on NET + select GRO_CELLS + +config XFRM_OFFLOAD + bool + depends on XFRM config XFRM_ALGO tristate diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 6e3f0254d8a1..46bdb4fbed0b 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -19,16 +19,18 @@ static struct kmem_cache *secpath_cachep __read_mostly; static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); -static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO]; +static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; -int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) +static struct gro_cells gro_cells; +static struct net_device xfrm_napi_dev; + +int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; - if (unlikely(afinfo == NULL)) - return -EINVAL; - if (unlikely(afinfo->family >= NPROTO)) + if (WARN_ON(afinfo->family >= ARRAY_SIZE(xfrm_input_afinfo))) return -EAFNOSUPPORT; + spin_lock_bh(&xfrm_input_afinfo_lock); if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) err = -EEXIST; @@ -39,14 +41,10 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_input_register_afinfo); -int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo) +int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; - if (unlikely(afinfo == NULL)) - return -EINVAL; - if (unlikely(afinfo->family >= NPROTO)) - return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) { if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo)) @@ -60,12 +58,13 @@ int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_input_unregister_afinfo); -static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family) +static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family) { - struct xfrm_input_afinfo *afinfo; + const struct xfrm_input_afinfo *afinfo; - if (unlikely(family >= NPROTO)) + if (WARN_ON_ONCE(family >= ARRAY_SIZE(xfrm_input_afinfo))) return NULL; + rcu_read_lock(); afinfo = rcu_dereference(xfrm_input_afinfo[family]); if (unlikely(!afinfo)) @@ -73,22 +72,17 @@ static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family) return afinfo; } -static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo) -{ - rcu_read_unlock(); -} - static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, int err) { int ret; - struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family); + const struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family); if (!afinfo) return -EAFNOSUPPORT; ret = afinfo->callback(skb, protocol, err); - xfrm_input_put_afinfo(afinfo); + rcu_read_unlock(); return ret; } @@ -111,6 +105,8 @@ struct sec_path *secpath_dup(struct sec_path *src) return NULL; sp->len = 0; + sp->olen = 0; + if (src) { int i; @@ -123,6 +119,24 @@ struct sec_path *secpath_dup(struct 
sec_path *src) } EXPORT_SYMBOL(secpath_dup); +int secpath_set(struct sk_buff *skb) +{ + struct sec_path *sp; + + /* Allocate new secpath or COW existing one. */ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + sp = secpath_dup(skb->sp); + if (!sp) + return -ENOMEM; + + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + return 0; +} +EXPORT_SYMBOL(secpath_set); + /* Fetch spi and seq from ipsec header */ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) @@ -158,6 +172,7 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq); return 0; } +EXPORT_SYMBOL(xfrm_parse_spi); int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) { @@ -192,14 +207,23 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) unsigned int family; int decaps = 0; int async = 0; + struct xfrm_offload *xo; + bool xfrm_gro = false; - /* A negative encap_type indicates async resumption. */ if (encap_type < 0) { - async = 1; x = xfrm_input_state(skb); - seq = XFRM_SKB_CB(skb)->seq.input.low; family = x->outer_mode->afinfo->family; - goto resume; + + /* An encap_type of -1 indicates async resumption. */ + if (encap_type == -1) { + async = 1; + seq = XFRM_SKB_CB(skb)->seq.input.low; + goto resume; + } + /* encap_type < -1 indicates a GRO call. */ + encap_type = 0; + seq = XFRM_SPI_SKB_CB(skb)->seq; + goto lock; } daddr = (xfrm_address_t *)(skb_network_header(skb) + @@ -218,18 +242,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) break; } - /* Allocate new secpath or COW existing one. */ - if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { - struct sec_path *sp; - - sp = secpath_dup(skb->sp); - if (!sp) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); - goto drop; - } - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; + err = secpath_set(skb); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); + goto drop; } seq = 0; @@ -253,6 +269,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->sp->xvec[skb->sp->len++] = x; +lock: spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { @@ -371,10 +388,21 @@ resume: if (decaps) { skb_dst_drop(skb); - netif_rx(skb); + gro_cells_receive(&gro_cells, skb); return 0; } else { - return x->inner_mode->afinfo->transport_finish(skb, async); + xo = xfrm_offload(skb); + if (xo) + xfrm_gro = xo->flags & XFRM_GRO; + + err = x->inner_mode->afinfo->transport_finish(skb, async); + if (xfrm_gro) { + skb_dst_drop(skb); + gro_cells_receive(&gro_cells, skb); + return err; + } + + return err; } drop_unlock: @@ -394,6 +422,13 @@ EXPORT_SYMBOL(xfrm_input_resume); void __init xfrm_input_init(void) { + int err; + + init_dummy_netdev(&xfrm_napi_dev); + err = gro_cells_init(&gro_cells, &xfrm_napi_dev); + if (err) + gro_cells.cells = NULL; + secpath_cachep = kmem_cache_create("secpath_cache", sizeof(struct sec_path), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 637387bbaaea..8ba29fe58352 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -246,10 +246,8 @@ void xfrm_local_error(struct sk_buff *skb, int mtu) return; afinfo = xfrm_state_get_afinfo(proto); - if (!afinfo) - return; - - afinfo->local_error(skb, mtu); - xfrm_state_put_afinfo(afinfo); + if (afinfo) + afinfo->local_error(skb, mtu); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(xfrm_local_error); diff --git a/net/xfrm/xfrm_policy.c 
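xfrm_input() above learns to distinguish async resumption (encap_type == -1) from GRO resumption (encap_type < -1), and secpath_set() factors out the secpath allocate-or-COW step. A hedged sketch of how a GRO receive path might hand a decapsulated ESP packet back to xfrm_input(); the function name is hypothetical, the state @x is assumed to have been looked up (with a reference held) by the caller, and seq is assumed to have been parsed from the ESP header.

#include <linux/in.h>
#include <net/xfrm.h>

/* Illustrative GRO resume: encap_type -2 (any value < -1) marks the GRO path. */
static int example_esp_gro_resume(struct sk_buff *skb, struct xfrm_state *x,
                                  __be32 spi, __be32 seq)
{
        int err = secpath_set(skb);

        if (err)
                return err;

        /* xfrm_input() expects the looked-up state at the top of the secpath
         * when it is re-entered with encap_type < -1 */
        skb->sp->xvec[skb->sp->len++] = x;
        XFRM_SPI_SKB_CB(skb)->seq = seq;

        return xfrm_input(skb, IPPROTO_ESP, spi, -2);
}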
b/net/xfrm/xfrm_policy.c index 177e208e8ff5..0806dccdf507 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -45,7 +45,7 @@ struct xfrm_flo { }; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); -static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] +static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __read_mostly; @@ -103,11 +103,11 @@ bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl return false; } -static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) +static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) { - struct xfrm_policy_afinfo *afinfo; + const struct xfrm_policy_afinfo *afinfo; - if (unlikely(family >= NPROTO)) + if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_policy_afinfo[family]); @@ -116,18 +116,13 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) return afinfo; } -static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) -{ - rcu_read_unlock(); -} - static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family) { - struct xfrm_policy_afinfo *afinfo; + const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; afinfo = xfrm_policy_get_afinfo(family); @@ -136,7 +131,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); - xfrm_policy_put_afinfo(afinfo); + rcu_read_unlock(); return dst; } @@ -330,7 +325,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) } EXPORT_SYMBOL(xfrm_policy_destroy); -/* Rule must be locked. Release descentant resources, announce +/* Rule must be locked. Release descendant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ @@ -1431,12 +1426,12 @@ xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, xfrm_address_t *remote, unsigned short family) { int err; - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; err = afinfo->get_saddr(net, oif, local, remote); - xfrm_policy_put_afinfo(afinfo); + rcu_read_unlock(); return err; } @@ -1538,21 +1533,15 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, } -/* Check that the bundle accepts the flow and its components are - * still valid. - */ - -static inline int xfrm_get_tos(const struct flowi *fl, int family) +static int xfrm_get_tos(const struct flowi *fl, int family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - int tos; - - if (!afinfo) - return -EINVAL; + const struct xfrm_policy_afinfo *afinfo; + int tos = 0; - tos = afinfo->get_tos(fl); + afinfo = xfrm_policy_get_afinfo(family); + tos = afinfo ? 
afinfo->get_tos(fl) : 0; - xfrm_policy_put_afinfo(afinfo); + rcu_read_unlock(); return tos; } @@ -1609,7 +1598,7 @@ static const struct flow_cache_ops xfrm_bundle_fc_ops = { static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_ops *dst_ops; struct xfrm_dst *xdst; @@ -1638,7 +1627,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) } else xdst = ERR_PTR(-ENOBUFS); - xfrm_policy_put_afinfo(afinfo); + rcu_read_unlock(); return xdst; } @@ -1646,7 +1635,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { - struct xfrm_policy_afinfo *afinfo = + const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(dst->ops->family); int err; @@ -1655,7 +1644,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, err = afinfo->init_path(path, dst, nfheader_len); - xfrm_policy_put_afinfo(afinfo); + rcu_read_unlock(); return err; } @@ -1663,7 +1652,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct xfrm_policy_afinfo *afinfo = + const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(xdst->u.dst.ops->family); int err; @@ -1672,7 +1661,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, err = afinfo->fill_dst(xdst, dev, fl); - xfrm_policy_put_afinfo(afinfo); + rcu_read_unlock(); return err; } @@ -1705,9 +1694,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, xfrm_flowi_addr_get(fl, &saddr, &daddr, family); tos = xfrm_get_tos(fl, family); - err = tos; - if (tos < 0) - goto put_states; dst_hold(dst); @@ -2215,7 +2201,7 @@ error: static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_entry *ret; if (!afinfo) { @@ -2224,7 +2210,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, } else { ret = afinfo->blackhole_route(net, dst_orig); } - xfrm_policy_put_afinfo(afinfo); + rcu_read_unlock(); return ret; } @@ -2466,7 +2452,7 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); int err; if (unlikely(afinfo == NULL)) @@ -2474,7 +2460,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, afinfo->decode_session(skb, fl, reverse); err = security_xfrm_decode_session(skb, &fl->flowi_secid); - xfrm_policy_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(__xfrm_decode_session); @@ -2742,10 +2728,11 @@ void xfrm_garbage_collect(struct net *net) } EXPORT_SYMBOL(xfrm_garbage_collect); -static void xfrm_garbage_collect_deferred(struct net *net) +void xfrm_garbage_collect_deferred(struct net *net) { flow_cache_flush_deferred(net); } +EXPORT_SYMBOL(xfrm_garbage_collect_deferred); static void xfrm_init_pmtu(struct dst_entry *dst) { 
@@ -2849,22 +2836,52 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst) return mtu ? : dst_mtu(dst->path); } +static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, + const void *daddr) +{ + const struct dst_entry *path = dst->path; + + for (; dst != path; dst = dst->child) { + const struct xfrm_state *xfrm = dst->xfrm; + + if (xfrm->props.mode == XFRM_MODE_TRANSPORT) + continue; + if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) + daddr = xfrm->coaddr; + else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) + daddr = &xfrm->id.daddr; + } + return daddr; +} + static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { - return dst->path->ops->neigh_lookup(dst, skb, daddr); + const struct dst_entry *path = dst->path; + + if (!skb) + daddr = xfrm_get_dst_nexthop(dst, daddr); + return path->ops->neigh_lookup(path, skb, daddr); +} + +static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) +{ + const struct dst_entry *path = dst->path; + + daddr = xfrm_get_dst_nexthop(dst, daddr); + path->ops->confirm_neigh(path, daddr); } -int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) +int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family) { int err = 0; - if (unlikely(afinfo == NULL)) - return -EINVAL; - if (unlikely(afinfo->family >= NPROTO)) + + if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return -EAFNOSUPPORT; + spin_lock(&xfrm_policy_afinfo_lock); - if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) + if (unlikely(xfrm_policy_afinfo[family] != NULL)) err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; @@ -2882,9 +2899,9 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; - if (likely(afinfo->garbage_collect == NULL)) - afinfo->garbage_collect = xfrm_garbage_collect_deferred; - rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo); + if (likely(!dst_ops->confirm_neigh)) + dst_ops->confirm_neigh = xfrm_confirm_neigh; + rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo); } spin_unlock(&xfrm_policy_afinfo_lock); @@ -2892,34 +2909,24 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_policy_register_afinfo); -int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) +void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) { - int err = 0; - if (unlikely(afinfo == NULL)) - return -EINVAL; - if (unlikely(afinfo->family >= NPROTO)) - return -EAFNOSUPPORT; - spin_lock(&xfrm_policy_afinfo_lock); - if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { - if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) - err = -EINVAL; - else - RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family], - NULL); + struct dst_ops *dst_ops = afinfo->dst_ops; + int i; + + for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { + if (xfrm_policy_afinfo[i] != afinfo) + continue; + RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); + break; } - spin_unlock(&xfrm_policy_afinfo_lock); - if (!err) { - struct dst_ops *dst_ops = afinfo->dst_ops; - synchronize_rcu(); + synchronize_rcu(); - dst_ops->kmem_cachep = NULL; - dst_ops->check = NULL; - dst_ops->negative_advice = NULL; - dst_ops->link_failure = NULL; - afinfo->garbage_collect = NULL; - } - return err; + dst_ops->kmem_cachep = NULL; + dst_ops->check = NULL; + 
dst_ops->negative_advice = NULL; + dst_ops->link_failure = NULL; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 64e3c82eedf6..5a597dbbe564 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -192,7 +192,7 @@ int xfrm_register_type(const struct xfrm_type *type, unsigned short family) else err = -EEXIST; spin_unlock_bh(&xfrm_type_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type); @@ -213,7 +213,7 @@ int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) else typemap[type->proto] = NULL; spin_unlock_bh(&xfrm_type_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_unregister_type); @@ -231,17 +231,18 @@ retry: return NULL; typemap = afinfo->type_map; - type = typemap[proto]; + type = READ_ONCE(typemap[proto]); if (unlikely(type && !try_module_get(type->owner))) type = NULL; + + rcu_read_unlock(); + if (!type && !modload_attempted) { - xfrm_state_put_afinfo(afinfo); request_module("xfrm-type-%d-%d", family, proto); modload_attempted = 1; goto retry; } - xfrm_state_put_afinfo(afinfo); return type; } @@ -280,7 +281,7 @@ int xfrm_register_mode(struct xfrm_mode *mode, int family) out: spin_unlock_bh(&xfrm_mode_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_mode); @@ -308,7 +309,7 @@ int xfrm_unregister_mode(struct xfrm_mode *mode, int family) } spin_unlock_bh(&xfrm_mode_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_unregister_mode); @@ -327,17 +328,17 @@ retry: if (unlikely(afinfo == NULL)) return NULL; - mode = afinfo->mode_map[encap]; + mode = READ_ONCE(afinfo->mode_map[encap]); if (unlikely(mode && !try_module_get(mode->owner))) mode = NULL; + + rcu_read_unlock(); if (!mode && !modload_attempted) { - xfrm_state_put_afinfo(afinfo); request_module("xfrm-mode-%d-%d", family, encap); modload_attempted = 1; goto retry; } - xfrm_state_put_afinfo(afinfo); return mode; } @@ -409,7 +410,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) if (x->xflags & XFRM_SOFT_EXPIRE) { /* enter hard expire without soft expire first?! * setting a new date could trigger this. 
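xfrm_policy_register_afinfo() above now takes the address family explicitly instead of reading afinfo->family, and unregistration becomes void. A sketch of a per-family module registering itself under the new signature; the afinfo symbol is assumed to be fully populated (dst_ops and callbacks) elsewhere.

#include <linux/init.h>
#include <linux/socket.h>
#include <net/xfrm.h>

/* Assumed to be defined and filled in by the address-family module. */
extern const struct xfrm_policy_afinfo example_xfrm4_policy_afinfo;

static int __init example_xfrm4_policy_init(void)
{
        return xfrm_policy_register_afinfo(&example_xfrm4_policy_afinfo, AF_INET);
}

static void __exit example_xfrm4_policy_fini(void)
{
        xfrm_policy_unregister_afinfo(&example_xfrm4_policy_afinfo);
}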
- * workarbound: fix x->curflt.add_time by below: + * workaround: fix x->curflt.add_time by below: */ x->curlft.add_time = now - x->saved_tmo - 1; tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; @@ -639,26 +640,25 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) } EXPORT_SYMBOL(xfrm_sad_getinfo); -static int +static void xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { - struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family); + if (!afinfo) - return -1; + return; + afinfo->init_tempsel(&x->sel, fl); if (family != tmpl->encap_family) { - xfrm_state_put_afinfo(afinfo); - afinfo = xfrm_state_get_afinfo(tmpl->encap_family); + afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family); if (!afinfo) - return -1; + return; } afinfo->init_temprop(x, tmpl, daddr, saddr); - xfrm_state_put_afinfo(afinfo); - return 0; } static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, @@ -1474,7 +1474,7 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, if (afinfo->tmpl_sort) err = afinfo->tmpl_sort(dst, src, n); spin_unlock_bh(&net->xfrm.xfrm_state_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_tmpl_sort); @@ -1494,7 +1494,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, if (afinfo->state_sort) err = afinfo->state_sort(dst, src, n); spin_unlock_bh(&net->xfrm.xfrm_state_lock); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_state_sort); @@ -1932,10 +1932,10 @@ EXPORT_SYMBOL(xfrm_unregister_km); int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) { int err = 0; - if (unlikely(afinfo == NULL)) - return -EINVAL; - if (unlikely(afinfo->family >= NPROTO)) + + if (WARN_ON(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; + spin_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) err = -EEXIST; @@ -1948,14 +1948,14 @@ EXPORT_SYMBOL(xfrm_state_register_afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) { - int err = 0; - if (unlikely(afinfo == NULL)) - return -EINVAL; - if (unlikely(afinfo->family >= NPROTO)) + int err = 0, family = afinfo->family; + + if (WARN_ON(family >= NPROTO)) return -EAFNOSUPPORT; + spin_lock_bh(&xfrm_state_afinfo_lock); if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { - if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) + if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo) err = -EINVAL; else RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL); @@ -1966,6 +1966,14 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); +struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family) +{ + if (unlikely(family >= NPROTO)) + return NULL; + + return rcu_dereference(xfrm_state_afinfo[family]); +} + struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) { struct xfrm_state_afinfo *afinfo; @@ -1978,11 +1986,6 @@ struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) return afinfo; } -void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) -{ - rcu_read_unlock(); -} - /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ void xfrm_state_delete_tunnel(struct xfrm_state *x) { @@ -2000,16 +2003,13 @@ 
EXPORT_SYMBOL(xfrm_state_delete_tunnel); int xfrm_state_mtu(struct xfrm_state *x, int mtu) { - int res; + const struct xfrm_type *type = READ_ONCE(x->type); - spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_VALID && - x->type && x->type->get_mtu) - res = x->type->get_mtu(x, mtu); - else - res = mtu - x->props.header_len; - spin_unlock_bh(&x->lock); - return res; + type && type->get_mtu) + return type->get_mtu(x, mtu); + + return mtu - x->props.header_len; } int __xfrm_init_state(struct xfrm_state *x, bool init_replay) @@ -2028,7 +2028,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay) if (afinfo->init_flags) err = afinfo->init_flags(x); - xfrm_state_put_afinfo(afinfo); + rcu_read_unlock(); if (err) goto error; |
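The new xfrm_state_afinfo_get_rcu() above returns the per-family state afinfo without taking the RCU read lock itself (xfrm_state_get_afinfo() still does), so callers are expected to bracket the lookup. A sketch of that contract; the function name is illustrative and the locking requirement is an assumption drawn from the rcu_dereference() in its body.

#include <linux/rcupdate.h>
#include <net/xfrm.h>

static bool example_family_has_state_afinfo(unsigned int family)
{
        struct xfrm_state_afinfo *afinfo;
        bool registered;

        rcu_read_lock();
        afinfo = xfrm_state_afinfo_get_rcu(family);
        registered = afinfo != NULL;
        rcu_read_unlock();

        return registered;
}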