From e9a21949b79414dda42a017855b288901c07e613 Mon Sep 17 00:00:00 2001
From: Luciano Coelho <luciano.coelho@intel.com>
Date: Wed, 8 Oct 2014 09:48:36 +0300
Subject: mac80211: add extended channel switching capability if the driver
 supports CSA

The Extended Channel Switching capability bit in the extended
capabilities element must be set if the driver supports CSA on
non-beaconing interfaces.

Since this capability needs to be set during driver registration, the
extended_capabiliities global variable needs to be moved to the local
structure so that it can be modified.

Signed-off-by: Luciano Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index b1be39c76931..5fab17b382b5 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1998,6 +1998,11 @@ enum ieee80211_tdls_actioncode {
 	WLAN_TDLS_DISCOVERY_REQUEST = 10,
 };
 
+/* Extended Channel Switching capability to be set in the 1st byte of
+ * the @WLAN_EID_EXT_CAPABILITY information element
+ */
+#define WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING	BIT(2)
+
 /* Interworking capabilities are set in 7th bit of 4th byte of the
  * @WLAN_EID_EXT_CAPABILITY information element
  */
-- 
cgit v1.2.3-58-ga151


From 6b358aedced8180830727258718c3916bef3e249 Mon Sep 17 00:00:00 2001
From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Wed, 22 Oct 2014 20:26:44 +0200
Subject: phy: marvell: Add support for 88E3016 FastEthernet PHY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Marvell 88E3016 is a FastEthernet PHY that also can be found in Marvell
Berlin SoCs as integrated PHY.

Tested-by: Antoine Ténart <antoine.tenart@free-electrons.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell.c   | 46 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/marvell_phy.h |  1 +
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index bd37e45c89c0..d2b2f2f795d5 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -118,6 +118,9 @@
 
 #define MII_M1116R_CONTROL_REG_MAC	21
 
+#define MII_88E3016_PHY_SPEC_CTRL	0x10
+#define MII_88E3016_DISABLE_SCRAMBLER	0x0200
+#define MII_88E3016_AUTO_MDIX_CROSSOVER	0x0030
 
 MODULE_DESCRIPTION("Marvell PHY driver");
 MODULE_AUTHOR("Andy Fleming");
@@ -434,6 +437,25 @@ static int m88e1116r_config_init(struct phy_device *phydev)
 	return 0;
 }
 
+static int m88e3016_config_init(struct phy_device *phydev)
+{
+	int reg;
+
+	/* Enable Scrambler and Auto-Crossover */
+	reg = phy_read(phydev, MII_88E3016_PHY_SPEC_CTRL);
+	if (reg < 0)
+		return reg;
+
+	reg &= ~MII_88E3016_DISABLE_SCRAMBLER;
+	reg |= MII_88E3016_AUTO_MDIX_CROSSOVER;
+
+	reg = phy_write(phydev, MII_88E3016_PHY_SPEC_CTRL, reg);
+	if (reg < 0)
+		return reg;
+
+	return 0;
+}
+
 static int m88e1111_config_init(struct phy_device *phydev)
 {
 	int err;
@@ -770,6 +792,12 @@ static int marvell_read_status(struct phy_device *phydev)
 	return 0;
 }
 
+static int marvell_aneg_done(struct phy_device *phydev)
+{
+	int retval = phy_read(phydev, MII_M1011_PHY_STATUS);
+	return (retval < 0) ? retval : (retval & MII_M1011_PHY_STATUS_RESOLVED);
+}
+
 static int m88e1121_did_interrupt(struct phy_device *phydev)
 {
 	int imask;
@@ -1050,6 +1078,23 @@ static struct phy_driver marvell_drivers[] = {
 		.suspend = &genphy_suspend,
 		.driver = { .owner = THIS_MODULE },
 	},
+	{
+		.phy_id = MARVELL_PHY_ID_88E3016,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88E3016",
+		.features = PHY_BASIC_FEATURES,
+		.flags = PHY_HAS_INTERRUPT,
+		.config_aneg = &genphy_config_aneg,
+		.config_init = &m88e3016_config_init,
+		.aneg_done = &marvell_aneg_done,
+		.read_status = &marvell_read_status,
+		.ack_interrupt = &marvell_ack_interrupt,
+		.config_intr = &marvell_config_intr,
+		.did_interrupt = &m88e1121_did_interrupt,
+		.resume = &genphy_resume,
+		.suspend = &genphy_suspend,
+		.driver = { .owner = THIS_MODULE },
+	},
 };
 
 static int __init marvell_init(void)
@@ -1079,6 +1124,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = {
 	{ MARVELL_PHY_ID_88E1318S, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1116R, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1510, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK },
 	{ }
 };
 
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 8e9a029e093d..e6982ac3200d 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -16,6 +16,7 @@
 #define MARVELL_PHY_ID_88E1318S		0x01410e90
 #define MARVELL_PHY_ID_88E1116R		0x01410e40
 #define MARVELL_PHY_ID_88E1510		0x01410dd0
+#define MARVELL_PHY_ID_88E3016		0x01410e60
 
 /* struct phy_device dev_flags definitions */
 #define MARVELL_PHY_M1145_FLAGS_RESISTANCE	0x00000001
-- 
cgit v1.2.3-58-ga151


From e7de17abeda24d8acc316b2e07bd969d03099eea Mon Sep 17 00:00:00 2001
From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Wed, 22 Oct 2014 20:26:45 +0200
Subject: net: pxa168_eth: Provide phy_interface mode on platform_data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PXA168 Ethernet IP support MII and RMII connection to its PHY.
Currently, pxa168 platform_data does not provide a way to pass that
and there is one user of pxa168 platform_data (mach-mmp/gplug).
Given the pinctrl settings of gplug it uses RMII, so add and pass
a corresponding phy_interface_t.

Tested-by: Antoine Ténart <antoine.tenart@free-electrons.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm/mach-mmp/gplugd.c | 2 ++
 include/linux/pxa168_eth.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/mach-mmp/gplugd.c b/arch/arm/mach-mmp/gplugd.c
index d81b2475e67e..3b5794cd0357 100644
--- a/arch/arm/mach-mmp/gplugd.c
+++ b/arch/arm/mach-mmp/gplugd.c
@@ -12,6 +12,7 @@
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
 #include <linux/gpio-pxa.h>
+#include <linux/phy.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach-types.h>
@@ -158,6 +159,7 @@ struct pxa168_eth_platform_data gplugd_eth_platform_data = {
 	.port_number = 0,
 	.phy_addr    = 0,
 	.speed       = 0, /* Autonagotiation */
+	.intf        = PHY_INTERFACE_MODE_RMII,
 	.init        = gplugd_eth_init,
 };
 
diff --git a/include/linux/pxa168_eth.h b/include/linux/pxa168_eth.h
index 18d75e795606..37c381120bc8 100644
--- a/include/linux/pxa168_eth.h
+++ b/include/linux/pxa168_eth.h
@@ -13,6 +13,7 @@ struct pxa168_eth_platform_data {
 	 */
 	int	speed;		/* 0, SPEED_10, SPEED_100 */
 	int	duplex;		/* DUPLEX_HALF or DUPLEX_FULL */
+	phy_interface_t intf;
 
 	/*
 	 * Override default RX/TX queue sizes if nonzero.
-- 
cgit v1.2.3-58-ga151


From b3020f0a35fc431f7acf3fba9a5b7376d79932e5 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sat, 25 Oct 2014 05:25:07 +0200
Subject: ieee802154: mac802154: remove FSF address

This patch removes the FSF address in files which belongs to ieee802154
and mac802154.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Cc: Alan Ott <alan@signal11.us>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/at86rf230.c | 4 ----
 drivers/net/ieee802154/fakelb.c    | 4 ----
 drivers/net/ieee802154/mrf24j40.c  | 4 ----
 include/linux/nl802154.h           | 4 ----
 include/net/af_ieee802154.h        | 4 ----
 include/net/ieee802154.h           | 4 ----
 include/net/ieee802154_netdev.h    | 4 ----
 include/net/mac802154.h            | 3 ---
 include/net/nl802154.h             | 4 ----
 include/net/wpan-phy.h             | 4 ----
 net/ieee802154/af802154.h          | 4 ----
 net/ieee802154/af_ieee802154.c     | 4 ----
 net/ieee802154/dgram.c             | 4 ----
 net/ieee802154/ieee802154.h        | 4 ----
 net/ieee802154/netlink.c           | 4 ----
 net/ieee802154/nl-mac.c            | 4 ----
 net/ieee802154/nl-phy.c            | 4 ----
 net/ieee802154/nl_policy.c         | 4 ----
 net/ieee802154/raw.c               | 4 ----
 net/ieee802154/wpan-class.c        | 4 ----
 net/mac802154/ieee802154_dev.c     | 4 ----
 net/mac802154/mac802154.h          | 4 ----
 net/mac802154/mac_cmd.c            | 4 ----
 net/mac802154/mib.c                | 4 ----
 net/mac802154/monitor.c            | 4 ----
 net/mac802154/rx.c                 | 4 ----
 net/mac802154/tx.c                 | 4 ----
 net/mac802154/wpan.c               | 4 ----
 28 files changed, 111 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index 6ebd665892d8..b8b1e40feeb8 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
  * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
diff --git a/drivers/net/ieee802154/fakelb.c b/drivers/net/ieee802154/fakelb.c
index 27d83207d24c..e4b1b1f66bd2 100644
--- a/drivers/net/ieee802154/fakelb.c
+++ b/drivers/net/ieee802154/fakelb.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index b1e73bc6e398..ba9a3960d26d 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -13,10 +13,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/spi/spi.h>
diff --git a/include/linux/nl802154.h b/include/linux/nl802154.h
index 20163b9a0eae..167342c2ce6b 100644
--- a/include/linux/nl802154.h
+++ b/include/linux/nl802154.h
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  */
 
 #ifndef NL802154_H
diff --git a/include/net/af_ieee802154.h b/include/net/af_ieee802154.h
index 085940f7eeec..7d38e2ffd256 100644
--- a/include/net/af_ieee802154.h
+++ b/include/net/af_ieee802154.h
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/include/net/ieee802154.h b/include/net/ieee802154.h
index 0aa7122e8f15..4db4e320b2f5 100644
--- a/include/net/ieee802154.h
+++ b/include/net/ieee802154.h
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
  * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h
index 3b53c8e405e4..f87420689d70 100644
--- a/include/net/ieee802154_netdev.h
+++ b/include/net/ieee802154_netdev.h
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
  * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 2e67cdd19cdc..f95b98ec50a8 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -12,9 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 #ifndef NET_MAC802154_H
 #define NET_MAC802154_H
diff --git a/include/net/nl802154.h b/include/net/nl802154.h
index b23548e04098..b5cdea29d9d9 100644
--- a/include/net/nl802154.h
+++ b/include/net/nl802154.h
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  */
 
 #ifndef IEEE802154_NL_H
diff --git a/include/net/wpan-phy.h b/include/net/wpan-phy.h
index 10ab0fc6d4f7..65a05f19eb68 100644
--- a/include/net/wpan-phy.h
+++ b/include/net/wpan-phy.h
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
  */
diff --git a/net/ieee802154/af802154.h b/net/ieee802154/af802154.h
index 8330a09bfc95..343b63e6f953 100644
--- a/net/ieee802154/af802154.h
+++ b/net/ieee802154/af802154.h
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/net/ieee802154/af_ieee802154.c b/net/ieee802154/af_ieee802154.c
index 29e0de63001b..26da1e179737 100644
--- a/net/ieee802154/af_ieee802154.c
+++ b/net/ieee802154/af_ieee802154.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index ef2ad8aaef13..71e99a0994e1 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h
index 5d352f86979e..42ae63a345ab 100644
--- a/net/ieee802154/ieee802154.h
+++ b/net/ieee802154/ieee802154.h
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  */
 #ifndef IEEE_802154_LOCAL_H
 #define IEEE_802154_LOCAL_H
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 9222966f5e6d..6c3c2595a201 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index c6bfe22bfa5e..78a1529e8bb2 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 972baf83411a..e943e20bcf09 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
index 3a703ab88348..35c432668454 100644
--- a/net/ieee802154/nl_policy.c
+++ b/net/ieee802154/nl_policy.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  */
 
 #include <linux/kernel.h>
diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c
index 9d1f64806f02..3ffcf4a9de01 100644
--- a/net/ieee802154/raw.c
+++ b/net/ieee802154/raw.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/net/ieee802154/wpan-class.c b/net/ieee802154/wpan-class.c
index 4955e0fe5883..9a2dfab5648d 100644
--- a/net/ieee802154/wpan-class.c
+++ b/net/ieee802154/wpan-class.c
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  */
 
 #include <linux/slab.h>
diff --git a/net/mac802154/ieee802154_dev.c b/net/mac802154/ieee802154_dev.c
index b36b2b996578..6af6a2427aaf 100644
--- a/net/mac802154/ieee802154_dev.c
+++ b/net/mac802154/ieee802154_dev.c
@@ -14,10 +14,6 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
 #include <linux/kernel.h>
diff --git a/net/mac802154/mac802154.h b/net/mac802154/mac802154.h
index 762a6f849c6b..e3503c1bc04e 100644
--- a/net/mac802154/mac802154.h
+++ b/net/mac802154/mac802154.h
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
  * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c
index bf809131eef7..85f70edbe8dd 100644
--- a/net/mac802154/mac_cmd.c
+++ b/net/mac802154/mac_cmd.c
@@ -12,10 +12,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Sergey Lapin <slapin@ossfans.org>
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
index 868a040fd422..d7e5df872be7 100644
--- a/net/mac802154/mib.c
+++ b/net/mac802154/mib.c
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
  * Sergey Lapin <slapin@ossfans.org>
diff --git a/net/mac802154/monitor.c b/net/mac802154/monitor.c
index a68230e2b25f..81249bb7c935 100644
--- a/net/mac802154/monitor.c
+++ b/net/mac802154/monitor.c
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
  * Sergey Lapin <slapin@ossfans.org>
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index a14cf9ede171..e99d9394d13a 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
  * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index fdf4c0e67259..95ea412395c7 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
  * Sergey Lapin <slapin@ossfans.org>
diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c
index 4ab86a57dca5..b11a98d824e0 100644
--- a/net/mac802154/wpan.c
+++ b/net/mac802154/wpan.c
@@ -10,10 +10,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
  * Written by:
  * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
  * Sergey Lapin <slapin@ossfans.org>
-- 
cgit v1.2.3-58-ga151


From 4ca24aca55fe1e2a61f3ffaac9015d9c45204729 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sat, 25 Oct 2014 09:41:04 +0200
Subject: ieee802154: move ieee802154 header

This patch moves the ieee802154 header into include/linux instead
include/net. Similar like wireless which have the ieee80211 header
inside of include/linux.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Cc: Alan Ott <alan@signal11.us>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/at86rf230.c |   2 +-
 drivers/net/ieee802154/cc2520.c    |   2 +-
 drivers/net/ieee802154/fakehard.c  |   2 +-
 drivers/net/ieee802154/mrf24j40.c  |   2 +-
 include/linux/ieee802154.h         | 189 ++++++++++++++++++++++++++++++++++++
 include/net/ieee802154.h           | 191 -------------------------------------
 include/net/ieee802154_netdev.h    |   2 +-
 net/ieee802154/6lowpan_rtnl.c      |   2 +-
 net/ieee802154/dgram.c             |   2 +-
 net/ieee802154/header_ops.c        |   3 +-
 net/ieee802154/nl-mac.c            |   2 +-
 net/mac802154/iface.c              |   2 +-
 net/mac802154/llsec.c              |   2 +-
 net/mac802154/mac_cmd.c            |   2 +-
 net/mac802154/monitor.c            |   2 +-
 15 files changed, 203 insertions(+), 204 deletions(-)
 create mode 100644 include/linux/ieee802154.h
 delete mode 100644 include/net/ieee802154.h

(limited to 'include/linux')

diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index 795ac116602c..a433d20587e1 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -29,8 +29,8 @@
 #include <linux/regmap.h>
 #include <linux/skbuff.h>
 #include <linux/of_gpio.h>
+#include <linux/ieee802154.h>
 
-#include <net/ieee802154.h>
 #include <net/mac802154.h>
 #include <net/cfg802154.h>
 
diff --git a/drivers/net/ieee802154/cc2520.c b/drivers/net/ieee802154/cc2520.c
index f1770cf892ed..32b3c8862b4f 100644
--- a/drivers/net/ieee802154/cc2520.c
+++ b/drivers/net/ieee802154/cc2520.c
@@ -21,10 +21,10 @@
 #include <linux/skbuff.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/of_gpio.h>
+#include <linux/ieee802154.h>
 
 #include <net/mac802154.h>
 #include <net/cfg802154.h>
-#include <net/ieee802154.h>
 
 #define	SPI_COMMAND_BUFFER	3
 #define	HIGH			1
diff --git a/drivers/net/ieee802154/fakehard.c b/drivers/net/ieee802154/fakehard.c
index 1460bf520498..8be05ade0ceb 100644
--- a/drivers/net/ieee802154/fakehard.c
+++ b/drivers/net/ieee802154/fakehard.c
@@ -25,10 +25,10 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/if_arp.h>
+#include <linux/ieee802154.h>
 
 #include <net/af_ieee802154.h>
 #include <net/ieee802154_netdev.h>
-#include <net/ieee802154.h>
 #include <net/nl802154.h>
 #include <net/cfg802154.h>
 
diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index bea7349db6ad..56a69599ac31 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -18,9 +18,9 @@
 #include <linux/spi/spi.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/ieee802154.h>
 #include <net/cfg802154.h>
 #include <net/mac802154.h>
-#include <net/ieee802154.h>
 
 /* MRF24J40 Short Address Registers */
 #define REG_RXMCR    0x00  /* Receive MAC control */
diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
new file mode 100644
index 000000000000..2dfab2db103a
--- /dev/null
+++ b/include/linux/ieee802154.h
@@ -0,0 +1,189 @@
+/*
+ * IEEE802.15.4-2003 specification
+ *
+ * Copyright (C) 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+#ifndef LINUX_IEEE802154_H
+#define LINUX_IEEE802154_H
+
+#define IEEE802154_MTU			127
+
+#define IEEE802154_FC_TYPE_BEACON	0x0	/* Frame is beacon */
+#define	IEEE802154_FC_TYPE_DATA		0x1	/* Frame is data */
+#define IEEE802154_FC_TYPE_ACK		0x2	/* Frame is acknowledgment */
+#define IEEE802154_FC_TYPE_MAC_CMD	0x3	/* Frame is MAC command */
+
+#define IEEE802154_FC_TYPE_SHIFT		0
+#define IEEE802154_FC_TYPE_MASK		((1 << 3) - 1)
+#define IEEE802154_FC_TYPE(x)		((x & IEEE802154_FC_TYPE_MASK) >> IEEE802154_FC_TYPE_SHIFT)
+#define IEEE802154_FC_SET_TYPE(v, x)	do {	\
+	v = (((v) & ~IEEE802154_FC_TYPE_MASK) | \
+	    (((x) << IEEE802154_FC_TYPE_SHIFT) & IEEE802154_FC_TYPE_MASK)); \
+	} while (0)
+
+#define IEEE802154_FC_SECEN_SHIFT	3
+#define IEEE802154_FC_SECEN		(1 << IEEE802154_FC_SECEN_SHIFT)
+#define IEEE802154_FC_FRPEND_SHIFT	4
+#define IEEE802154_FC_FRPEND		(1 << IEEE802154_FC_FRPEND_SHIFT)
+#define IEEE802154_FC_ACK_REQ_SHIFT	5
+#define IEEE802154_FC_ACK_REQ		(1 << IEEE802154_FC_ACK_REQ_SHIFT)
+#define IEEE802154_FC_INTRA_PAN_SHIFT	6
+#define IEEE802154_FC_INTRA_PAN		(1 << IEEE802154_FC_INTRA_PAN_SHIFT)
+
+#define IEEE802154_FC_SAMODE_SHIFT	14
+#define IEEE802154_FC_SAMODE_MASK	(3 << IEEE802154_FC_SAMODE_SHIFT)
+#define IEEE802154_FC_DAMODE_SHIFT	10
+#define IEEE802154_FC_DAMODE_MASK	(3 << IEEE802154_FC_DAMODE_SHIFT)
+
+#define IEEE802154_FC_VERSION_SHIFT	12
+#define IEEE802154_FC_VERSION_MASK	(3 << IEEE802154_FC_VERSION_SHIFT)
+#define IEEE802154_FC_VERSION(x)	((x & IEEE802154_FC_VERSION_MASK) >> IEEE802154_FC_VERSION_SHIFT)
+
+#define IEEE802154_FC_SAMODE(x)		\
+	(((x) & IEEE802154_FC_SAMODE_MASK) >> IEEE802154_FC_SAMODE_SHIFT)
+
+#define IEEE802154_FC_DAMODE(x)		\
+	(((x) & IEEE802154_FC_DAMODE_MASK) >> IEEE802154_FC_DAMODE_SHIFT)
+
+#define IEEE802154_SCF_SECLEVEL_MASK		7
+#define IEEE802154_SCF_SECLEVEL_SHIFT		0
+#define IEEE802154_SCF_SECLEVEL(x)		(x & IEEE802154_SCF_SECLEVEL_MASK)
+#define IEEE802154_SCF_KEY_ID_MODE_SHIFT	3
+#define IEEE802154_SCF_KEY_ID_MODE_MASK		(3 << IEEE802154_SCF_KEY_ID_MODE_SHIFT)
+#define IEEE802154_SCF_KEY_ID_MODE(x)		\
+	((x & IEEE802154_SCF_KEY_ID_MODE_MASK) >> IEEE802154_SCF_KEY_ID_MODE_SHIFT)
+
+#define IEEE802154_SCF_KEY_IMPLICIT		0
+#define IEEE802154_SCF_KEY_INDEX		1
+#define IEEE802154_SCF_KEY_SHORT_INDEX		2
+#define IEEE802154_SCF_KEY_HW_INDEX		3
+
+#define IEEE802154_SCF_SECLEVEL_NONE		0
+#define IEEE802154_SCF_SECLEVEL_MIC32		1
+#define IEEE802154_SCF_SECLEVEL_MIC64		2
+#define IEEE802154_SCF_SECLEVEL_MIC128		3
+#define IEEE802154_SCF_SECLEVEL_ENC		4
+#define IEEE802154_SCF_SECLEVEL_ENC_MIC32	5
+#define IEEE802154_SCF_SECLEVEL_ENC_MIC64	6
+#define IEEE802154_SCF_SECLEVEL_ENC_MIC128	7
+
+/* MAC footer size */
+#define IEEE802154_MFR_SIZE	2 /* 2 octets */
+
+/* MAC's Command Frames Identifiers */
+#define IEEE802154_CMD_ASSOCIATION_REQ		0x01
+#define IEEE802154_CMD_ASSOCIATION_RESP		0x02
+#define IEEE802154_CMD_DISASSOCIATION_NOTIFY	0x03
+#define IEEE802154_CMD_DATA_REQ			0x04
+#define IEEE802154_CMD_PANID_CONFLICT_NOTIFY	0x05
+#define IEEE802154_CMD_ORPHAN_NOTIFY		0x06
+#define IEEE802154_CMD_BEACON_REQ		0x07
+#define IEEE802154_CMD_COORD_REALIGN_NOTIFY	0x08
+#define IEEE802154_CMD_GTS_REQ			0x09
+
+/*
+ * The return values of MAC operations
+ */
+enum {
+	/*
+	 * The requested operation was completed successfully.
+	 * For a transmission request, this value indicates
+	 * a successful transmission.
+	 */
+	IEEE802154_SUCCESS = 0x0,
+
+	/* The beacon was lost following a synchronization request. */
+	IEEE802154_BEACON_LOSS = 0xe0,
+	/*
+	 * A transmission could not take place due to activity on the
+	 * channel, i.e., the CSMA-CA mechanism has failed.
+	 */
+	IEEE802154_CHNL_ACCESS_FAIL = 0xe1,
+	/* The GTS request has been denied by the PAN coordinator. */
+	IEEE802154_DENINED = 0xe2,
+	/* The attempt to disable the transceiver has failed. */
+	IEEE802154_DISABLE_TRX_FAIL = 0xe3,
+	/*
+	 * The received frame induces a failed security check according to
+	 * the security suite.
+	 */
+	IEEE802154_FAILED_SECURITY_CHECK = 0xe4,
+	/*
+	 * The frame resulting from secure processing has a length that is
+	 * greater than aMACMaxFrameSize.
+	 */
+	IEEE802154_FRAME_TOO_LONG = 0xe5,
+	/*
+	 * The requested GTS transmission failed because the specified GTS
+	 * either did not have a transmit GTS direction or was not defined.
+	 */
+	IEEE802154_INVALID_GTS = 0xe6,
+	/*
+	 * A request to purge an MSDU from the transaction queue was made using
+	 * an MSDU handle that was not found in the transaction table.
+	 */
+	IEEE802154_INVALID_HANDLE = 0xe7,
+	/* A parameter in the primitive is out of the valid range.*/
+	IEEE802154_INVALID_PARAMETER = 0xe8,
+	/* No acknowledgment was received after aMaxFrameRetries. */
+	IEEE802154_NO_ACK = 0xe9,
+	/* A scan operation failed to find any network beacons.*/
+	IEEE802154_NO_BEACON = 0xea,
+	/* No response data were available following a request. */
+	IEEE802154_NO_DATA = 0xeb,
+	/* The operation failed because a short address was not allocated. */
+	IEEE802154_NO_SHORT_ADDRESS = 0xec,
+	/*
+	 * A receiver enable request was unsuccessful because it could not be
+	 * completed within the CAP.
+	 */
+	IEEE802154_OUT_OF_CAP = 0xed,
+	/*
+	 * A PAN identifier conflict has been detected and communicated to the
+	 * PAN coordinator.
+	 */
+	IEEE802154_PANID_CONFLICT = 0xee,
+	/* A coordinator realignment command has been received. */
+	IEEE802154_REALIGMENT = 0xef,
+	/* The transaction has expired and its information discarded. */
+	IEEE802154_TRANSACTION_EXPIRED = 0xf0,
+	/* There is no capacity to store the transaction. */
+	IEEE802154_TRANSACTION_OVERFLOW = 0xf1,
+	/*
+	 * The transceiver was in the transmitter enabled state when the
+	 * receiver was requested to be enabled.
+	 */
+	IEEE802154_TX_ACTIVE = 0xf2,
+	/* The appropriate key is not available in the ACL. */
+	IEEE802154_UNAVAILABLE_KEY = 0xf3,
+	/*
+	 * A SET/GET request was issued with the identifier of a PIB attribute
+	 * that is not supported.
+	 */
+	IEEE802154_UNSUPPORTED_ATTR = 0xf4,
+	/*
+	 * A request to perform a scan operation failed because the MLME was
+	 * in the process of performing a previously initiated scan operation.
+	 */
+	IEEE802154_SCAN_IN_PROGRESS = 0xfc,
+};
+
+
+#endif /* LINUX_IEEE802154_H */
diff --git a/include/net/ieee802154.h b/include/net/ieee802154.h
deleted file mode 100644
index 4db4e320b2f5..000000000000
--- a/include/net/ieee802154.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * IEEE802.15.4-2003 specification
- *
- * Copyright (C) 2007, 2008 Siemens AG
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * Written by:
- * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
- * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
- * Maxim Osipov <maxim.osipov@siemens.com>
- * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
- * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
- */
-
-#ifndef NET_IEEE802154_H
-#define NET_IEEE802154_H
-
-#define IEEE802154_MTU			127
-
-#define IEEE802154_FC_TYPE_BEACON	0x0	/* Frame is beacon */
-#define	IEEE802154_FC_TYPE_DATA		0x1	/* Frame is data */
-#define IEEE802154_FC_TYPE_ACK		0x2	/* Frame is acknowledgment */
-#define IEEE802154_FC_TYPE_MAC_CMD	0x3	/* Frame is MAC command */
-
-#define IEEE802154_FC_TYPE_SHIFT		0
-#define IEEE802154_FC_TYPE_MASK		((1 << 3) - 1)
-#define IEEE802154_FC_TYPE(x)		((x & IEEE802154_FC_TYPE_MASK) >> IEEE802154_FC_TYPE_SHIFT)
-#define IEEE802154_FC_SET_TYPE(v, x)	do {	\
-	v = (((v) & ~IEEE802154_FC_TYPE_MASK) | \
-	    (((x) << IEEE802154_FC_TYPE_SHIFT) & IEEE802154_FC_TYPE_MASK)); \
-	} while (0)
-
-#define IEEE802154_FC_SECEN_SHIFT	3
-#define IEEE802154_FC_SECEN		(1 << IEEE802154_FC_SECEN_SHIFT)
-#define IEEE802154_FC_FRPEND_SHIFT	4
-#define IEEE802154_FC_FRPEND		(1 << IEEE802154_FC_FRPEND_SHIFT)
-#define IEEE802154_FC_ACK_REQ_SHIFT	5
-#define IEEE802154_FC_ACK_REQ		(1 << IEEE802154_FC_ACK_REQ_SHIFT)
-#define IEEE802154_FC_INTRA_PAN_SHIFT	6
-#define IEEE802154_FC_INTRA_PAN		(1 << IEEE802154_FC_INTRA_PAN_SHIFT)
-
-#define IEEE802154_FC_SAMODE_SHIFT	14
-#define IEEE802154_FC_SAMODE_MASK	(3 << IEEE802154_FC_SAMODE_SHIFT)
-#define IEEE802154_FC_DAMODE_SHIFT	10
-#define IEEE802154_FC_DAMODE_MASK	(3 << IEEE802154_FC_DAMODE_SHIFT)
-
-#define IEEE802154_FC_VERSION_SHIFT	12
-#define IEEE802154_FC_VERSION_MASK	(3 << IEEE802154_FC_VERSION_SHIFT)
-#define IEEE802154_FC_VERSION(x)	((x & IEEE802154_FC_VERSION_MASK) >> IEEE802154_FC_VERSION_SHIFT)
-
-#define IEEE802154_FC_SAMODE(x)		\
-	(((x) & IEEE802154_FC_SAMODE_MASK) >> IEEE802154_FC_SAMODE_SHIFT)
-
-#define IEEE802154_FC_DAMODE(x)		\
-	(((x) & IEEE802154_FC_DAMODE_MASK) >> IEEE802154_FC_DAMODE_SHIFT)
-
-#define IEEE802154_SCF_SECLEVEL_MASK		7
-#define IEEE802154_SCF_SECLEVEL_SHIFT		0
-#define IEEE802154_SCF_SECLEVEL(x)		(x & IEEE802154_SCF_SECLEVEL_MASK)
-#define IEEE802154_SCF_KEY_ID_MODE_SHIFT	3
-#define IEEE802154_SCF_KEY_ID_MODE_MASK		(3 << IEEE802154_SCF_KEY_ID_MODE_SHIFT)
-#define IEEE802154_SCF_KEY_ID_MODE(x)		\
-	((x & IEEE802154_SCF_KEY_ID_MODE_MASK) >> IEEE802154_SCF_KEY_ID_MODE_SHIFT)
-
-#define IEEE802154_SCF_KEY_IMPLICIT		0
-#define IEEE802154_SCF_KEY_INDEX		1
-#define IEEE802154_SCF_KEY_SHORT_INDEX		2
-#define IEEE802154_SCF_KEY_HW_INDEX		3
-
-#define IEEE802154_SCF_SECLEVEL_NONE		0
-#define IEEE802154_SCF_SECLEVEL_MIC32		1
-#define IEEE802154_SCF_SECLEVEL_MIC64		2
-#define IEEE802154_SCF_SECLEVEL_MIC128		3
-#define IEEE802154_SCF_SECLEVEL_ENC		4
-#define IEEE802154_SCF_SECLEVEL_ENC_MIC32	5
-#define IEEE802154_SCF_SECLEVEL_ENC_MIC64	6
-#define IEEE802154_SCF_SECLEVEL_ENC_MIC128	7
-
-/* MAC footer size */
-#define IEEE802154_MFR_SIZE	2 /* 2 octets */
-
-/* MAC's Command Frames Identifiers */
-#define IEEE802154_CMD_ASSOCIATION_REQ		0x01
-#define IEEE802154_CMD_ASSOCIATION_RESP		0x02
-#define IEEE802154_CMD_DISASSOCIATION_NOTIFY	0x03
-#define IEEE802154_CMD_DATA_REQ			0x04
-#define IEEE802154_CMD_PANID_CONFLICT_NOTIFY	0x05
-#define IEEE802154_CMD_ORPHAN_NOTIFY		0x06
-#define IEEE802154_CMD_BEACON_REQ		0x07
-#define IEEE802154_CMD_COORD_REALIGN_NOTIFY	0x08
-#define IEEE802154_CMD_GTS_REQ			0x09
-
-/*
- * The return values of MAC operations
- */
-enum {
-	/*
-	 * The requested operation was completed successfully.
-	 * For a transmission request, this value indicates
-	 * a successful transmission.
-	 */
-	IEEE802154_SUCCESS = 0x0,
-
-	/* The beacon was lost following a synchronization request. */
-	IEEE802154_BEACON_LOSS = 0xe0,
-	/*
-	 * A transmission could not take place due to activity on the
-	 * channel, i.e., the CSMA-CA mechanism has failed.
-	 */
-	IEEE802154_CHNL_ACCESS_FAIL = 0xe1,
-	/* The GTS request has been denied by the PAN coordinator. */
-	IEEE802154_DENINED = 0xe2,
-	/* The attempt to disable the transceiver has failed. */
-	IEEE802154_DISABLE_TRX_FAIL = 0xe3,
-	/*
-	 * The received frame induces a failed security check according to
-	 * the security suite.
-	 */
-	IEEE802154_FAILED_SECURITY_CHECK = 0xe4,
-	/*
-	 * The frame resulting from secure processing has a length that is
-	 * greater than aMACMaxFrameSize.
-	 */
-	IEEE802154_FRAME_TOO_LONG = 0xe5,
-	/*
-	 * The requested GTS transmission failed because the specified GTS
-	 * either did not have a transmit GTS direction or was not defined.
-	 */
-	IEEE802154_INVALID_GTS = 0xe6,
-	/*
-	 * A request to purge an MSDU from the transaction queue was made using
-	 * an MSDU handle that was not found in the transaction table.
-	 */
-	IEEE802154_INVALID_HANDLE = 0xe7,
-	/* A parameter in the primitive is out of the valid range.*/
-	IEEE802154_INVALID_PARAMETER = 0xe8,
-	/* No acknowledgment was received after aMaxFrameRetries. */
-	IEEE802154_NO_ACK = 0xe9,
-	/* A scan operation failed to find any network beacons.*/
-	IEEE802154_NO_BEACON = 0xea,
-	/* No response data were available following a request. */
-	IEEE802154_NO_DATA = 0xeb,
-	/* The operation failed because a short address was not allocated. */
-	IEEE802154_NO_SHORT_ADDRESS = 0xec,
-	/*
-	 * A receiver enable request was unsuccessful because it could not be
-	 * completed within the CAP.
-	 */
-	IEEE802154_OUT_OF_CAP = 0xed,
-	/*
-	 * A PAN identifier conflict has been detected and communicated to the
-	 * PAN coordinator.
-	 */
-	IEEE802154_PANID_CONFLICT = 0xee,
-	/* A coordinator realignment command has been received. */
-	IEEE802154_REALIGMENT = 0xef,
-	/* The transaction has expired and its information discarded. */
-	IEEE802154_TRANSACTION_EXPIRED = 0xf0,
-	/* There is no capacity to store the transaction. */
-	IEEE802154_TRANSACTION_OVERFLOW = 0xf1,
-	/*
-	 * The transceiver was in the transmitter enabled state when the
-	 * receiver was requested to be enabled.
-	 */
-	IEEE802154_TX_ACTIVE = 0xf2,
-	/* The appropriate key is not available in the ACL. */
-	IEEE802154_UNAVAILABLE_KEY = 0xf3,
-	/*
-	 * A SET/GET request was issued with the identifier of a PIB attribute
-	 * that is not supported.
-	 */
-	IEEE802154_UNSUPPORTED_ATTR = 0xf4,
-	/*
-	 * A request to perform a scan operation failed because the MLME was
-	 * in the process of performing a previously initiated scan operation.
-	 */
-	IEEE802154_SCAN_IN_PROGRESS = 0xfc,
-};
-
-
-#endif
-
-
diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h
index f87420689d70..5e62d758eea5 100644
--- a/include/net/ieee802154_netdev.h
+++ b/include/net/ieee802154_netdev.h
@@ -23,10 +23,10 @@
 #ifndef IEEE802154_NETDEVICE_H
 #define IEEE802154_NETDEVICE_H
 
-#include <net/ieee802154.h>
 #include <net/af_ieee802154.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/ieee802154.h>
 
 struct ieee802154_sechdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c
index 0c1a49b51e57..1779a08d110a 100644
--- a/net/ieee802154/6lowpan_rtnl.c
+++ b/net/ieee802154/6lowpan_rtnl.c
@@ -49,8 +49,8 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/netdevice.h>
+#include <linux/ieee802154.h>
 #include <net/af_ieee802154.h>
-#include <net/ieee802154.h>
 #include <net/ieee802154_netdev.h>
 #include <net/6lowpan.h>
 #include <net/ipv6.h>
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index 71e99a0994e1..3d58befef467 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -23,9 +23,9 @@
 #include <linux/if_arp.h>
 #include <linux/list.h>
 #include <linux/slab.h>
+#include <linux/ieee802154.h>
 #include <net/sock.h>
 #include <net/af_ieee802154.h>
-#include <net/ieee802154.h>
 #include <net/ieee802154_netdev.h>
 
 #include <asm/ioctls.h>
diff --git a/net/ieee802154/header_ops.c b/net/ieee802154/header_ops.c
index c09294e39ca6..a051b6993177 100644
--- a/net/ieee802154/header_ops.c
+++ b/net/ieee802154/header_ops.c
@@ -14,8 +14,9 @@
  * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
  */
 
+#include <linux/ieee802154.h>
+
 #include <net/mac802154.h>
-#include <net/ieee802154.h>
 #include <net/ieee802154_netdev.h>
 
 static int
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index a9c8e3e98387..fb6866d1dd39 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -22,6 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/if_arp.h>
 #include <linux/netdevice.h>
+#include <linux/ieee802154.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
 #include <net/sock.h>
@@ -29,7 +30,6 @@
 #include <linux/export.h>
 #include <net/af_ieee802154.h>
 #include <net/nl802154.h>
-#include <net/ieee802154.h>
 #include <net/ieee802154_netdev.h>
 #include <net/cfg802154.h>
 
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 5a604074555b..03eedc3b23ef 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -20,13 +20,13 @@
 #include <linux/netdevice.h>
 #include <linux/module.h>
 #include <linux/if_arp.h>
+#include <linux/ieee802154.h>
 
 #include <net/rtnetlink.h>
 #include <linux/nl802154.h>
 #include <net/af_ieee802154.h>
 #include <net/mac802154.h>
 #include <net/ieee802154_netdev.h>
-#include <net/ieee802154.h>
 #include <net/cfg802154.h>
 
 #include "ieee802154_i.h"
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index 26f876128ae0..fa0d5237c2e0 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -17,7 +17,7 @@
 #include <linux/err.h>
 #include <linux/bug.h>
 #include <linux/completion.h>
-#include <net/ieee802154.h>
+#include <linux/ieee802154.h>
 #include <crypto/algapi.h>
 
 #include "ieee802154_i.h"
diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c
index f118ea06d344..ad09d54bc690 100644
--- a/net/mac802154/mac_cmd.c
+++ b/net/mac802154/mac_cmd.c
@@ -20,8 +20,8 @@
 
 #include <linux/skbuff.h>
 #include <linux/if_arp.h>
+#include <linux/ieee802154.h>
 
-#include <net/ieee802154.h>
 #include <net/ieee802154_netdev.h>
 #include <net/cfg802154.h>
 #include <net/mac802154.h>
diff --git a/net/mac802154/monitor.c b/net/mac802154/monitor.c
index ca1dedd9b229..ca82c72a635c 100644
--- a/net/mac802154/monitor.c
+++ b/net/mac802154/monitor.c
@@ -21,8 +21,8 @@
 #include <linux/skbuff.h>
 #include <linux/if_arp.h>
 #include <linux/crc-ccitt.h>
+#include <linux/ieee802154.h>
 
-#include <net/ieee802154.h>
 #include <net/mac802154.h>
 #include <net/netlink.h>
 #include <net/cfg802154.h>
-- 
cgit v1.2.3-58-ga151


From fa491001e4edae5ed68a562b61ed729968a3ca1c Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Mon, 27 Oct 2014 17:13:40 +0100
Subject: ieee802154: add valid psdu length helper

This patch adds a generic valid psdu length check function helper. This
is useful to check the length field after receiving. For example the
at86rf231 doesn't filter invalid psdu length. Sometimes the CRC can also
be correct. If we get the lqi value with an invalid frame length the
kernel may crash because we dereference an invalid pointer in the
receive buffer.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 2dfab2db103a..6e50a2a1d485 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -23,7 +23,10 @@
 #ifndef LINUX_IEEE802154_H
 #define LINUX_IEEE802154_H
 
+#include <linux/types.h>
+
 #define IEEE802154_MTU			127
+#define IEEE802154_MIN_PSDU_LEN		5
 
 #define IEEE802154_FC_TYPE_BEACON	0x0	/* Frame is beacon */
 #define	IEEE802154_FC_TYPE_DATA		0x1	/* Frame is data */
@@ -185,5 +188,13 @@ enum {
 	IEEE802154_SCAN_IN_PROGRESS = 0xfc,
 };
 
+/**
+ * ieee802154_is_valid_psdu_len - check if psdu len is valid
+ * @len: psdu len with (MHR + payload + MFR)
+ */
+static inline bool ieee802154_is_valid_psdu_len(const u8 len)
+{
+	return (len >= IEEE802154_MIN_PSDU_LEN && len <= IEEE802154_MTU);
+}
 
 #endif /* LINUX_IEEE802154_H */
-- 
cgit v1.2.3-58-ga151


From a59dadbeeaf7d33f2e92dbf5a290965d6df64162 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Sat, 25 Oct 2014 17:19:33 +0200
Subject: ath9k: add support for endian swap of eeprom from platform data

On some devices (especially little-endian ones), the flash EEPROM data
has a different endian, which needs to be detected.
Add a flag to the platform data to allow overriding that behavior

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/eeprom_def.c | 31 ++++++++++-------------------
 drivers/net/wireless/ath/ath9k/hw.h         |  1 +
 drivers/net/wireless/ath/ath9k/init.c       |  2 ++
 include/linux/ath9k_platform.h              |  1 +
 4 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/ath9k/eeprom_def.c b/drivers/net/wireless/ath/ath9k/eeprom_def.c
index 3218ca994746..122b846b8ec0 100644
--- a/drivers/net/wireless/ath/ath9k/eeprom_def.c
+++ b/drivers/net/wireless/ath/ath9k/eeprom_def.c
@@ -262,7 +262,7 @@ static int ath9k_hw_def_check_eeprom(struct ath_hw *ah)
 {
 	struct ar5416_eeprom_def *eep = &ah->eeprom.def;
 	struct ath_common *common = ath9k_hw_common(ah);
-	u16 *eepdata, temp, magic, magic2;
+	u16 *eepdata, temp, magic;
 	u32 sum = 0, el;
 	bool need_swap = false;
 	int i, addr, size;
@@ -272,27 +272,16 @@ static int ath9k_hw_def_check_eeprom(struct ath_hw *ah)
 		return false;
 	}
 
-	if (!ath9k_hw_use_flash(ah)) {
-		ath_dbg(common, EEPROM, "Read Magic = 0x%04X\n", magic);
-
-		if (magic != AR5416_EEPROM_MAGIC) {
-			magic2 = swab16(magic);
-
-			if (magic2 == AR5416_EEPROM_MAGIC) {
-				size = sizeof(struct ar5416_eeprom_def);
-				need_swap = true;
-				eepdata = (u16 *) (&ah->eeprom);
+	if (swab16(magic) == AR5416_EEPROM_MAGIC &&
+	    !(ah->ah_flags & AH_NO_EEP_SWAP)) {
+		size = sizeof(struct ar5416_eeprom_def);
+		need_swap = true;
+		eepdata = (u16 *) (&ah->eeprom);
 
-				for (addr = 0; addr < size / sizeof(u16); addr++) {
-					temp = swab16(*eepdata);
-					*eepdata = temp;
-					eepdata++;
-				}
-			} else {
-				ath_err(common,
-					"Invalid EEPROM Magic. Endianness mismatch.\n");
-				return -EINVAL;
-			}
+		for (addr = 0; addr < size / sizeof(u16); addr++) {
+			temp = swab16(*eepdata);
+			*eepdata = temp;
+			eepdata++;
 		}
 	}
 
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index f204099c38b8..c6dba9b4afdf 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -731,6 +731,7 @@ enum ath_cal_list {
 #define AH_USE_EEPROM   0x1
 #define AH_UNPLUGGED    0x2 /* The card has been physically removed. */
 #define AH_FASTCC       0x4
+#define AH_NO_EEP_SWAP  0x8 /* Do not swap EEPROM data */
 
 struct ath_hw {
 	struct ath_ops reg_ops;
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index 57a17601b0b6..5d9c711b6aa0 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -531,6 +531,8 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc,
 		ah->is_clk_25mhz = pdata->is_clk_25mhz;
 		ah->get_mac_revision = pdata->get_mac_revision;
 		ah->external_reset = pdata->external_reset;
+		if (!pdata->endian_check)
+			ah->ah_flags |= AH_NO_EEP_SWAP;
 	}
 
 	common->ops = &ah->reg_ops;
diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h
index a495a959e8a7..43501657bce9 100644
--- a/include/linux/ath9k_platform.h
+++ b/include/linux/ath9k_platform.h
@@ -31,6 +31,7 @@ struct ath9k_platform_data {
 	u32 gpio_mask;
 	u32 gpio_val;
 
+	bool endian_check;
 	bool is_clk_25mhz;
 	bool tx_gain_buffalo;
 
-- 
cgit v1.2.3-58-ga151


From 3468968ef766d7bb4ab29c0ef7ebd169a4ac2e96 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Sat, 25 Oct 2014 17:19:34 +0200
Subject: ath9k: allow disabling bands via platform data

Some devices have multiple bands enables in the EEPROM data, even though
they are only calibrated for one. Allow platform data to disable
unsupported bands.

Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/hw.c   | 24 ++++++++++++++++--------
 drivers/net/wireless/ath/ath9k/hw.h   |  2 ++
 drivers/net/wireless/ath/ath9k/init.c |  2 ++
 include/linux/ath9k_platform.h        |  2 ++
 4 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index 85a78176b809..47f410ed7cd4 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -2344,17 +2344,25 @@ int ath9k_hw_fill_cap_info(struct ath_hw *ah)
 	}
 
 	eeval = ah->eep_ops->get_eeprom(ah, EEP_OP_MODE);
-	if ((eeval & (AR5416_OPFLAGS_11G | AR5416_OPFLAGS_11A)) == 0) {
-		ath_err(common,
-			"no band has been marked as supported in EEPROM\n");
-		return -EINVAL;
+
+	if (eeval & AR5416_OPFLAGS_11A) {
+		if (ah->disable_5ghz)
+			ath_warn(common, "disabling 5GHz band\n");
+		else
+			pCap->hw_caps |= ATH9K_HW_CAP_5GHZ;
 	}
 
-	if (eeval & AR5416_OPFLAGS_11A)
-		pCap->hw_caps |= ATH9K_HW_CAP_5GHZ;
+	if (eeval & AR5416_OPFLAGS_11G) {
+		if (ah->disable_2ghz)
+			ath_warn(common, "disabling 2GHz band\n");
+		else
+			pCap->hw_caps |= ATH9K_HW_CAP_2GHZ;
+	}
 
-	if (eeval & AR5416_OPFLAGS_11G)
-		pCap->hw_caps |= ATH9K_HW_CAP_2GHZ;
+	if ((pCap->hw_caps & (ATH9K_HW_CAP_2GHZ | ATH9K_HW_CAP_5GHZ)) == 0) {
+		ath_err(common, "both bands are disabled\n");
+		return -EINVAL;
+	}
 
 	if (AR_SREV_9485(ah) ||
 	    AR_SREV_9285(ah) ||
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index c6dba9b4afdf..e49721e85f6a 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -930,6 +930,8 @@ struct ath_hw {
 	bool is_clk_25mhz;
 	int (*get_mac_revision)(void);
 	int (*external_reset)(void);
+	bool disable_2ghz;
+	bool disable_5ghz;
 
 	const struct firmware *eeprom_blob;
 
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index 5d9c711b6aa0..2294109f79e9 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -531,6 +531,8 @@ static int ath9k_init_softc(u16 devid, struct ath_softc *sc,
 		ah->is_clk_25mhz = pdata->is_clk_25mhz;
 		ah->get_mac_revision = pdata->get_mac_revision;
 		ah->external_reset = pdata->external_reset;
+		ah->disable_2ghz = pdata->disable_2ghz;
+		ah->disable_5ghz = pdata->disable_5ghz;
 		if (!pdata->endian_check)
 			ah->ah_flags |= AH_NO_EEP_SWAP;
 	}
diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h
index 43501657bce9..33eb274cd0e6 100644
--- a/include/linux/ath9k_platform.h
+++ b/include/linux/ath9k_platform.h
@@ -34,6 +34,8 @@ struct ath9k_platform_data {
 	bool endian_check;
 	bool is_clk_25mhz;
 	bool tx_gain_buffalo;
+	bool disable_2ghz;
+	bool disable_5ghz;
 
 	int (*get_mac_revision)(void);
 	int (*external_reset)(void);
-- 
cgit v1.2.3-58-ga151


From 95f259ca3bf485a0c1f17d9024813d4aab485a23 Mon Sep 17 00:00:00 2001
From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Sat, 25 Oct 2014 12:08:59 +0200
Subject: net: pxa168_eth: Fix providing of phy_interface mode on platform_data

Do not add phy include to the board file but platform_data include
instead.

Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm/mach-mmp/gplugd.c | 1 -
 include/linux/pxa168_eth.h | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-mmp/gplugd.c b/arch/arm/mach-mmp/gplugd.c
index 3b5794cd0357..22762a1f9f72 100644
--- a/arch/arm/mach-mmp/gplugd.c
+++ b/arch/arm/mach-mmp/gplugd.c
@@ -12,7 +12,6 @@
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
 #include <linux/gpio-pxa.h>
-#include <linux/phy.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach-types.h>
diff --git a/include/linux/pxa168_eth.h b/include/linux/pxa168_eth.h
index 37c381120bc8..e1ab6e86cdb3 100644
--- a/include/linux/pxa168_eth.h
+++ b/include/linux/pxa168_eth.h
@@ -4,6 +4,8 @@
 #ifndef __LINUX_PXA168_ETH_H
 #define __LINUX_PXA168_ETH_H
 
+#include <linux/phy.h>
+
 struct pxa168_eth_platform_data {
 	int	port_number;
 	int	phy_addr;
-- 
cgit v1.2.3-58-ga151


From 32a173c7f9e9ec2b87142f67e1478cd20084a45b Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 27 Oct 2014 11:37:35 +0200
Subject: net/mlx4_core: Introduce mlx4_get_module_info for cable module info
 reading

Added new MAD_IFC command to read cable module info with attribute id (0xFF60).
Update include/linux/mlx4/device.h with function declaration (mlx4_get_module_info)
and the needed defines/enums for future use.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/port.c | 156 ++++++++++++++++++++++++++++++
 include/linux/mlx4/device.h               |  30 ++++++
 2 files changed, 186 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
index 94eeb2c7d7e4..30eb1ead0fe6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -1311,3 +1311,159 @@ int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id,
 	return 0;
 }
 EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave);
+
+/* Cable Module Info */
+#define MODULE_INFO_MAX_READ 48
+
+#define I2C_ADDR_LOW  0x50
+#define I2C_ADDR_HIGH 0x51
+#define I2C_PAGE_SIZE 256
+
+/* Module Info Data */
+struct mlx4_cable_info {
+	u8	i2c_addr;
+	u8	page_num;
+	__be16	dev_mem_address;
+	__be16	reserved1;
+	__be16	size;
+	__be32	reserved2[2];
+	u8	data[MODULE_INFO_MAX_READ];
+};
+
+enum cable_info_err {
+	 CABLE_INF_INV_PORT      = 0x1,
+	 CABLE_INF_OP_NOSUP      = 0x2,
+	 CABLE_INF_NOT_CONN      = 0x3,
+	 CABLE_INF_NO_EEPRM      = 0x4,
+	 CABLE_INF_PAGE_ERR      = 0x5,
+	 CABLE_INF_INV_ADDR      = 0x6,
+	 CABLE_INF_I2C_ADDR      = 0x7,
+	 CABLE_INF_QSFP_VIO      = 0x8,
+	 CABLE_INF_I2C_BUSY      = 0x9,
+};
+
+#define MAD_STATUS_2_CABLE_ERR(mad_status) ((mad_status >> 8) & 0xFF)
+
+static inline const char *cable_info_mad_err_str(u16 mad_status)
+{
+	u8 err = MAD_STATUS_2_CABLE_ERR(mad_status);
+
+	switch (err) {
+	case CABLE_INF_INV_PORT:
+		return "invalid port selected";
+	case CABLE_INF_OP_NOSUP:
+		return "operation not supported for this port (the port is of type CX4 or internal)";
+	case CABLE_INF_NOT_CONN:
+		return "cable is not connected";
+	case CABLE_INF_NO_EEPRM:
+		return "the connected cable has no EPROM (passive copper cable)";
+	case CABLE_INF_PAGE_ERR:
+		return "page number is greater than 15";
+	case CABLE_INF_INV_ADDR:
+		return "invalid device_address or size (that is, size equals 0 or address+size is greater than 256)";
+	case CABLE_INF_I2C_ADDR:
+		return "invalid I2C slave address";
+	case CABLE_INF_QSFP_VIO:
+		return "at least one cable violates the QSFP specification and ignores the modsel signal";
+	case CABLE_INF_I2C_BUSY:
+		return "I2C bus is constantly busy";
+	}
+	return "Unknown Error";
+}
+
+/**
+ * mlx4_get_module_info - Read cable module eeprom data
+ * @dev: mlx4_dev.
+ * @port: port number.
+ * @offset: byte offset in eeprom to start reading data from.
+ * @size: num of bytes to read.
+ * @data: output buffer to put the requested data into.
+ *
+ * Reads cable module eeprom data, puts the outcome data into
+ * data pointer paramer.
+ * Returns num of read bytes on success or a negative error
+ * code.
+ */
+int mlx4_get_module_info(struct mlx4_dev *dev, u8 port,
+			 u16 offset, u16 size, u8 *data)
+{
+	struct mlx4_cmd_mailbox *inbox, *outbox;
+	struct mlx4_mad_ifc *inmad, *outmad;
+	struct mlx4_cable_info *cable_info;
+	u16 i2c_addr;
+	int ret;
+
+	if (size > MODULE_INFO_MAX_READ)
+		size = MODULE_INFO_MAX_READ;
+
+	inbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inbox))
+		return PTR_ERR(inbox);
+
+	outbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outbox)) {
+		mlx4_free_cmd_mailbox(dev, inbox);
+		return PTR_ERR(outbox);
+	}
+
+	inmad = (struct mlx4_mad_ifc *)(inbox->buf);
+	outmad = (struct mlx4_mad_ifc *)(outbox->buf);
+
+	inmad->method = 0x1; /* Get */
+	inmad->class_version = 0x1;
+	inmad->mgmt_class = 0x1;
+	inmad->base_version = 0x1;
+	inmad->attr_id = cpu_to_be16(0xFF60); /* Module Info */
+
+	if (offset < I2C_PAGE_SIZE && offset + size > I2C_PAGE_SIZE)
+		/* Cross pages reads are not allowed
+		 * read until offset 256 in low page
+		 */
+		size -= offset + size - I2C_PAGE_SIZE;
+
+	i2c_addr = I2C_ADDR_LOW;
+	if (offset >= I2C_PAGE_SIZE) {
+		/* Reset offset to high page */
+		i2c_addr = I2C_ADDR_HIGH;
+		offset -= I2C_PAGE_SIZE;
+	}
+
+	cable_info = (struct mlx4_cable_info *)inmad->data;
+	cable_info->dev_mem_address = cpu_to_be16(offset);
+	cable_info->page_num = 0;
+	cable_info->i2c_addr = i2c_addr;
+	cable_info->size = cpu_to_be16(size);
+
+	ret = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (ret)
+		goto out;
+
+	if (be16_to_cpu(outmad->status)) {
+		/* Mad returned with bad status */
+		ret = be16_to_cpu(outmad->status);
+		mlx4_warn(dev,
+			  "MLX4_CMD_MAD_IFC Get Module info attr(%x) port(%d) i2c_addr(%x) offset(%d) size(%d): Response Mad Status(%x) - %s\n",
+			  0xFF60, port, i2c_addr, offset, size,
+			  ret, cable_info_mad_err_str(ret));
+
+		if (i2c_addr == I2C_ADDR_HIGH &&
+		    MAD_STATUS_2_CABLE_ERR(ret) == CABLE_INF_I2C_ADDR)
+			/* Some SFP cables do not support i2c slave
+			 * address 0x51 (high page), abort silently.
+			 */
+			ret = 0;
+		else
+			ret = -ret;
+		goto out;
+	}
+	cable_info = (struct mlx4_cable_info *)outmad->data;
+	memcpy(data, cable_info->data, size);
+	ret = size;
+out:
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return ret;
+}
+EXPORT_SYMBOL(mlx4_get_module_info);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 37e4404d0227..73910daec317 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -379,6 +379,13 @@ enum {
 #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
 			     MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK)
 
+enum mlx4_module_id {
+	MLX4_MODULE_ID_SFP              = 0x3,
+	MLX4_MODULE_ID_QSFP             = 0xC,
+	MLX4_MODULE_ID_QSFP_PLUS        = 0xD,
+	MLX4_MODULE_ID_QSFP28           = 0x11,
+};
+
 static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
 {
 	return (major << 32) | (minor << 16) | subminor;
@@ -799,6 +806,26 @@ struct mlx4_init_port_param {
 	u64			si_guid;
 };
 
+#define MAD_IFC_DATA_SZ 192
+/* MAD IFC Mailbox */
+struct mlx4_mad_ifc {
+	u8	base_version;
+	u8	mgmt_class;
+	u8	class_version;
+	u8	method;
+	__be16	status;
+	__be16	class_specific;
+	__be64	tid;
+	__be16	attr_id;
+	__be16	resv;
+	__be32	attr_mod;
+	__be64	mkey;
+	__be16	dr_slid;
+	__be16	dr_dlid;
+	u8	reserved[28];
+	u8	data[MAD_IFC_DATA_SZ];
+} __packed;
+
 #define mlx4_foreach_port(port, dev, type)				\
 	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	\
 		if ((type) == (dev)->caps.port_mask[(port)])
@@ -1283,6 +1310,9 @@ int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
 			    u64 iova, u64 size, int npages,
 			    int page_shift, struct mlx4_mpt_entry *mpt_entry);
 
+int mlx4_get_module_info(struct mlx4_dev *dev, u8 port,
+			 u16 offset, u16 size, u8 *data);
+
 /* Returns true if running in low memory profile (kdump kernel) */
 static inline bool mlx4_low_memory_profile(void)
 {
-- 
cgit v1.2.3-58-ga151


From adbc7ac5c15eb5e9d70393428345e72a1a897d6a Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 27 Oct 2014 11:37:37 +0200
Subject: net/mlx4_core: Introduce ACCESS_REG CMD and eth_prot_ctrl dev cap

Adding ACCESS REG mlx4 command and use it to implement Query method for
PTYS (Port Type and Speed Register).
Query and store eth_prot_ctrl dev cap.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c |   9 +++
 drivers/net/ethernet/mellanox/mlx4/fw.c  | 118 ++++++++++++++++++++++++++++++-
 include/linux/mlx4/cmd.h                 |   2 +
 include/linux/mlx4/device.h              |  40 ++++++++++-
 4 files changed, 166 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index b16e1b95566f..916459effcfa 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1338,6 +1338,15 @@ static struct mlx4_cmd_info cmd_info[] = {
 		.verify = NULL,
 		.wrapper = mlx4_QUERY_IF_STAT_wrapper
 	},
+	{
+		.opcode = MLX4_CMD_ACCESS_REG,
+		.has_inbox = true,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL,
+	},
 	/* Native multicast commands are not available for guests */
 	{
 		.opcode = MLX4_CMD_QP_ATTACH,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 2e88a235e26b..6fd9b8581e95 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -139,7 +139,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[10] = "TCP/IP offloads/flow-steering for VXLAN support",
 		[11] = "MAD DEMUX (Secure-Host) support",
 		[12] = "Large cache line (>64B) CQE stride support",
-		[13] = "Large cache line (>64B) EQE stride support"
+		[13] = "Large cache line (>64B) EQE stride support",
+		[14] = "Ethernet protocol control support"
 	};
 	int i;
 
@@ -560,6 +561,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET	0x76
 #define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET	0x77
 #define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE	0x7a
+#define QUERY_DEV_CAP_ETH_PROT_CTRL_OFFSET	0x7a
 #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET	0x80
 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET	0x82
 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET	0x84
@@ -737,11 +739,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET);
 	dev_cap->max_rq_desc_sz = size;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE);
+	if (field & (1 << 5))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL;
 	if (field & (1 << 6))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
 	if (field & (1 << 7))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
-
 	MLX4_GET(dev_cap->bmme_flags, outbox,
 		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
 	MLX4_GET(dev_cap->reserved_lkey, outbox,
@@ -2144,3 +2147,114 @@ out:
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return err;
 }
+
+/* Access Reg commands */
+enum mlx4_access_reg_masks {
+	MLX4_ACCESS_REG_STATUS_MASK = 0x7f,
+	MLX4_ACCESS_REG_METHOD_MASK = 0x7f,
+	MLX4_ACCESS_REG_LEN_MASK = 0x7ff
+};
+
+struct mlx4_access_reg {
+	__be16 constant1;
+	u8 status;
+	u8 resrvd1;
+	__be16 reg_id;
+	u8 method;
+	u8 constant2;
+	__be32 resrvd2[2];
+	__be16 len_const;
+	__be16 resrvd3;
+#define MLX4_ACCESS_REG_HEADER_SIZE (20)
+	u8 reg_data[MLX4_MAILBOX_SIZE-MLX4_ACCESS_REG_HEADER_SIZE];
+} __attribute__((__packed__));
+
+/**
+ * mlx4_ACCESS_REG - Generic access reg command.
+ * @dev: mlx4_dev.
+ * @reg_id: register ID to access.
+ * @method: Access method Read/Write.
+ * @reg_len: register length to Read/Write in bytes.
+ * @reg_data: reg_data pointer to Read/Write From/To.
+ *
+ * Access ConnectX registers FW command.
+ * Returns 0 on success and copies outbox mlx4_access_reg data
+ * field into reg_data or a negative error code.
+ */
+static int mlx4_ACCESS_REG(struct mlx4_dev *dev, u16 reg_id,
+			   enum mlx4_access_reg_method method,
+			   u16 reg_len, void *reg_data)
+{
+	struct mlx4_cmd_mailbox *inbox, *outbox;
+	struct mlx4_access_reg *inbuf, *outbuf;
+	int err;
+
+	inbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inbox))
+		return PTR_ERR(inbox);
+
+	outbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outbox)) {
+		mlx4_free_cmd_mailbox(dev, inbox);
+		return PTR_ERR(outbox);
+	}
+
+	inbuf = inbox->buf;
+	outbuf = outbox->buf;
+
+	inbuf->constant1 = cpu_to_be16(0x1<<11 | 0x4);
+	inbuf->constant2 = 0x1;
+	inbuf->reg_id = cpu_to_be16(reg_id);
+	inbuf->method = method & MLX4_ACCESS_REG_METHOD_MASK;
+
+	reg_len = min(reg_len, (u16)(sizeof(inbuf->reg_data)));
+	inbuf->len_const =
+		cpu_to_be16(((reg_len/4 + 1) & MLX4_ACCESS_REG_LEN_MASK) |
+			    ((0x3) << 12));
+
+	memcpy(inbuf->reg_data, reg_data, reg_len);
+	err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, 0, 0,
+			   MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	if (outbuf->status & MLX4_ACCESS_REG_STATUS_MASK) {
+		err = outbuf->status & MLX4_ACCESS_REG_STATUS_MASK;
+		mlx4_err(dev,
+			 "MLX4_CMD_ACCESS_REG(%x) returned REG status (%x)\n",
+			 reg_id, err);
+		goto out;
+	}
+
+	memcpy(reg_data, outbuf->reg_data, reg_len);
+out:
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return err;
+}
+
+/* ConnectX registers IDs */
+enum mlx4_reg_id {
+	MLX4_REG_ID_PTYS = 0x5004,
+};
+
+/**
+ * mlx4_ACCESS_PTYS_REG - Access PTYs (Port Type and Speed)
+ * register
+ * @dev: mlx4_dev.
+ * @method: Access method Read/Write.
+ * @ptys_reg: PTYS register data pointer.
+ *
+ * Access ConnectX PTYS register, to Read/Write Port Type/Speed
+ * configuration
+ * Returns 0 on success or a negative error code.
+ */
+int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
+			 enum mlx4_access_reg_method method,
+			 struct mlx4_ptys_reg *ptys_reg)
+{
+	return mlx4_ACCESS_REG(dev, MLX4_REG_ID_PTYS,
+			       method, sizeof(*ptys_reg), ptys_reg);
+}
+EXPORT_SYMBOL_GPL(mlx4_ACCESS_PTYS_REG);
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 379c02648ab3..ff5f5deb3dcf 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -67,6 +67,8 @@ enum {
 	MLX4_CMD_MAP_ICM_AUX	 = 0xffc,
 	MLX4_CMD_UNMAP_ICM_AUX	 = 0xffb,
 	MLX4_CMD_SET_ICM_SIZE	 = 0xffd,
+	MLX4_CMD_ACCESS_REG	 = 0x3b,
+
 	/*master notify fw on finish for slave's flr*/
 	MLX4_CMD_INFORM_FLR_DONE = 0x5b,
 	MLX4_CMD_GET_OP_REQ      = 0x59,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 73910daec317..181cd9fc90f2 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -186,7 +186,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS	= 1LL <<  10,
 	MLX4_DEV_CAP_FLAG2_MAD_DEMUX		= 1LL <<  11,
 	MLX4_DEV_CAP_FLAG2_CQE_STRIDE		= 1LL <<  12,
-	MLX4_DEV_CAP_FLAG2_EQE_STRIDE		= 1LL <<  13
+	MLX4_DEV_CAP_FLAG2_EQE_STRIDE		= 1LL <<  13,
+	MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL        = 1LL <<  14
 };
 
 enum {
@@ -1319,4 +1320,41 @@ static inline bool mlx4_low_memory_profile(void)
 	return is_kdump_kernel();
 }
 
+/* ACCESS REG commands */
+enum mlx4_access_reg_method {
+	MLX4_ACCESS_REG_QUERY = 0x1,
+	MLX4_ACCESS_REG_WRITE = 0x2,
+};
+
+/* ACCESS PTYS Reg command */
+enum mlx4_ptys_proto {
+	MLX4_PTYS_IB = 1<<0,
+	MLX4_PTYS_EN = 1<<2,
+};
+
+struct mlx4_ptys_reg {
+	u8 resrvd1;
+	u8 local_port;
+	u8 resrvd2;
+	u8 proto_mask;
+	__be32 resrvd3[2];
+	__be32 eth_proto_cap;
+	__be16 ib_width_cap;
+	__be16 ib_speed_cap;
+	__be32 resrvd4;
+	__be32 eth_proto_admin;
+	__be16 ib_width_admin;
+	__be16 ib_speed_admin;
+	__be32 resrvd5;
+	__be32 eth_proto_oper;
+	__be16 ib_width_oper;
+	__be16 ib_speed_oper;
+	__be32 resrvd6;
+	__be32 eth_proto_lp_adv;
+} __packed;
+
+int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
+			 enum mlx4_access_reg_method method,
+			 struct mlx4_ptys_reg *ptys_reg);
+
 #endif /* MLX4_DEVICE_H */
-- 
cgit v1.2.3-58-ga151


From a53e3e8c1db547981e13d1ebf24a659bd4e87710 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Mon, 27 Oct 2014 11:37:38 +0200
Subject: net/mlx4_core: Add ethernet backplane autoneg device capability

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c | 7 ++++++-
 include/linux/mlx4/device.h             | 3 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 6fd9b8581e95..72289ef5ebbe 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -140,7 +140,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[11] = "MAD DEMUX (Secure-Host) support",
 		[12] = "Large cache line (>64B) CQE stride support",
 		[13] = "Large cache line (>64B) EQE stride support",
-		[14] = "Ethernet protocol control support"
+		[14] = "Ethernet protocol control support",
+		[15] = "Ethernet Backplane autoneg support"
 	};
 	int i;
 
@@ -575,6 +576,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
+#define QUERY_DEV_CAP_ETH_BACKPL_OFFSET		0x9c
 #define QUERY_DEV_CAP_FW_REASSIGN_MAC		0x9d
 #define QUERY_DEV_CAP_VXLAN			0x9e
 #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET		0xb0
@@ -749,6 +751,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
 	MLX4_GET(dev_cap->reserved_lkey, outbox,
 		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
+	MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET);
+	if (field32 & (1 << 0))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC);
 	if (field & 1<<6)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 181cd9fc90f2..e4c136ebe79b 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -187,7 +187,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_MAD_DEMUX		= 1LL <<  11,
 	MLX4_DEV_CAP_FLAG2_CQE_STRIDE		= 1LL <<  12,
 	MLX4_DEV_CAP_FLAG2_EQE_STRIDE		= 1LL <<  13,
-	MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL        = 1LL <<  14
+	MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL        = 1LL <<  14,
+	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15
 };
 
 enum {
-- 
cgit v1.2.3-58-ga151


From dca145ffaa8d39ea1904491ac81b92b7049372c0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 27 Oct 2014 21:45:24 -0700
Subject: tcp: allow for bigger reordering level

While testing upcoming Yaogong patch (converting out of order queue
into an RB tree), I hit the max reordering level of linux TCP stack.

Reordering level was limited to 127 for no good reason, and some
network setups [1] can easily reach this limit and get limited
throughput.

Allow a new max limit of 300, and add a sysctl to allow admins to even
allow bigger (or lower) values if needed.

[1] Aggregation of links, per packet load balancing, fabrics not doing
 deep packet inspections, alternative TCP congestion modules...

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yaogong Wang <wygivan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.txt   |  7 ++-----
 Documentation/networking/ip-sysctl.txt | 10 +++++++++-
 include/linux/tcp.h                    |  4 ++--
 include/net/tcp.h                      |  4 +---
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp_input.c                   |  3 ++-
 6 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index eeb5b2e97bed..83bf4986baea 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -2230,11 +2230,8 @@ balance-rr: This mode is the only mode that will permit a single
 
 	It is possible to adjust TCP/IP's congestion limits by
 	altering the net.ipv4.tcp_reordering sysctl parameter.  The
-	usual default value is 3, and the maximum useful value is 127.
-	For a four interface balance-rr bond, expect that a single
-	TCP/IP stream will utilize no more than approximately 2.3
-	interface's worth of throughput, even after adjusting
-	tcp_reordering.
+	usual default value is 3. But keep in mind TCP stack is able
+	to automatically increase this when it detects reorders.
 
 	Note that the fraction of packets that will be delivered out of
 	order is highly variable, and is unlikely to be zero.  The level
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 0307e2875f21..9028b879a97b 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -376,9 +376,17 @@ tcp_orphan_retries - INTEGER
 	may consume significant resources. Cf. tcp_max_orphans.
 
 tcp_reordering - INTEGER
-	Maximal reordering of packets in a TCP stream.
+	Initial reordering level of packets in a TCP stream.
+	TCP stack can then dynamically adjust flow reordering level
+	between this initial value and tcp_max_reordering
 	Default: 3
 
+tcp_max_reordering - INTEGER
+	Maximal reordering level of packets in a TCP stream.
+	300 is a fairly conservative value, but you might increase it
+	if paths are using per packet load balancing (like bonding rr mode)
+	Default: 300
+
 tcp_retrans_collapse - BOOLEAN
 	Bug-to-bug compatibility with some broken printers.
 	On retransmit try to send bigger packets to work around bugs in
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index c2dee7deefa8..f566b8567892 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -204,10 +204,10 @@ struct tcp_sock {
 
 	u16	urg_data;	/* Saved octet of OOB data and control flags */
 	u8	ecn_flags;	/* ECN status bits.			*/
-	u8	reordering;	/* Packet reordering metric.		*/
+	u8	keepalive_probes; /* num of allowed keep alive probes	*/
+	u32	reordering;	/* Packet reordering metric.		*/
 	u32	snd_up;		/* Urgent pointer		*/
 
-	u8	keepalive_probes; /* num of allowed keep alive probes	*/
 /*
  *      Options received (usually on last packet, some only on SYN packets).
  */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c73fc145ee45..3a35b1500359 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -70,9 +70,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
 #define TCP_FASTRETRANS_THRESH 3
 
-/* Maximal reordering. */
-#define TCP_MAX_REORDERING	127
-
 /* Maximal number of ACKs sent quickly to accelerate slow-start. */
 #define TCP_MAX_QUICKACKS	16U
 
@@ -252,6 +249,7 @@ extern int sysctl_tcp_abort_on_overflow;
 extern int sysctl_tcp_max_orphans;
 extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
+extern int sysctl_tcp_max_reordering;
 extern int sysctl_tcp_dsack;
 extern long sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b3c53c8b331e..e0ee384a448f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -495,6 +495,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_max_reordering",
+		.data		= &sysctl_tcp_max_reordering,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "tcp_dsack",
 		.data		= &sysctl_tcp_dsack,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a12b455928e5..9a18cdd633f3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
 int sysctl_tcp_fack __read_mostly = 1;
 int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+int sysctl_tcp_max_reordering __read_mostly = 300;
 EXPORT_SYMBOL(sysctl_tcp_reordering);
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
@@ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 	if (metric > tp->reordering) {
 		int mib_idx;
 
-		tp->reordering = min(TCP_MAX_REORDERING, metric);
+		tp->reordering = min(sysctl_tcp_max_reordering, metric);
 
 		/* This exciting event is worth to be remembered. 8) */
 		if (ts)
-- 
cgit v1.2.3-58-ga151


From 7fd2561e4ebdd070ebba6d3326c4c5b13942323f Mon Sep 17 00:00:00 2001
From: Erik Kline <ek@google.com>
Date: Tue, 28 Oct 2014 18:11:14 +0900
Subject: net: ipv6: Add a sysctl to make optimistic addresses useful
 candidates

Add a sysctl that causes an interface's optimistic addresses
to be considered equivalent to other non-deprecated addresses
for source address selection purposes.  Preferred addresses
will still take precedence over optimistic addresses, subject
to other ranking in the source address selection algorithm.

This is useful where different interfaces are connected to
different networks from different ISPs (e.g., a cell network
and a home wifi network).

The current behaviour complies with RFC 3484/6724, and it
makes sense if the host has only one interface, or has
multiple interfaces on the same network (same or cooperating
administrative domain(s), but not in the multiple distinct
networks case.

For example, if a mobile device has an IPv6 address on an LTE
network and then connects to IPv6-enabled wifi, while the wifi
IPv6 address is undergoing DAD, IPv6 connections will try use
the wifi default route with the LTE IPv6 address, and will get
stuck until they time out.

Also, because optimistic nodes can receive frames, issue
an RTM_NEWADDR as soon as DAD starts (with the IFA_F_OPTIMSTIC
flag appropriately set).  A second RTM_NEWADDR is sent if DAD
completes (the address flags have changed), otherwise an
RTM_DELADDR is sent.

Also: add an entry in ip-sysctl.txt for optimistic_dad.

Signed-off-by: Erik Kline <ek@google.com>
Acked-by: Lorenzo Colitti <lorenzo@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 13 ++++++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 46 ++++++++++++++++++++++++++++++++--
 4 files changed, 59 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 9028b879a97b..368e3251c553 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1460,6 +1460,19 @@ suppress_frag_ndisc - INTEGER
 	1 - (default) discard fragmented neighbor discovery packets
 	0 - allow fragmented neighbor discovery packets
 
+optimistic_dad - BOOLEAN
+	Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+		0: disabled (default)
+		1: enabled
+
+use_optimistic - BOOLEAN
+	If enabled, do not classify optimistic addresses as deprecated during
+	source address selection.  Preferred addresses will still be chosen
+	before optimistic addresses, subject to other ranking in the source
+	address selection algorithm.
+		0: disabled (default)
+		1: enabled
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ff560537dd61..7121a2e97ce2 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -42,6 +42,7 @@ struct ipv6_devconf {
 	__s32		accept_ra_from_local;
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	__s32		optimistic_dad;
+	__s32		use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 	__s32		mc_forwarding;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index efa2666f4b8a..e863d088b9a5 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -164,6 +164,7 @@ enum {
 	DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
 	DEVCONF_SUPPRESS_FRAG_NDISC,
 	DEVCONF_ACCEPT_RA_FROM_LOCAL,
+	DEVCONF_USE_OPTIMISTIC,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 50b95b2db87c..8d12b7c7018f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1170,6 +1170,9 @@ enum {
 	IPV6_SADDR_RULE_PRIVACY,
 	IPV6_SADDR_RULE_ORCHID,
 	IPV6_SADDR_RULE_PREFIX,
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	IPV6_SADDR_RULE_NOT_OPTIMISTIC,
+#endif
 	IPV6_SADDR_RULE_MAX
 };
 
@@ -1197,6 +1200,15 @@ static inline int ipv6_saddr_preferred(int type)
 	return 0;
 }
 
+static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev)
+{
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic;
+#else
+	return false;
+#endif
+}
+
 static int ipv6_get_saddr_eval(struct net *net,
 			       struct ipv6_saddr_score *score,
 			       struct ipv6_saddr_dst *dst,
@@ -1257,10 +1269,16 @@ static int ipv6_get_saddr_eval(struct net *net,
 		score->scopedist = ret;
 		break;
 	case IPV6_SADDR_RULE_PREFERRED:
+	    {
 		/* Rule 3: Avoid deprecated and optimistic addresses */
+		u8 avoid = IFA_F_DEPRECATED;
+
+		if (!ipv6_use_optimistic_addr(score->ifa->idev))
+			avoid |= IFA_F_OPTIMISTIC;
 		ret = ipv6_saddr_preferred(score->addr_type) ||
-		      !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC));
+		      !(score->ifa->flags & avoid);
 		break;
+	    }
 #ifdef CONFIG_IPV6_MIP6
 	case IPV6_SADDR_RULE_HOA:
 	    {
@@ -1306,6 +1324,14 @@ static int ipv6_get_saddr_eval(struct net *net,
 			ret = score->ifa->prefix_len;
 		score->matchlen = ret;
 		break;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	case IPV6_SADDR_RULE_NOT_OPTIMISTIC:
+		/* Optimistic addresses still have lower precedence than other
+		 * preferred addresses.
+		 */
+		ret = !(score->ifa->flags & IFA_F_OPTIMISTIC);
+		break;
+#endif
 	default:
 		ret = 0;
 	}
@@ -3222,8 +3248,15 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	 * Optimistic nodes can start receiving
 	 * Frames right away
 	 */
-	if (ifp->flags & IFA_F_OPTIMISTIC)
+	if (ifp->flags & IFA_F_OPTIMISTIC) {
 		ip6_ins_rt(ifp->rt);
+		if (ipv6_use_optimistic_addr(idev)) {
+			/* Because optimistic nodes can use this address,
+			 * notify listeners. If DAD fails, RTM_DELADDR is sent.
+			 */
+			ipv6_ifa_notify(RTM_NEWADDR, ifp);
+		}
+	}
 
 	addrconf_dad_kick(ifp);
 out:
@@ -4330,6 +4363,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad;
+	array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 	array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
@@ -5155,6 +5189,14 @@ static struct addrconf_sysctl_table
 			.proc_handler   = proc_dointvec,
 
 		},
+		{
+			.procname       = "use_optimistic",
+			.data           = &ipv6_devconf.use_optimistic,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 		{
-- 
cgit v1.2.3-58-ga151


From bc9ad166e38ae1cdcb5323a8aa45dff834d68bfa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Oct 2014 18:05:13 -0700
Subject: net: introduce napi_schedule_irqoff()

napi_schedule() can be called from any context and has to mask hard
irqs.

Add a variant that can only be called from hard interrupts handlers
or when irqs are already masked.

Many NIC drivers can use it from their hard IRQ handler instead of
generic variant.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 13 +++++++++++++
 net/core/dev.c            | 15 ++++++++++++++-
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 74fd5d37f15a..c85e06512246 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -386,6 +386,7 @@ typedef enum rx_handler_result rx_handler_result_t;
 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
 
 void __napi_schedule(struct napi_struct *n);
+void __napi_schedule_irqoff(struct napi_struct *n);
 
 static inline bool napi_disable_pending(struct napi_struct *n)
 {
@@ -420,6 +421,18 @@ static inline void napi_schedule(struct napi_struct *n)
 		__napi_schedule(n);
 }
 
+/**
+ *	napi_schedule_irqoff - schedule NAPI poll
+ *	@n: napi context
+ *
+ * Variant of napi_schedule(), assuming hard irqs are masked.
+ */
+static inline void napi_schedule_irqoff(struct napi_struct *n)
+{
+	if (napi_schedule_prep(n))
+		__napi_schedule_irqoff(n);
+}
+
 /* Try to reschedule poll. Called by dev->poll() after napi_complete().  */
 static inline bool napi_reschedule(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index b793e3521a36..759940cdf896 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4372,7 +4372,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
  * __napi_schedule - schedule for receive
  * @n: entry to schedule
  *
- * The entry's receive function will be scheduled to run
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
  */
 void __napi_schedule(struct napi_struct *n)
 {
@@ -4384,6 +4385,18 @@ void __napi_schedule(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule);
 
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked
+ */
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+	____napi_schedule(this_cpu_ptr(&softnet_data), n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
+
 void __napi_complete(struct napi_struct *n)
 {
 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-- 
cgit v1.2.3-58-ga151


From cb904b0a16305f9b2a98200cc6eb9dc3610278b0 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 2 Nov 2014 04:18:45 +0100
Subject: ieee802154: add extended address validation helper

This patch introduce an extended address validation helper to check if
an extended address is valid or not.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 6e50a2a1d485..9bba5ca7f0ad 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -197,4 +197,18 @@ static inline bool ieee802154_is_valid_psdu_len(const u8 len)
 	return (len >= IEEE802154_MIN_PSDU_LEN && len <= IEEE802154_MTU);
 }
 
+/**
+ * ieee802154_is_valid_psdu_len - check if extended addr is valid
+ * @addr: extended addr to check
+ */
+static inline bool ieee802154_is_valid_extended_addr(const __le64 addr)
+{
+	/* These EUI-64 addresses are reserved by IEEE. 0xffffffffffffffff
+	 * is used internally as extended to short address broadcast mapping.
+	 * This is currently a workaround because neighbor discovery can't
+	 * deal with short addresses types right now.
+	 */
+	return ((addr != 0x0000000000000000) || (addr != 0xffffffffffffffff));
+}
+
 #endif /* LINUX_IEEE802154_H */
-- 
cgit v1.2.3-58-ga151


From a4164eb4dd3f4f2a22f8bf7b26394e8384f3d9a2 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 2 Nov 2014 21:43:01 +0100
Subject: ieee802154: add missing ULL definition

Running make C=2 occurs warning:

constant 0xffffffffffffffff is so big it is unsigned long

This patch fix this warning by adding a ULL to the constant definitions.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reported-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 9bba5ca7f0ad..9da7c011fbba 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -208,7 +208,8 @@ static inline bool ieee802154_is_valid_extended_addr(const __le64 addr)
 	 * This is currently a workaround because neighbor discovery can't
 	 * deal with short addresses types right now.
 	 */
-	return ((addr != 0x0000000000000000) || (addr != 0xffffffffffffffff));
+	return ((addr != 0x0000000000000000ULL) ||
+		(addr != 0xffffffffffffffffULL));
 }
 
 #endif /* LINUX_IEEE802154_H */
-- 
cgit v1.2.3-58-ga151


From c28bee84c5c49312befe1b442e1044ac2392d80d Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 2 Nov 2014 21:43:02 +0100
Subject: ieee802154: fix byteorder issues

This patch fix byteorder issues which occurs because we compare __le64
with an host byteorder value. Simple add a cpu_to_le64 to convert the
host byteorder values to __le64.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Reported-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 9da7c011fbba..5d9e7459d94b 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -24,6 +24,7 @@
 #define LINUX_IEEE802154_H
 
 #include <linux/types.h>
+#include <asm/byteorder.h>
 
 #define IEEE802154_MTU			127
 #define IEEE802154_MIN_PSDU_LEN		5
@@ -208,8 +209,8 @@ static inline bool ieee802154_is_valid_extended_addr(const __le64 addr)
 	 * This is currently a workaround because neighbor discovery can't
 	 * deal with short addresses types right now.
 	 */
-	return ((addr != 0x0000000000000000ULL) ||
-		(addr != 0xffffffffffffffffULL));
+	return ((addr != cpu_to_le64(0x0000000000000000ULL)) ||
+		(addr != cpu_to_le64(0xffffffffffffffffULL)));
 }
 
 #endif /* LINUX_IEEE802154_H */
-- 
cgit v1.2.3-58-ga151


From 4cdb1e2e3d3495423db558d3bb7ed11d66aabce7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 2 Nov 2014 06:00:12 -0800
Subject: net: shrink struct softnet_data

flow_limit in struct softnet_data is only read from local cpu
and can be moved to fill a hole, reducing softnet_data size by
64 bytes on x86_64

While we are at it, move output_queue, output_queue_tailp and
completion_queue, so that rx / tx paths touch a single cache line.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c85e06512246..5ed05bd764dc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2329,10 +2329,7 @@ extern int netdev_flow_limit_table_len;
  * Incoming packets are placed on per-cpu queues
  */
 struct softnet_data {
-	struct Qdisc		*output_queue;
-	struct Qdisc		**output_queue_tailp;
 	struct list_head	poll_list;
-	struct sk_buff		*completion_queue;
 	struct sk_buff_head	process_queue;
 
 	/* stats */
@@ -2340,10 +2337,17 @@ struct softnet_data {
 	unsigned int		time_squeeze;
 	unsigned int		cpu_collision;
 	unsigned int		received_rps;
-
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
+#endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+	struct sd_flow_limit __rcu *flow_limit;
+#endif
+	struct Qdisc		*output_queue;
+	struct Qdisc		**output_queue_tailp;
+	struct sk_buff		*completion_queue;
 
+#ifdef CONFIG_RPS
 	/* Elements below can be accessed between CPUs for RPS */
 	struct call_single_data	csd ____cacheline_aligned_in_smp;
 	struct softnet_data	*rps_ipi_next;
@@ -2355,9 +2359,6 @@ struct softnet_data {
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 
-#ifdef CONFIG_NET_FLOW_LIMIT
-	struct sd_flow_limit __rcu *flow_limit;
-#endif
 };
 
 static inline void input_queue_head_incr(struct softnet_data *sd)
-- 
cgit v1.2.3-58-ga151


From d475c95b4bcff983ac76e8522bfd2d29bcc567d0 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Sun, 2 Nov 2014 16:26:17 +0200
Subject: net/mlx4_core: Add retrieval of CONFIG_DEV parameters

Add code to issue CONFIG_DEV "get" firmware command.

This command is used in order to obtain certain parameters used for
supporting various RX checksumming options and vxlan UDP port.

The GET operation is allowed for VFs too.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Shani Michaeli <shanim@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c           |  4 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c            | 88 +++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |  5 ++
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  | 17 +++++
 include/linux/mlx4/cmd.h                           | 29 +++++++
 include/linux/mlx4/device.h                        |  3 +-
 6 files changed, 139 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 1312ccf8b83a..3c05e5878b49 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -990,11 +990,11 @@ static struct mlx4_cmd_info cmd_info[] = {
 	{
 		.opcode = MLX4_CMD_CONFIG_DEV,
 		.has_inbox = false,
-		.has_outbox = false,
+		.has_outbox = true,
 		.out_is_imm = false,
 		.encode_slave_id = false,
 		.verify = NULL,
-		.wrapper = mlx4_CMD_EPERM_wrapper
+		.wrapper = mlx4_CONFIG_DEV_wrapper
 	},
 	{
 		.opcode = MLX4_CMD_ALLOC_RES,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index e7639e31fc8a..d6dba77ae4ba 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -141,7 +141,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[12] = "Large cache line (>64B) CQE stride support",
 		[13] = "Large cache line (>64B) EQE stride support",
 		[14] = "Ethernet protocol control support",
-		[15] = "Ethernet Backplane autoneg support"
+		[15] = "Ethernet Backplane autoneg support",
+		[16] = "CONFIG DEV support"
 	};
 	int i;
 
@@ -574,6 +575,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET	0x90
 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
+#define QUERY_DEV_CAP_CONFIG_DEV_OFFSET		0x94
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET		0x9c
@@ -749,6 +751,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
 	MLX4_GET(dev_cap->bmme_flags, outbox,
 		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET);
+	if (field & 0x20)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV;
 	MLX4_GET(dev_cap->reserved_lkey, outbox,
 		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET);
@@ -1849,14 +1854,18 @@ int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
 
 struct mlx4_config_dev {
 	__be32	update_flags;
-	__be32	rsdv1[3];
+	__be32	rsvd1[3];
 	__be16	vxlan_udp_dport;
 	__be16	rsvd2;
+	__be32	rsvd3[27];
+	__be16	rsvd4;
+	u8	rsvd5;
+	u8	rx_checksum_val;
 };
 
 #define MLX4_VXLAN_UDP_DPORT (1 << 0)
 
-static int mlx4_CONFIG_DEV(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
+static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
 {
 	int err;
 	struct mlx4_cmd_mailbox *mailbox;
@@ -1874,6 +1883,77 @@ static int mlx4_CONFIG_DEV(struct mlx4_dev *dev, struct mlx4_config_dev *config_
 	return err;
 }
 
+static int mlx4_CONFIG_DEV_get(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
+{
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 1, MLX4_CMD_CONFIG_DEV,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	if (!err)
+		memcpy(config_dev, mailbox->buf, sizeof(*config_dev));
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+/* Conversion between the HW values and the actual functionality.
+ * The value represented by the array index,
+ * and the functionality determined by the flags.
+ */
+static const u8 config_dev_csum_flags[] = {
+	[0] =	0,
+	[1] =	MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP,
+	[2] =	MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP	|
+		MLX4_RX_CSUM_MODE_L4,
+	[3] =	MLX4_RX_CSUM_MODE_L4			|
+		MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP	|
+		MLX4_RX_CSUM_MODE_MULTI_VLAN
+};
+
+int mlx4_config_dev_retrieval(struct mlx4_dev *dev,
+			      struct mlx4_config_dev_params *params)
+{
+	struct mlx4_config_dev config_dev;
+	int err;
+	u8 csum_mask;
+
+#define CONFIG_DEV_RX_CSUM_MODE_MASK			0x7
+#define CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET	0
+#define CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET	4
+
+	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CONFIG_DEV))
+		return -ENOTSUPP;
+
+	err = mlx4_CONFIG_DEV_get(dev, &config_dev);
+	if (err)
+		return err;
+
+	csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET) &
+			CONFIG_DEV_RX_CSUM_MODE_MASK;
+
+	if (csum_mask >= sizeof(config_dev_csum_flags)/sizeof(config_dev_csum_flags[0]))
+		return -EINVAL;
+	params->rx_csum_flags_port_1 = config_dev_csum_flags[csum_mask];
+
+	csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET) &
+			CONFIG_DEV_RX_CSUM_MODE_MASK;
+
+	if (csum_mask >= sizeof(config_dev_csum_flags)/sizeof(config_dev_csum_flags[0]))
+		return -EINVAL;
+	params->rx_csum_flags_port_2 = config_dev_csum_flags[csum_mask];
+
+	params->vxlan_udp_dport = be16_to_cpu(config_dev.vxlan_udp_dport);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_config_dev_retrieval);
+
 int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port)
 {
 	struct mlx4_config_dev config_dev;
@@ -1882,7 +1962,7 @@ int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port)
 	config_dev.update_flags    = cpu_to_be32(MLX4_VXLAN_UDP_DPORT);
 	config_dev.vxlan_udp_dport = udp_port;
 
-	return mlx4_CONFIG_DEV(dev, &config_dev);
+	return mlx4_CONFIG_DEV_set(dev, &config_dev);
 }
 EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 254ec7b1ca2f..f8fc7bd6f48b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -947,6 +947,11 @@ int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave,
 			  struct mlx4_cmd_mailbox *inbox,
 			  struct mlx4_cmd_mailbox *outbox,
 			  struct mlx4_cmd_info *cmd);
+int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
 int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
 		     struct mlx4_vhcr *vhcr,
 		     struct mlx4_cmd_mailbox *inbox,
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 5d2498dcf536..d718ca0f88da 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -2872,6 +2872,23 @@ out_add:
 	return err;
 }
 
+int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	u8 get = vhcr->op_modifier;
+
+	if (get != 1)
+		return -EPERM;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+
+	return err;
+}
+
 static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start,
 			      int len, struct res_mtt **res)
 {
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index ff5f5deb3dcf..64d25941b329 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -199,6 +199,33 @@ enum {
 	MLX4_CMD_NATIVE
 };
 
+/*
+ * MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP -
+ * Receive checksum value is reported in CQE also for non TCP/UDP packets.
+ *
+ * MLX4_RX_CSUM_MODE_L4 -
+ * L4_CSUM bit in CQE, which indicates whether or not L4 checksum
+ * was validated correctly, is supported.
+ *
+ * MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP -
+ * IP_OK CQE's field is supported also for non TCP/UDP IP packets.
+ *
+ * MLX4_RX_CSUM_MODE_MULTI_VLAN -
+ * Receive Checksum offload is supported for packets with more than 2 vlan headers.
+ */
+enum mlx4_rx_csum_mode {
+	MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP		= 1UL << 0,
+	MLX4_RX_CSUM_MODE_L4				= 1UL << 1,
+	MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP		= 1UL << 2,
+	MLX4_RX_CSUM_MODE_MULTI_VLAN			= 1UL << 3
+};
+
+struct mlx4_config_dev_params {
+	u16	vxlan_udp_dport;
+	u8	rx_csum_flags_port_1;
+	u8	rx_csum_flags_port_2;
+};
+
 struct mlx4_dev;
 
 struct mlx4_cmd_mailbox {
@@ -250,6 +277,8 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos);
 int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting);
 int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf);
 int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state);
+int mlx4_config_dev_retrieval(struct mlx4_dev *dev,
+			      struct mlx4_config_dev_params *params);
 /*
  * mlx4_get_slave_default_vlan -
  * return true if VST ( default vlan)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index e4c136ebe79b..5cc5eac47d1b 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -188,7 +188,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_CQE_STRIDE		= 1LL <<  12,
 	MLX4_DEV_CAP_FLAG2_EQE_STRIDE		= 1LL <<  13,
 	MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL        = 1LL <<  14,
-	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15
+	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15,
+	MLX4_DEV_CAP_FLAG2_CONFIG_DEV		= 1LL <<  16
 };
 
 enum {
-- 
cgit v1.2.3-58-ga151


From 56b174256b6936ec4c1ed8f3407109ac6929d3ca Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 3 Nov 2014 08:19:53 -0800
Subject: net: add rbnode to struct sk_buff

Yaogong replaces TCP out of order receive queue by an RB tree.

As netem already does a private skb->{next/prev/tstamp} union
with a 'struct rb_node', lets do this in a cleaner way.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yaogong Wang <wygivan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 20 +++++++++++++-------
 net/sched/sch_netem.c  | 27 +++++++--------------------
 2 files changed, 20 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6c8b6f604e76..5ad9675b6fe1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -20,6 +20,7 @@
 #include <linux/time.h>
 #include <linux/bug.h>
 #include <linux/cache.h>
+#include <linux/rbtree.h>
 
 #include <linux/atomic.h>
 #include <asm/types.h>
@@ -440,6 +441,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
  *	@next: Next buffer in list
  *	@prev: Previous buffer in list
  *	@tstamp: Time we arrived/left
+ *	@rbnode: RB tree node, alternative to next/prev for netem/tcp
  *	@sk: Socket we are owned by
  *	@dev: Device we arrived on/are leaving by
  *	@cb: Control buffer. Free for use by every layer. Put private vars here
@@ -504,15 +506,19 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
  */
 
 struct sk_buff {
-	/* These two members must be first. */
-	struct sk_buff		*next;
-	struct sk_buff		*prev;
-
 	union {
-		ktime_t		tstamp;
-		struct skb_mstamp skb_mstamp;
+		struct {
+			/* These two members must be first. */
+			struct sk_buff		*next;
+			struct sk_buff		*prev;
+
+			union {
+				ktime_t		tstamp;
+				struct skb_mstamp skb_mstamp;
+			};
+		};
+		struct rb_node	rbnode; /* used in netem & tcp stack */
 	};
-
 	struct sock		*sk;
 	struct net_device	*dev;
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index b34331967e02..179f1c8c0d8b 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -139,33 +139,20 @@ struct netem_sched_data {
 
 /* Time stamp put into socket buffer control block
  * Only valid when skbs are in our internal t(ime)fifo queue.
+ *
+ * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
+ * and skb->next & skb->prev are scratch space for a qdisc,
+ * we save skb->tstamp value in skb->cb[] before destroying it.
  */
 struct netem_skb_cb {
 	psched_time_t	time_to_send;
 	ktime_t		tstamp_save;
 };
 
-/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
- * to hold a rb_node structure.
- *
- * If struct sk_buff layout is changed, the following checks will complain.
- */
-static struct rb_node *netem_rb_node(struct sk_buff *skb)
-{
-	BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
-	BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
-		     offsetof(struct sk_buff, next) + sizeof(skb->next));
-	BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
-		     offsetof(struct sk_buff, prev) + sizeof(skb->prev));
-	BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
-					      sizeof(skb->prev) +
-					      sizeof(skb->tstamp));
-	return (struct rb_node *)&skb->next;
-}
 
 static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
 {
-	return (struct sk_buff *)rb;
+	return container_of(rb, struct sk_buff, rbnode);
 }
 
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
@@ -403,8 +390,8 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		else
 			p = &parent->rb_left;
 	}
-	rb_link_node(netem_rb_node(nskb), parent, p);
-	rb_insert_color(netem_rb_node(nskb), &q->t_root);
+	rb_link_node(&nskb->rbnode, parent, p);
+	rb_insert_color(&nskb->rbnode, &q->t_root);
 	sch->q.qlen++;
 }
 
-- 
cgit v1.2.3-58-ga151


From 98a18b6ffc79baa69f4a0d1bae58faf2a8aef4c8 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Sun, 2 Nov 2014 06:44:54 +0100
Subject: netdevice: add ieee802154_ptr to net_device

This patch adds an ieee802154_ptr to the net_device structure.
Furthermore the 802.15.4 subsystem will introduce a nl802154 framework
which is similar like the nl80211 framework and a wpan_dev structure.
The wpan_dev structure will hold additional net_device attributes like
address options which are 802.15.4 specific. In the upcoming nl802154
implementation we will introduce a NL802154_FLAG_NEED_WPAN_DEV like
NL80211_FLAG_NEED_WDEV. For this flag an ieee802154_ptr in net_device is
needed. Additional we can access the wpan_dev attributes in upper layers
like IEEE 802.15.4 6LoWPAN easily. Current solution is a complicated
callback interface and getting these values over subif data structure
in mac802154.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/netdevice.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 74fd5d37f15a..c9bcf33efb47 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -57,6 +57,8 @@ struct device;
 struct phy_device;
 /* 802.11 specific */
 struct wireless_dev;
+/* 802.15.4 specific */
+struct wpan_dev;
 
 void netdev_set_default_ethtool_ops(struct net_device *dev,
 				    const struct ethtool_ops *ops);
@@ -1572,6 +1574,7 @@ struct net_device {
 	struct inet6_dev __rcu	*ip6_ptr;
 	void			*ax25_ptr;
 	struct wireless_dev	*ieee80211_ptr;
+	struct wpan_dev		*ieee802154_ptr;
 
 /*
  * Cache lines mostly used on receive path (including eth_type_trans())
-- 
cgit v1.2.3-58-ga151


From 0563921abf01a7a38b5f670c3de05dc0b0b8617d Mon Sep 17 00:00:00 2001
From: Eran Harary <eran.harary@intel.com>
Date: Mon, 3 Nov 2014 20:06:47 +0200
Subject: ieee80211: add "max length of AMPDU" enum for VHT

Maximum length of AMPDU that an STA can receive in VHT.
length = 2 ^ (13 + max_ampdu_length_exp) - 1.

Signed-off-by: Eran Harary <eran.harary@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 5fab17b382b5..f65b5446d983 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1274,7 +1274,7 @@ struct ieee80211_ht_cap {
 #define		IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT	2
 
 /*
- * Maximum length of AMPDU that the STA can receive.
+ * Maximum length of AMPDU that the STA can receive in high-throughput (HT).
  * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
  */
 enum ieee80211_max_ampdu_length_exp {
@@ -1284,6 +1284,21 @@ enum ieee80211_max_ampdu_length_exp {
 	IEEE80211_HT_MAX_AMPDU_64K = 3
 };
 
+/*
+ * Maximum length of AMPDU that the STA can receive in VHT.
+ * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
+ */
+enum ieee80211_vht_max_ampdu_length_exp {
+	IEEE80211_VHT_MAX_AMPDU_8K = 0,
+	IEEE80211_VHT_MAX_AMPDU_16K = 1,
+	IEEE80211_VHT_MAX_AMPDU_32K = 2,
+	IEEE80211_VHT_MAX_AMPDU_64K = 3,
+	IEEE80211_VHT_MAX_AMPDU_128K = 4,
+	IEEE80211_VHT_MAX_AMPDU_256K = 5,
+	IEEE80211_VHT_MAX_AMPDU_512K = 6,
+	IEEE80211_VHT_MAX_AMPDU_1024K = 7
+};
+
 #define IEEE80211_HT_MAX_AMPDU_FACTOR 13
 
 /* Minimum MPDU start spacing */
-- 
cgit v1.2.3-58-ga151


From 0d8a52f933f817d0b62955a5a362fb7f2508f06c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 4 Nov 2014 11:55:09 +0300
Subject: ieee802154: || vs && in ieee802154_is_valid_extended_addr()

The ieee802154_is_valid_extended_addr() always returns true because
there is a typo.  The || should be &&.  Neither 0x0000000000000000ULL
nor 0xffffffffffffffffULL are valid addresses.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 5d9e7459d94b..4c032863cd71 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -209,7 +209,7 @@ static inline bool ieee802154_is_valid_extended_addr(const __le64 addr)
 	 * This is currently a workaround because neighbor discovery can't
 	 * deal with short addresses types right now.
 	 */
-	return ((addr != cpu_to_le64(0x0000000000000000ULL)) ||
+	return ((addr != cpu_to_le64(0x0000000000000000ULL)) &&
 		(addr != cpu_to_le64(0xffffffffffffffffULL)));
 }
 
-- 
cgit v1.2.3-58-ga151


From 1906bbbddbe085b19be2c21cd132335260f551c3 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Wed, 5 Nov 2014 20:51:21 +0100
Subject: ieee802154: add IEEE802154_EXTENDED_ADDR_LEN

This patch adds a new define for getting the length of an extended
address.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 4c032863cd71..a907fe59b1d1 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -29,6 +29,8 @@
 #define IEEE802154_MTU			127
 #define IEEE802154_MIN_PSDU_LEN		5
 
+#define IEEE802154_EXTENDED_ADDR_LEN	8
+
 #define IEEE802154_FC_TYPE_BEACON	0x0	/* Frame is beacon */
 #define	IEEE802154_FC_TYPE_DATA		0x1	/* Frame is data */
 #define IEEE802154_FC_TYPE_ACK		0x2	/* Frame is acknowledgment */
-- 
cgit v1.2.3-58-ga151


From 35d5a374a559a1ba9c6810739cf3ad1d672c2de2 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Wed, 5 Nov 2014 20:51:22 +0100
Subject: ieee802154: add ieee802154_random_extended_addr

This patch adds a new function to generate a random IEEE 802.15.4
extended address.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index a907fe59b1d1..d043449a079d 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -24,6 +24,7 @@
 #define LINUX_IEEE802154_H
 
 #include <linux/types.h>
+#include <linux/random.h>
 #include <asm/byteorder.h>
 
 #define IEEE802154_MTU			127
@@ -215,4 +216,17 @@ static inline bool ieee802154_is_valid_extended_addr(const __le64 addr)
 		(addr != cpu_to_le64(0xffffffffffffffffULL)));
 }
 
+/**
+ * ieee802154_random_extended_addr - generates a random extended address
+ * @addr: extended addr pointer to place the random address
+ */
+static inline void ieee802154_random_extended_addr(__le64 *addr)
+{
+	get_random_bytes(addr, IEEE802154_EXTENDED_ADDR_LEN);
+
+	/* toggle some bit if we hit an invalid extended addr */
+	if (!ieee802154_is_valid_extended_addr(*addr))
+		((u8 *)addr)[IEEE802154_EXTENDED_ADDR_LEN - 1] ^= 0x01;
+}
+
 #endif /* LINUX_IEEE802154_H */
-- 
cgit v1.2.3-58-ga151


From e585f23636370320bc2071ca5ba2744ae37c3e51 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Tue, 4 Nov 2014 09:06:54 -0800
Subject: udp: Changes to udp_offload to support remote checksum offload

Add a new GSO type, SKB_GSO_TUNNEL_REMCSUM, which indicates remote
checksum offload being done (in this case inner checksum must not
be offloaded to the NIC).

Added logic in __skb_udp_tunnel_segment to handle remote checksum
offload case.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h |  4 +++-
 include/linux/netdevice.h       |  1 +
 include/linux/skbuff.h          |  4 +++-
 net/core/skbuff.c               |  4 ++--
 net/ipv4/af_inet.c              |  1 +
 net/ipv4/tcp_offload.c          |  1 +
 net/ipv4/udp_offload.c          | 18 ++++++++++++++++--
 net/ipv6/ip6_offload.c          |  1 +
 net/ipv6/udp_offload.c          |  1 +
 9 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index dcfdecbfa0b7..8c94b07e654a 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -48,8 +48,9 @@ enum {
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
 	NETIF_F_GSO_MPLS_BIT,		/* ... MPLS segmentation */
+	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
-		NETIF_F_GSO_MPLS_BIT,
+		NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
 
 	NETIF_F_FCOE_CRC_BIT,		/* FCoE CRC32 */
 	NETIF_F_SCTP_CSUM_BIT,		/* SCTP checksum offload */
@@ -119,6 +120,7 @@ enum {
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_GSO_MPLS	__NETIF_F(GSO_MPLS)
+#define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5ed05bd764dc..4767f546d7c0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3584,6 +3584,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_MPLS    != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ad9675b6fe1..74ed34413969 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -373,6 +373,7 @@ enum {
 
 	SKB_GSO_MPLS = 1 << 12,
 
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
 };
 
 #if BITS_PER_LONG > 32
@@ -603,7 +604,8 @@ struct sk_buff {
 #endif
 	__u8			ipvs_property:1;
 	__u8			inner_protocol_type:1;
-	/* 4 or 6 bit hole */
+	__u8			remcsum_offload:1;
+	/* 3 or 5 bit hole */
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e48e5c02e877..700189604f3d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3013,7 +3013,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
 		if (nskb->len == len + doffset)
 			goto perform_csum_check;
 
-		if (!sg) {
+		if (!sg && !nskb->remcsum_offload) {
 			nskb->ip_summed = CHECKSUM_NONE;
 			nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
 							    skb_put(nskb, len),
@@ -3085,7 +3085,7 @@ skip_fraglist:
 		nskb->truesize += nskb->data_len;
 
 perform_csum_check:
-		if (!csum) {
+		if (!csum && !nskb->remcsum_offload) {
 			nskb->csum = skb_checksum(nskb, doffset,
 						  nskb->len - doffset, 0);
 			nskb->ip_summed = CHECKSUM_NONE;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8b7fe5b03906..ed2c672c5b01 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1222,6 +1222,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_TCPV6 |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
+		       SKB_GSO_TUNNEL_REMCSUM |
 		       SKB_GSO_MPLS |
 		       0)))
 		goto out;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5b90f2f447a5..a1b2a5624f91 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -97,6 +97,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			       SKB_GSO_MPLS |
 			       SKB_GSO_UDP_TUNNEL |
 			       SKB_GSO_UDP_TUNNEL_CSUM |
+			       SKB_GSO_TUNNEL_REMCSUM |
 			       0) ||
 			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
 			goto out;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a774711a88b9..0a5a70d0e84c 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -41,7 +41,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	unsigned int oldlen;
 	bool need_csum = !!(skb_shinfo(skb)->gso_type &
 			    SKB_GSO_UDP_TUNNEL_CSUM);
-	bool offload_csum = false, dont_encap = need_csum;
+	bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+	bool offload_csum = false, dont_encap = (need_csum || remcsum);
 
 	oldlen = (u16)~skb->len;
 
@@ -55,6 +56,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 	skb->mac_len = skb_inner_network_offset(skb);
 	skb->protocol = new_protocol;
 	skb->encap_hdr_csum = need_csum;
+	skb->remcsum_offload = remcsum;
 
 	/* Try to offload checksum if possible */
 	offload_csum = !!(need_csum &&
@@ -108,11 +110,22 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		uh->check = ~csum_fold((__force __wsum)
 				       ((__force u32)uh->check +
 					(__force u32)delta));
-
 		if (offload_csum) {
 			skb->ip_summed = CHECKSUM_PARTIAL;
 			skb->csum_start = skb_transport_header(skb) - skb->head;
 			skb->csum_offset = offsetof(struct udphdr, check);
+		} else if (remcsum) {
+			/* Need to calculate checksum from scratch,
+			 * inner checksums are never when doing
+			 * remote_checksum_offload.
+			 */
+
+			skb->csum = skb_checksum(skb, udp_offset,
+						 skb->len - udp_offset,
+						 0);
+			uh->check = csum_fold(skb->csum);
+			if (uh->check == 0)
+				uh->check = CSUM_MANGLED_0;
 		} else {
 			uh->check = gso_make_checksum(skb, ~uh->check);
 
@@ -192,6 +205,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
 				      SKB_GSO_UDP_TUNNEL |
 				      SKB_GSO_UDP_TUNNEL_CSUM |
+				      SKB_GSO_TUNNEL_REMCSUM |
 				      SKB_GSO_IPIP |
 				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
 				      SKB_GSO_MPLS) ||
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index a071563a7e6e..e9767079a360 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -78,6 +78,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_SIT |
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
+		       SKB_GSO_TUNNEL_REMCSUM |
 		       SKB_GSO_MPLS |
 		       SKB_GSO_TCPV6 |
 		       0)))
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 6b8f543f6ac6..637ba2e438b7 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -42,6 +42,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 				      SKB_GSO_DODGY |
 				      SKB_GSO_UDP_TUNNEL |
 				      SKB_GSO_UDP_TUNNEL_CSUM |
+				      SKB_GSO_TUNNEL_REMCSUM |
 				      SKB_GSO_GRE |
 				      SKB_GSO_GRE_CSUM |
 				      SKB_GSO_IPIP |
-- 
cgit v1.2.3-58-ga151


From 51f3d02b980a338cd291d2bc7629cdfb2568424b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 5 Nov 2014 16:46:40 -0500
Subject: net: Add and use skb_copy_datagram_msg() helper.

This encapsulates all of the skb_copy_datagram_iovec() callers
with call argument signature "skb, offset, msghdr->msg_iov, length".

When we move to iov_iters in the networking, the iov_iter object will
sit in the msghdr.

Having a helper like this means there will be less places to touch
during that transformation.

Based upon descriptions and patch from Al Viro.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/isdn/mISDN/socket.c    | 2 +-
 drivers/net/ppp/pppoe.c        | 2 +-
 include/linux/skbuff.h         | 6 ++++++
 net/appletalk/ddp.c            | 2 +-
 net/atm/common.c               | 2 +-
 net/ax25/af_ax25.c             | 2 +-
 net/bluetooth/af_bluetooth.c   | 4 ++--
 net/bluetooth/hci_sock.c       | 2 +-
 net/caif/caif_socket.c         | 2 +-
 net/core/sock.c                | 2 +-
 net/dccp/proto.c               | 2 +-
 net/ieee802154/dgram.c         | 2 +-
 net/ieee802154/raw.c           | 2 +-
 net/ipv4/ip_sockglue.c         | 2 +-
 net/ipv4/ping.c                | 2 +-
 net/ipv4/raw.c                 | 2 +-
 net/ipv4/tcp.c                 | 5 ++---
 net/ipv4/udp.c                 | 4 ++--
 net/ipv6/datagram.c            | 4 ++--
 net/ipv6/raw.c                 | 4 ++--
 net/ipv6/udp.c                 | 4 ++--
 net/ipx/af_ipx.c               | 3 +--
 net/irda/af_irda.c             | 2 +-
 net/iucv/af_iucv.c             | 2 +-
 net/key/af_key.c               | 2 +-
 net/l2tp/l2tp_ip.c             | 2 +-
 net/l2tp/l2tp_ip6.c            | 2 +-
 net/l2tp/l2tp_ppp.c            | 2 +-
 net/llc/af_llc.c               | 3 +--
 net/netlink/af_netlink.c       | 2 +-
 net/netrom/af_netrom.c         | 2 +-
 net/nfc/llcp_sock.c            | 2 +-
 net/nfc/rawsock.c              | 2 +-
 net/packet/af_packet.c         | 2 +-
 net/phonet/datagram.c          | 2 +-
 net/phonet/pep.c               | 2 +-
 net/rose/af_rose.c             | 2 +-
 net/rxrpc/ar-recvmsg.c         | 2 +-
 net/sctp/socket.c              | 2 +-
 net/tipc/socket.c              | 7 +++----
 net/unix/af_unix.c             | 6 +++---
 net/vmw_vsock/vmci_transport.c | 3 +--
 net/x25/af_x25.c               | 2 +-
 43 files changed, 58 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 1be82284cf9d..dcbd8589f0c4 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -163,7 +163,7 @@ mISDN_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb),
 	       MISDN_HEADER_LEN);
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	mISDN_sock_cmsg(sk, msg, skb);
 
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 6c9c16d76935..443cbbf5c55f 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -981,7 +981,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 	if (skb) {
 		total_len = min_t(size_t, total_len, skb->len);
-		error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len);
+		error = skb_copy_datagram_msg(skb, 0, m, total_len);
 		if (error == 0) {
 			consume_skb(skb);
 			return total_len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 74ed34413969..39ec7530ae27 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -21,6 +21,7 @@
 #include <linux/bug.h>
 #include <linux/cache.h>
 #include <linux/rbtree.h>
+#include <linux/socket.h>
 
 #include <linux/atomic.h>
 #include <asm/types.h>
@@ -2639,6 +2640,11 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait);
 int skb_copy_datagram_iovec(const struct sk_buff *from, int offset,
 			    struct iovec *to, int size);
+static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
+					struct msghdr *msg, int size)
+{
+	return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size);
+}
 int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen,
 				     struct iovec *iov);
 int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index c00897f65a31..425942db17f6 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1758,7 +1758,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
 		copied = size;
 		msg->msg_flags |= MSG_TRUNC;
 	}
-	err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, offset, msg, copied);
 
 	if (!err && msg->msg_name) {
 		DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name);
diff --git a/net/atm/common.c b/net/atm/common.c
index 6a765156a3f6..9cd1ccae9a11 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	error = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (error)
 		return error;
 	sock_recv_ts_and_drops(msg, sk, skb);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index c35c3f48fc0f..f4f835e19378 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		ax25_digi digi;
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 339c74ad4553..0a7cc565f93e 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err == 0) {
 		sock_recv_ts_and_drops(msg, sk, skb);
 
@@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		}
 
 		chunk = min_t(unsigned int, skb->len, size);
-		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) {
+		if (skb_copy_datagram_msg(skb, 0, msg, chunk)) {
 			skb_queue_head(&sk->sk_receive_queue, skb);
 			if (!copied)
 				copied = -EFAULT;
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 115f149362ba..29e1ec7189bd 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	switch (hci_pi(sk)->channel) {
 	case HCI_CHANNEL_RAW:
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 43f750e88e19..fbcd156099fb 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copylen = len;
 	}
 
-	ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen);
+	ret = skb_copy_datagram_msg(skb, 0, m, copylen);
 	if (ret)
 		goto out_free;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 15e0c67b1069..ac56dd06c306 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 5ab6627cf370..8e6ae9422a7b 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -896,7 +896,7 @@ verify_sock_status:
 		else if (len < skb->len)
 			msg->msg_flags |= MSG_TRUNC;
 
-		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
+		if (skb_copy_datagram_msg(skb, 0, msg, len)) {
 			/* Exception. Bailout! */
 			len = -EFAULT;
 			break;
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index ef2ad8aaef13..fc9193eabd41 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	/* FIXME: skip headers if necessary ?! */
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c
index 9d1f64806f02..73a4d53463de 100644
--- a/net/ieee802154/raw.c
+++ b/net/ieee802154/raw.c
@@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c373a9ad4555..21894df66262 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 57f7c9804139..736236c3e554 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	}
 
 	/* Don't bother checking the checksum */
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 739db3100c23..ee8fa4bf3b73 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 39ec0c379545..c239f4740d10 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 	/* XXX -- need to support SO_PEEK_OFF */
 
 	skb_queue_walk(&sk->sk_write_queue, skb) {
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
 		if (err)
 			break;
 
@@ -1833,8 +1833,7 @@ do_prequeue:
 		}
 
 		if (!(flags & MSG_TRUNC)) {
-			err = skb_copy_datagram_iovec(skb, offset,
-						      msg->msg_iov, used);
+			err = skb_copy_datagram_msg(skb, offset, msg, used);
 			if (err) {
 				/* Exception. Bailout! */
 				if (!copied)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3f001db72351..df19027f44f3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1281,8 +1281,8 @@ try_again:
 	}
 
 	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
-					      msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+					    msg, copied);
 	else {
 		err = skb_copy_and_csum_datagram_iovec(skb,
 						       sizeof(struct udphdr),
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2cdc38338be3..5c6996e44b14 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
@@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 075a0fb400e7..0cbcf98f2cab 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	if (skb_csum_unnecessary(skb)) {
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	} else if (msg->msg_flags&MSG_TRUNC) {
 		if (__skb_checksum_complete(skb))
 			goto csum_copy_err;
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	} else {
 		err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov);
 		if (err == -EINVAL)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f6ba535b6feb..9b6809232b17 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -424,8 +424,8 @@ try_again:
 	}
 
 	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
-					      msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+					    msg, copied);
 	else {
 		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
 		if (err == -EINVAL)
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 313ef4644069..a0c75366c93b 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1805,8 +1805,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov,
-				     copied);
+	rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied);
 	if (rc)
 		goto out_free;
 	if (skb->tstamp.tv64)
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 92fafd485deb..980bc2670a13 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1396,7 +1396,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
 		copied = size;
 		msg->msg_flags |= MSG_TRUNC;
 	}
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index a089b6b91650..057b5647ef92 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1355,7 +1355,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 		sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN;
 
 	cskb = skb;
-	if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) {
+	if (skb_copy_datagram_msg(cskb, offset, msg, copied)) {
 		if (!(flags & MSG_PEEK))
 			skb_queue_head(&sk->sk_receive_queue, skb);
 		return -EFAULT;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 1847ec4e3930..e5883091a8c6 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free;
 
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 369a9822488c..a6cc1fed2b52 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 0edb263cc002..2177b960da87 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index b704a9356208..c559bcdf4679 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock,
 	else if (len < skb->len)
 		msg->msg_flags |= MSG_TRUNC;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	err = skb_copy_datagram_msg(skb, 0, msg, len);
 	if (likely(err == 0))
 		err = len;
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index bb9cbc17d926..af662669f951 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -819,8 +819,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 			used = len;
 
 		if (!(flags & MSG_TRUNC)) {
-			int rc = skb_copy_datagram_iovec(skb, offset,
-							 msg->msg_iov, used);
+			int rc = skb_copy_datagram_msg(skb, offset, msg, used);
 			if (rc) {
 				/* Exception. Bailout! */
 				if (!copied)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1de72de273e..580b79452bec 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(data_skb);
-	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 1b06a1fcf3e8..7e13f6afcd1f 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	er = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (er < 0) {
 		skb_free_datagram(sk, skb);
 		release_sock(sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 51f077a92fa9..83bc785d5855 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	copied = min_t(unsigned int, rlen, len);
 
 	cskb = skb;
-	if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) {
+	if (skb_copy_datagram_msg(cskb, 0, msg, copied)) {
 		if (!(flags & MSG_PEEK))
 			skb_queue_head(&sk->sk_receive_queue, skb);
 		return -EFAULT;
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 11c3544ea546..9d7d2b7ba5e4 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copied = len;
 	}
 
-	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	rc = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 87d20f48ff06..4cd13d8de44b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2953,7 +2953,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free;
 
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 290352c0e6b4..0918bc21eae6 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk,
 		copylen = len;
 	}
 
-	rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen);
+	rval = skb_copy_datagram_msg(skb, 0, msg, copylen);
 	if (rval) {
 		rval = -EFAULT;
 		goto out;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 70a547ea5177..44b2123e22b8 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -1296,7 +1296,7 @@ copy:
 	else
 		len = skb->len;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	err = skb_copy_datagram_msg(skb, 0, msg, len);
 	if (!err)
 		err = (flags & MSG_TRUNC) ? skb->len : len;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index a85c1a086ae4..9b600c20a7a3 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		struct sockaddr_rose *srose;
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index e9aaa65c0778..4575485ad1b4 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -180,7 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
 		if (copy > len - copied)
 			copy = len - copied;
 
-		ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy);
+		ret = skb_copy_datagram_msg(skb, offset, msg, copy);
 
 		if (ret < 0)
 			goto copy_error;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 634a2abb5f3a..2120292c842d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (copied > len)
 		copied = len;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	event = sctp_skb2event(skb);
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index ad8a1a1e2275..591bbfa082a0 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1372,8 +1372,7 @@ restart:
 			sz = buf_len;
 			m->msg_flags |= MSG_TRUNC;
 		}
-		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg),
-					      m->msg_iov, sz);
+		res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz);
 		if (res)
 			goto exit;
 		res = sz;
@@ -1473,8 +1472,8 @@ restart:
 		needed = (buf_len - sz_copied);
 		sz_to_copy = (sz <= needed) ? sz : needed;
 
-		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset,
-					      m->msg_iov, sz_to_copy);
+		res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset,
+					    m, sz_to_copy);
 		if (res)
 			goto exit;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e96884380732..5eee625d113f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1825,7 +1825,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	else if (size < skb->len - skip)
 		msg->msg_flags |= MSG_TRUNC;
 
-	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
+	err = skb_copy_datagram_msg(skb, skip, msg, size);
 	if (err)
 		goto out_free;
 
@@ -2030,8 +2030,8 @@ again:
 		}
 
 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
-		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
-					    msg->msg_iov, chunk)) {
+		if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
+					  msg, chunk)) {
 			if (copied == 0)
 				copied = -EFAULT;
 			break;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 9bb63ffec4f2..a57ddef7d5af 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1773,8 +1773,7 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb,
 	}
 
 	/* Place the datagram payload in the user's iovec. */
-	err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov,
-		payload_len);
+	err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len);
 	if (err)
 		goto out;
 
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5ad4418ef093..59e785bfde65 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
 	/* Currently, each datagram always contains a complete record */
 	msg->msg_flags |= MSG_EOR;
 
-	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	rc = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (rc)
 		goto out_free_dgram;
 
-- 
cgit v1.2.3-58-ga151


From 25de4668d094f00e44a8f2428dd3c1a4ecfa0053 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 4 Nov 2014 10:59:47 -0800
Subject: ipv6: move INET6_MATCH() to include/net/inet6_hashtables.h

It is only used in net/ipv6/inet6_hashtables.c.

Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h           | 10 ----------
 include/net/inet6_hashtables.h | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7121a2e97ce2..c694e7baa621 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -317,14 +317,4 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 #define tcp_twsk_ipv6only(__sk)		0
 #define inet_v6_ipv6only(__sk)		0
 #endif /* IS_ENABLED(CONFIG_IPV6) */
-
-#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif)	\
-	(((__sk)->sk_portpair == (__ports))			&&	\
-	 ((__sk)->sk_family == AF_INET6)			&&	\
-	 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))		&&	\
-	 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))	&&	\
-	 (!(__sk)->sk_bound_dev_if	||				\
-	   ((__sk)->sk_bound_dev_if == (__dif))) 		&&	\
-	 net_eq(sock_net(__sk), (__net)))
-
 #endif /* _IPV6_H */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index d1d272843b3b..9201afe083fa 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -99,4 +99,14 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 			  const struct in6_addr *daddr, const __be16 dport,
 			  const int dif);
 #endif /* IS_ENABLED(CONFIG_IPV6) */
+
+#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif)	\
+	(((__sk)->sk_portpair == (__ports))			&&	\
+	 ((__sk)->sk_family == AF_INET6)			&&	\
+	 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))		&&	\
+	 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))	&&	\
+	 (!(__sk)->sk_bound_dev_if	||				\
+	   ((__sk)->sk_bound_dev_if == (__dif))) 		&&	\
+	 net_eq(sock_net(__sk), (__net)))
+
 #endif /* _INET6_HASHTABLES_H */
-- 
cgit v1.2.3-58-ga151


From e5a2c899957659cd1a9f789bc462f9c0b35f5150 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 5 Nov 2014 00:23:04 +0100
Subject: fast_hash: avoid indirect function calls

By default the arch_fast_hash hashing function pointers are initialized
to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
updated to the CRC32 ones. This dispatching scheme incurs a function
pointer lookup and indirect call for every hashing operation.

rhashtable as a user of arch_fast_hash e.g. stores pointers to hashing
functions in its structure, too, causing two indirect branches per
hashing operation.

Using alternative_call we can get away with one of those indirect branches.

Acked-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/include/asm/hash.h | 51 ++++++++++++++++++++++++++++++++++++++++-----
 arch/x86/lib/hash.c         | 29 +++++++++++++++-----------
 include/asm-generic/hash.h  | 36 ++++++++++++++++++++++++++++++--
 include/linux/hash.h        | 34 ------------------------------
 lib/Makefile                |  2 +-
 lib/hash.c                  | 39 ----------------------------------
 6 files changed, 98 insertions(+), 93 deletions(-)
 delete mode 100644 lib/hash.c

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
index e8c58f88b1d4..a881d784f044 100644
--- a/arch/x86/include/asm/hash.h
+++ b/arch/x86/include/asm/hash.h
@@ -1,7 +1,48 @@
-#ifndef _ASM_X86_HASH_H
-#define _ASM_X86_HASH_H
+#ifndef __ASM_X86_HASH_H
+#define __ASM_X86_HASH_H
 
-struct fast_hash_ops;
-extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
+#include <linux/cpufeature.h>
+#include <asm/alternative.h>
 
-#endif /* _ASM_X86_HASH_H */
+u32 __intel_crc4_2_hash(const void *data, u32 len, u32 seed);
+u32 __intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed);
+
+/*
+ * non-inline versions of jhash so gcc does not need to generate
+ * duplicate code in every object file
+ */
+u32 __jhash(const void *data, u32 len, u32 seed);
+u32 __jhash2(const u32 *data, u32 len, u32 seed);
+
+/*
+ * for documentation of these functions please look into
+ * <include/asm-generic/hash.h>
+ */
+
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	u32 hash;
+
+	alternative_call(__jhash, __intel_crc4_2_hash, X86_FEATURE_XMM4_2,
+#ifdef CONFIG_X86_64
+			 "=a" (hash), "D" (data), "S" (len), "d" (seed));
+#else
+			 "=a" (hash), "a" (data), "d" (len), "c" (seed));
+#endif
+	return hash;
+}
+
+static inline u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
+{
+	u32 hash;
+
+	alternative_call(__jhash2, __intel_crc4_2_hash2, X86_FEATURE_XMM4_2,
+#ifdef CONFIG_X86_64
+			 "=a" (hash), "D" (data), "S" (len), "d" (seed));
+#else
+			 "=a" (hash), "a" (data), "d" (len), "c" (seed));
+#endif
+	return hash;
+}
+
+#endif /* __ASM_X86_HASH_H */
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
index ff4fa51a5b1f..e14327198835 100644
--- a/arch/x86/lib/hash.c
+++ b/arch/x86/lib/hash.c
@@ -31,13 +31,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <linux/hash.h>
-#include <linux/init.h>
-
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
 #include <asm/hash.h>
 
+#include <linux/hash.h>
+#include <linux/jhash.h>
+
 static inline u32 crc32_u32(u32 crc, u32 val)
 {
 #ifdef CONFIG_AS_CRC32
@@ -48,7 +48,7 @@ static inline u32 crc32_u32(u32 crc, u32 val)
 	return crc;
 }
 
-static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
+u32 __intel_crc4_2_hash(const void *data, u32 len, u32 seed)
 {
 	const u32 *p32 = (const u32 *) data;
 	u32 i, tmp = 0;
@@ -71,22 +71,27 @@ static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
 
 	return seed;
 }
+EXPORT_SYMBOL(__intel_crc4_2_hash);
 
-static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
+u32 __intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
 {
-	const u32 *p32 = (const u32 *) data;
 	u32 i;
 
 	for (i = 0; i < len; i++)
-		seed = crc32_u32(seed, *p32++);
+		seed = crc32_u32(seed, *data++);
 
 	return seed;
 }
+EXPORT_SYMBOL(__intel_crc4_2_hash2);
 
-void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
+u32 __jhash(const void *data, u32 len, u32 seed)
 {
-	if (cpu_has_xmm4_2) {
-		ops->hash  = intel_crc4_2_hash;
-		ops->hash2 = intel_crc4_2_hash2;
-	}
+	return jhash(data, len, seed);
+}
+EXPORT_SYMBOL(__jhash);
+
+u32 __jhash2(const u32 *data, u32 len, u32 seed)
+{
+	return jhash2(data, len, seed);
 }
+EXPORT_SYMBOL(__jhash2);
diff --git a/include/asm-generic/hash.h b/include/asm-generic/hash.h
index b6312843dbd9..3c82760ff2a4 100644
--- a/include/asm-generic/hash.h
+++ b/include/asm-generic/hash.h
@@ -1,9 +1,41 @@
 #ifndef __ASM_GENERIC_HASH_H
 #define __ASM_GENERIC_HASH_H
 
-struct fast_hash_ops;
-static inline void setup_arch_fast_hash(struct fast_hash_ops *ops)
+#include <linux/jhash.h>
+
+/**
+ *	arch_fast_hash - Caclulates a hash over a given buffer that can have
+ *			 arbitrary size. This function will eventually use an
+ *			 architecture-optimized hashing implementation if
+ *			 available, and trades off distribution for speed.
+ *
+ *	@data: buffer to hash
+ *	@len: length of buffer in bytes
+ *	@seed: start seed
+ *
+ *	Returns 32bit hash.
+ */
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	return jhash(data, len, seed);
+}
+
+/**
+ *	arch_fast_hash2 - Caclulates a hash over a given buffer that has a
+ *			  size that is of a multiple of 32bit words. This
+ *			  function will eventually use an architecture-
+ *			  optimized hashing implementation if available,
+ *			  and trades off distribution for speed.
+ *
+ *	@data: buffer to hash (must be 32bit padded)
+ *	@len: number of 32bit words
+ *	@seed: start seed
+ *
+ *	Returns 32bit hash.
+ */
+static inline u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
 {
+	return jhash2(data, len, seed);
 }
 
 #endif /* __ASM_GENERIC_HASH_H */
diff --git a/include/linux/hash.h b/include/linux/hash.h
index d0494c399392..6e8fb028848c 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -84,38 +84,4 @@ static inline u32 hash32_ptr(const void *ptr)
 	return (u32)val;
 }
 
-struct fast_hash_ops {
-	u32 (*hash)(const void *data, u32 len, u32 seed);
-	u32 (*hash2)(const u32 *data, u32 len, u32 seed);
-};
-
-/**
- *	arch_fast_hash - Caclulates a hash over a given buffer that can have
- *			 arbitrary size. This function will eventually use an
- *			 architecture-optimized hashing implementation if
- *			 available, and trades off distribution for speed.
- *
- *	@data: buffer to hash
- *	@len: length of buffer in bytes
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-extern u32 arch_fast_hash(const void *data, u32 len, u32 seed);
-
-/**
- *	arch_fast_hash2 - Caclulates a hash over a given buffer that has a
- *			  size that is of a multiple of 32bit words. This
- *			  function will eventually use an architecture-
- *			  optimized hashing implementation if available,
- *			  and trades off distribution for speed.
- *
- *	@data: buffer to hash (must be 32bit padded)
- *	@len: number of 32bit words
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-extern u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed);
-
 #endif /* _LINUX_HASH_H */
diff --git a/lib/Makefile b/lib/Makefile
index 7512dc978f18..04e53dd16070 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o percpu_ida.o hash.o rhashtable.o
+	 percpu-refcount.o percpu_ida.o rhashtable.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
diff --git a/lib/hash.c b/lib/hash.c
deleted file mode 100644
index fea973f4bd57..000000000000
--- a/lib/hash.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* General purpose hashing library
- *
- * That's a start of a kernel hashing library, which can be extended
- * with further algorithms in future. arch_fast_hash{2,}() will
- * eventually resolve to an architecture optimized implementation.
- *
- * Copyright 2013 Francesco Fusco <ffusco@redhat.com>
- * Copyright 2013 Daniel Borkmann <dborkman@redhat.com>
- * Copyright 2013 Thomas Graf <tgraf@redhat.com>
- * Licensed under the GNU General Public License, version 2.0 (GPLv2)
- */
-
-#include <linux/jhash.h>
-#include <linux/hash.h>
-#include <linux/cache.h>
-
-static struct fast_hash_ops arch_hash_ops __read_mostly = {
-	.hash  = jhash,
-	.hash2 = jhash2,
-};
-
-u32 arch_fast_hash(const void *data, u32 len, u32 seed)
-{
-	return arch_hash_ops.hash(data, len, seed);
-}
-EXPORT_SYMBOL_GPL(arch_fast_hash);
-
-u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
-{
-	return arch_hash_ops.hash2(data, len, seed);
-}
-EXPORT_SYMBOL_GPL(arch_fast_hash2);
-
-static int __init hashlib_init(void)
-{
-	setup_arch_fast_hash(&arch_hash_ops);
-	return 0;
-}
-early_initcall(hashlib_init);
-- 
cgit v1.2.3-58-ga151


From 59b93b41e7fa71138734a911b11b044340dd16bd Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Wed, 5 Nov 2014 15:27:48 -0800
Subject: net: Remove MPLS GSO feature.

Device can export MPLS GSO support in dev->mpls_features same way
it export vlan features in dev->vlan_features. So it is safe to
remove NETIF_F_GSO_MPLS redundant flag.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 include/linux/netdev_features.h | 5 +----
 include/linux/netdevice.h       | 1 -
 include/linux/skbuff.h          | 4 +---
 net/core/ethtool.c              | 1 -
 net/ipv4/af_inet.c              | 1 -
 net/ipv4/tcp_offload.c          | 1 -
 net/ipv4/udp_offload.c          | 3 +--
 net/ipv6/ip6_offload.c          | 1 -
 net/ipv6/udp_offload.c          | 3 +--
 net/mpls/mpls_gso.c             | 3 +--
 10 files changed, 5 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 8c94b07e654a..8e30685affeb 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -47,7 +47,6 @@ enum {
 	NETIF_F_GSO_SIT_BIT,		/* ... SIT tunnel with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_BIT,	/* ... UDP TUNNEL with TSO */
 	NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
-	NETIF_F_GSO_MPLS_BIT,		/* ... MPLS segmentation */
 	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
 		NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -119,7 +118,6 @@ enum {
 #define NETIF_F_GSO_SIT		__NETIF_F(GSO_SIT)
 #define NETIF_F_GSO_UDP_TUNNEL	__NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
-#define NETIF_F_GSO_MPLS	__NETIF_F(GSO_MPLS)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
@@ -183,7 +181,6 @@ enum {
 				 NETIF_F_GSO_IPIP |			\
 				 NETIF_F_GSO_SIT |			\
 				 NETIF_F_GSO_UDP_TUNNEL |		\
-				 NETIF_F_GSO_UDP_TUNNEL_CSUM |		\
-				 NETIF_F_GSO_MPLS)
+				 NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
 #endif	/* _LINUX_NETDEV_FEATURES_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4767f546d7c0..90ac95900a11 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3583,7 +3583,6 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_SIT     != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
-	BUILD_BUG_ON(SKB_GSO_MPLS    != (NETIF_F_GSO_MPLS >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 39ec7530ae27..53f4f6c93356 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -372,9 +372,7 @@ enum {
 
 	SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
 
-	SKB_GSO_MPLS = 1 << 12,
-
-	SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
+	SKB_GSO_TUNNEL_REMCSUM = 1 << 12,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 06dfb293e5aa..b0f84f5ddda8 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -84,7 +84,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_GSO_IPIP_BIT] =	 "tx-ipip-segmentation",
 	[NETIF_F_GSO_SIT_BIT] =		 "tx-sit-segmentation",
 	[NETIF_F_GSO_UDP_TUNNEL_BIT] =	 "tx-udp_tnl-segmentation",
-	[NETIF_F_GSO_MPLS_BIT] =	 "tx-mpls-segmentation",
 
 	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
 	[NETIF_F_SCTP_CSUM_BIT] =        "tx-checksum-sctp",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ed2c672c5b01..3a096bb2d596 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1223,7 +1223,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
-		       SKB_GSO_MPLS |
 		       0)))
 		goto out;
 
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a1b2a5624f91..9d7930ba8e0f 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -94,7 +94,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 			       SKB_GSO_GRE_CSUM |
 			       SKB_GSO_IPIP |
 			       SKB_GSO_SIT |
-			       SKB_GSO_MPLS |
 			       SKB_GSO_UDP_TUNNEL |
 			       SKB_GSO_UDP_TUNNEL_CSUM |
 			       SKB_GSO_TUNNEL_REMCSUM |
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0a5a70d0e84c..d3e537ef6b7f 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -207,8 +207,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 				      SKB_GSO_UDP_TUNNEL_CSUM |
 				      SKB_GSO_TUNNEL_REMCSUM |
 				      SKB_GSO_IPIP |
-				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
-				      SKB_GSO_MPLS) ||
+				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
 			     !(type & (SKB_GSO_UDP))))
 			goto out;
 
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index e9767079a360..fd76ce938c32 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -79,7 +79,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_UDP_TUNNEL |
 		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_TUNNEL_REMCSUM |
-		       SKB_GSO_MPLS |
 		       SKB_GSO_TCPV6 |
 		       0)))
 		goto out;
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 637ba2e438b7..b6aa8ed18257 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -46,8 +46,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 				      SKB_GSO_GRE |
 				      SKB_GSO_GRE_CSUM |
 				      SKB_GSO_IPIP |
-				      SKB_GSO_SIT |
-				      SKB_GSO_MPLS) ||
+				      SKB_GSO_SIT) ||
 			     !(type & (SKB_GSO_UDP))))
 			goto out;
 
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index e3545f21a099..ca27837974fe 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -34,8 +34,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 				  SKB_GSO_TCP_ECN |
 				  SKB_GSO_GRE |
 				  SKB_GSO_GRE_CSUM |
-				  SKB_GSO_IPIP |
-				  SKB_GSO_MPLS)))
+				  SKB_GSO_IPIP)))
 		goto out;
 
 	/* Setup inner SKB. */
-- 
cgit v1.2.3-58-ga151


From a8f820aa4066d2c97e75ecd1bbca8a7920b66f2c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 7 Nov 2014 21:22:22 +0800
Subject: inet: Add skb_copy_datagram_iter

This patch adds skb_copy_datagram_iter, which is identical to
skb_copy_datagram_iovec except that it operates on iov_iter
instead of iovec.

Eventually all users of skb_copy_datagram_iovec should switch
over to iov_iter and then we can remove skb_copy_datagram_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  3 ++
 net/core/datagram.c    | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 53f4f6c93356..933cfce7fcd9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -150,6 +150,7 @@
 struct net_device;
 struct scatterlist;
 struct pipe_inode_info;
+struct iov_iter;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -2653,6 +2654,8 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
 				  const struct iovec *to, int to_offset,
 				  int size);
+int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
+			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a81d4c2..84d90d087a30 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -49,6 +49,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/uio.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -481,6 +482,92 @@ fault:
 }
 EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
 
+/**
+ *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: iovec iterator to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+
+	trace_skb_copy_datagram_iovec(skb, len);
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (copy_to_iter(skb->data + offset, copy, to) != copy)
+			goto short_copy;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_frag_size(frag);
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (copy_page_to_iter(skb_frag_page(frag),
+					      frag->page_offset + offset -
+					      start, copy, to) != copy)
+				goto short_copy;
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_iter(frag_iter, offset - start,
+						   to, copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+	/* This is not really a user copy fault, but rather someone
+	 * gave us a bogus length on the skb.  We should probably
+	 * print a warning here as it may indicate a kernel bug.
+	 */
+
+fault:
+	return -EFAULT;
+
+short_copy:
+	if (iov_iter_count(to))
+		goto fault;
+
+	return 0;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iter);
+
 /**
  *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
  *	@skb: buffer to copy
-- 
cgit v1.2.3-58-ga151


From bfe1be38fcee0e13ad53175d0b530707c20f93ec Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 7 Nov 2014 21:22:26 +0800
Subject: net: Kill skb_copy_datagram_const_iovec

Now that both macvtap and tun are using skb_copy_datagram_iter, we
can kill the abomination that is skb_copy_datagram_const_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  3 --
 net/core/datagram.c    | 89 --------------------------------------------------
 2 files changed, 92 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 933cfce7fcd9..103fbe8113f8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2651,9 +2651,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 int len);
 int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 			   int offset, size_t count);
-int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
-				  const struct iovec *to, int to_offset,
-				  int size);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 84d90d087a30..26391a3fe3e5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -393,95 +393,6 @@ fault:
 }
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 
-/**
- *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
- *	@skb: buffer to copy
- *	@offset: offset in the buffer to start copying from
- *	@to: io vector to copy to
- *	@to_offset: offset in the io vector to start copying to
- *	@len: amount of data to copy from buffer to iovec
- *
- *	Returns 0 or -EFAULT.
- *	Note: the iovec is not modified during the copy.
- */
-int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
-				  const struct iovec *to, int to_offset,
-				  int len)
-{
-	int start = skb_headlen(skb);
-	int i, copy = start - offset;
-	struct sk_buff *frag_iter;
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		offset += copy;
-		to_offset += copy;
-	}
-
-	/* Copy paged appendix. Hmm... why does this look so complicated? */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			int err;
-			u8  *vaddr;
-			struct page *page = skb_frag_page(frag);
-
-			if (copy > len)
-				copy = len;
-			vaddr = kmap(page);
-			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
-						offset - start, to_offset, copy);
-			kunmap(page);
-			if (err)
-				goto fault;
-			if (!(len -= copy))
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			if (copy > len)
-				copy = len;
-			if (skb_copy_datagram_const_iovec(frag_iter,
-							  offset - start,
-							  to, to_offset,
-							  copy))
-				goto fault;
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
-
 /**
  *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
  *	@skb: buffer to copy
-- 
cgit v1.2.3-58-ga151


From 3b47d30396bae4f0bd1ff0dbcd7c4f5077e7df4e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 6 Nov 2014 21:09:44 -0800
Subject: net: gro: add a per device gro flush timer

Tuning coalescing parameters on NIC can be really hard.

Servers can handle both bulk and RPC like traffic, with conflicting
goals : bulk flows want as big GRO packets as possible, RPC want minimal
latencies.

To reach big GRO packets on 10Gbe NIC, one can use :

ethtool -C eth0 rx-usecs 4 rx-frames 44

But this penalizes rpc sessions, with an increase of latencies, up to
50% in some cases, as NICs generally do not force an interrupt when
a packet with TCP Push flag is received.

Some NICs do not have an absolute timer, only a timer rearmed for every
incoming packet.

This patch uses a different strategy : Let GRO stack decides what do do,
based on traffic pattern.

Packets with Push flag wont be delayed.
Packets without Push flag might be held in GRO engine, if we keep
receiving data.

This new mechanism is off by default, and shall be enabled by setting
/sys/class/net/ethX/gro_flush_timeout to a value in nanosecond.

To fully enable this mechanism, drivers should use napi_complete_done()
instead of napi_complete().

Tested:
 Ran 200 netperf TCP_STREAM from A to B (10Gbe mlx4 link, 8 RX queues)

Without this feature, we send back about 305,000 ACK per second.

GRO aggregation ratio is low (811/305 = 2.65 segments per GRO packet)

Setting a timer of 2000 nsec is enough to increase GRO packet sizes
and reduce number of ACK packets. (811/19.2 = 42)

Receiver performs less calls to upper stacks, less wakes up.
This also reduces cpu usage on the sender, as it receives less ACK
packets.

Note that reducing number of wakes up increases cpu efficiency, but can
decrease QPS, as applications wont have the chance to warmup cpu caches
doing a partial read of RPC requests/answers if they fit in one skb.

B:~# sar -n DEV 1 10 | grep eth0 | tail -1
Average:         eth0 811269.80 305732.30 1199462.57  19705.72      0.00
0.00      0.50

B:~# echo 2000 >/sys/class/net/eth0/gro_flush_timeout

B:~# sar -n DEV 1 10 | grep eth0 | tail -1
Average:         eth0 811577.30  19230.80 1199916.51   1239.80      0.00
0.00      0.50

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 20 ++++++++++----------
 net/core/dev.c            | 45 +++++++++++++++++++++++++++++++++++++++++----
 net/core/net-sysfs.c      | 18 ++++++++++++++++++
 3 files changed, 69 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 90ac95900a11..888d5513fa4a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -314,6 +314,7 @@ struct napi_struct {
 	struct net_device	*dev;
 	struct sk_buff		*gro_list;
 	struct sk_buff		*skb;
+	struct hrtimer		timer;
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
@@ -443,14 +444,19 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
+void __napi_complete(struct napi_struct *n);
+void napi_complete_done(struct napi_struct *n, int work_done);
 /**
  *	napi_complete - NAPI processing complete
  *	@n: napi context
  *
  * Mark NAPI processing as complete.
+ * Consider using napi_complete_done() instead.
  */
-void __napi_complete(struct napi_struct *n);
-void napi_complete(struct napi_struct *n);
+static inline void napi_complete(struct napi_struct *n)
+{
+	return napi_complete_done(n, 0);
+}
 
 /**
  *	napi_by_id - lookup a NAPI by napi_id
@@ -485,14 +491,7 @@ void napi_hash_del(struct napi_struct *napi);
  * Stop NAPI from being scheduled on this context.
  * Waits till any outstanding processing completes.
  */
-static inline void napi_disable(struct napi_struct *n)
-{
-	might_sleep();
-	set_bit(NAPI_STATE_DISABLE, &n->state);
-	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
-		msleep(1);
-	clear_bit(NAPI_STATE_DISABLE, &n->state);
-}
+void napi_disable(struct napi_struct *n);
 
 /**
  *	napi_enable - enable NAPI scheduling
@@ -1603,6 +1602,7 @@ struct net_device {
 
 #endif
 
+	unsigned long		gro_flush_timeout;
 	rx_handler_func_t __rcu	*rx_handler;
 	void __rcu		*rx_handler_data;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 70bb609c283d..bb09b0364619 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -134,6 +134,7 @@
 #include <linux/vmalloc.h>
 #include <linux/if_macvlan.h>
 #include <linux/errqueue.h>
+#include <linux/hrtimer.h>
 
 #include "net-sysfs.h"
 
@@ -4412,7 +4413,6 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
 void __napi_complete(struct napi_struct *n)
 {
 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-	BUG_ON(n->gro_list);
 
 	list_del_init(&n->poll_list);
 	smp_mb__before_atomic();
@@ -4420,7 +4420,7 @@ void __napi_complete(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_complete);
 
-void napi_complete(struct napi_struct *n)
+void napi_complete_done(struct napi_struct *n, int work_done)
 {
 	unsigned long flags;
 
@@ -4431,8 +4431,18 @@ void napi_complete(struct napi_struct *n)
 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
 		return;
 
-	napi_gro_flush(n, false);
+	if (n->gro_list) {
+		unsigned long timeout = 0;
 
+		if (work_done)
+			timeout = n->dev->gro_flush_timeout;
+
+		if (timeout)
+			hrtimer_start(&n->timer, ns_to_ktime(timeout),
+				      HRTIMER_MODE_REL_PINNED);
+		else
+			napi_gro_flush(n, false);
+	}
 	if (likely(list_empty(&n->poll_list))) {
 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
 	} else {
@@ -4442,7 +4452,7 @@ void napi_complete(struct napi_struct *n)
 		local_irq_restore(flags);
 	}
 }
-EXPORT_SYMBOL(napi_complete);
+EXPORT_SYMBOL(napi_complete_done);
 
 /* must be called under rcu_read_lock(), as we dont take a reference */
 struct napi_struct *napi_by_id(unsigned int napi_id)
@@ -4496,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL_GPL(napi_hash_del);
 
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
+{
+	struct napi_struct *napi;
+
+	napi = container_of(timer, struct napi_struct, timer);
+	if (napi->gro_list)
+		napi_schedule(napi);
+
+	return HRTIMER_NORESTART;
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
 	INIT_LIST_HEAD(&napi->poll_list);
+	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	napi->timer.function = napi_watchdog;
 	napi->gro_count = 0;
 	napi->gro_list = NULL;
 	napi->skb = NULL;
@@ -4518,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 }
 EXPORT_SYMBOL(netif_napi_add);
 
+void napi_disable(struct napi_struct *n)
+{
+	might_sleep();
+	set_bit(NAPI_STATE_DISABLE, &n->state);
+
+	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
+		msleep(1);
+
+	hrtimer_cancel(&n->timer);
+
+	clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable);
+
 void netif_napi_del(struct napi_struct *napi)
 {
 	list_del_init(&napi->dev_list);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 9dd06699b09c..1a24602cd54e 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -325,6 +325,23 @@ static ssize_t tx_queue_len_store(struct device *dev,
 }
 NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
 
+static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
+{
+	dev->gro_flush_timeout = val;
+	return 0;
+}
+
+static ssize_t gro_flush_timeout_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+}
+NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+
 static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
 			     const char *buf, size_t len)
 {
@@ -422,6 +439,7 @@ static struct attribute *net_class_attrs[] = {
 	&dev_attr_mtu.attr,
 	&dev_attr_flags.attr,
 	&dev_attr_tx_queue_len.attr,
+	&dev_attr_gro_flush_timeout.attr,
 	&dev_attr_phys_port_id.attr,
 	NULL,
 };
-- 
cgit v1.2.3-58-ga151


From f8c6455bb04b944edb69e9b074e28efee2c56bdd Mon Sep 17 00:00:00 2001
From: Shani Michaeli <shanim@mellanox.com>
Date: Sun, 9 Nov 2014 13:51:53 +0200
Subject: net/mlx4_en: Extend checksum offloading by CHECKSUM COMPLETE

When processing received traffic, pass CHECKSUM_COMPLETE status to the
stack, with calculated checksum for non TCP/UDP packets (such
as GRE or ICMP).

Although the stack expects checksum which doesn't include the pseudo
header, the HW adds it. To address that, we are subtracting the pseudo
header checksum from the checksum value provided by the HW.

In the IPv6 case, we also compute/add the IP header checksum which
is not added by the HW for such packets.

Cc: Jerry Chu <hkchu@google.com>
Signed-off-by: Shani Michaeli <shanim@mellanox.com>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |   2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |   5 +
 drivers/net/ethernet/mellanox/mlx4/en_port.c    |   2 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c      | 126 ++++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/main.c       |   9 ++
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |   5 +-
 include/linux/mlx4/device.h                     |   1 +
 7 files changed, 142 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 8ea4d5be7376..6c643230a5ed 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -115,7 +115,7 @@ static const char main_strings[][ETH_GSTRING_LEN] = {
 	"tso_packets",
 	"xmit_more",
 	"queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed",
-	"rx_csum_good", "rx_csum_none", "tx_chksum_offload",
+	"rx_csum_good", "rx_csum_none", "rx_csum_complete", "tx_chksum_offload",
 
 	/* packet statistics */
 	"broadcast", "rx_prio_0", "rx_prio_1", "rx_prio_2", "rx_prio_3",
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 0efbae90f1ba..d1eb25dbff56 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1893,6 +1893,7 @@ static void mlx4_en_clear_stats(struct net_device *dev)
 		priv->rx_ring[i]->packets = 0;
 		priv->rx_ring[i]->csum_ok = 0;
 		priv->rx_ring[i]->csum_none = 0;
+		priv->rx_ring[i]->csum_complete = 0;
 	}
 }
 
@@ -2503,6 +2504,10 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 	/* Query for default mac and max mtu */
 	priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port];
 
+	if (mdev->dev->caps.rx_checksum_flags_port[priv->port] &
+	    MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP)
+		priv->flags |= MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP;
+
 	/* Set default MAC */
 	dev->addr_len = ETH_ALEN;
 	mlx4_en_u64_to_mac(dev->dev_addr, mdev->dev->caps.def_mac[priv->port]);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
index 134b12e17da5..6cb80072af6c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -155,11 +155,13 @@ int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
 	stats->rx_bytes = 0;
 	priv->port_stats.rx_chksum_good = 0;
 	priv->port_stats.rx_chksum_none = 0;
+	priv->port_stats.rx_chksum_complete = 0;
 	for (i = 0; i < priv->rx_ring_num; i++) {
 		stats->rx_packets += priv->rx_ring[i]->packets;
 		stats->rx_bytes += priv->rx_ring[i]->bytes;
 		priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok;
 		priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none;
+		priv->port_stats.rx_chksum_complete += priv->rx_ring[i]->csum_complete;
 	}
 	stats->tx_packets = 0;
 	stats->tx_bytes = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index df6d352c4e54..ccd95177ea7c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -42,6 +42,10 @@
 #include <linux/vmalloc.h>
 #include <linux/irq.h>
 
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_checksum.h>
+#endif
+
 #include "mlx4_en.h"
 
 static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
@@ -643,6 +647,86 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
 	}
 }
 
+/* When hardware doesn't strip the vlan, we need to calculate the checksum
+ * over it and add it to the hardware's checksum calculation
+ */
+static inline __wsum get_fixed_vlan_csum(__wsum hw_checksum,
+					 struct vlan_hdr *vlanh)
+{
+	return csum_add(hw_checksum, *(__wsum *)vlanh);
+}
+
+/* Although the stack expects checksum which doesn't include the pseudo
+ * header, the HW adds it. To address that, we are subtracting the pseudo
+ * header checksum from the checksum value provided by the HW.
+ */
+static void get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb,
+				struct iphdr *iph)
+{
+	__u16 length_for_csum = 0;
+	__wsum csum_pseudo_header = 0;
+
+	length_for_csum = (be16_to_cpu(iph->tot_len) - (iph->ihl << 2));
+	csum_pseudo_header = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+						length_for_csum, iph->protocol, 0);
+	skb->csum = csum_sub(hw_checksum, csum_pseudo_header);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* In IPv6 packets, besides subtracting the pseudo header checksum,
+ * we also compute/add the IP header checksum which
+ * is not added by the HW.
+ */
+static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb,
+			       struct ipv6hdr *ipv6h)
+{
+	__wsum csum_pseudo_hdr = 0;
+
+	if (ipv6h->nexthdr == IPPROTO_FRAGMENT || ipv6h->nexthdr == IPPROTO_HOPOPTS)
+		return -1;
+	hw_checksum = csum_add(hw_checksum, (__force __wsum)(ipv6h->nexthdr << 8));
+
+	csum_pseudo_hdr = csum_partial(&ipv6h->saddr,
+				       sizeof(ipv6h->saddr) + sizeof(ipv6h->daddr), 0);
+	csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ipv6h->payload_len);
+	csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ntohs(ipv6h->nexthdr));
+
+	skb->csum = csum_sub(hw_checksum, csum_pseudo_hdr);
+	skb->csum = csum_add(skb->csum, csum_partial(ipv6h, sizeof(struct ipv6hdr), 0));
+	return 0;
+}
+#endif
+static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
+		      int hwtstamp_rx_filter)
+{
+	__wsum hw_checksum = 0;
+
+	void *hdr = (u8 *)va + sizeof(struct ethhdr);
+
+	hw_checksum = csum_unfold((__force __sum16)cqe->checksum);
+
+	if (((struct ethhdr *)va)->h_proto == htons(ETH_P_8021Q) &&
+	    hwtstamp_rx_filter != HWTSTAMP_FILTER_NONE) {
+		/* next protocol non IPv4 or IPv6 */
+		if (((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto
+		    != htons(ETH_P_IP) &&
+		    ((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto
+		    != htons(ETH_P_IPV6))
+			return -1;
+		hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr);
+		hdr += sizeof(struct vlan_hdr);
+	}
+
+	if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4))
+		get_fixed_ipv4_csum(hw_checksum, skb, hdr);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6))
+		if (get_fixed_ipv6_csum(hw_checksum, skb, hdr))
+			return -1;
+#endif
+	return 0;
+}
+
 int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -744,13 +828,26 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
 
 		if (likely(dev->features & NETIF_F_RXCSUM)) {
-			if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
-			    (cqe->checksum == cpu_to_be16(0xffff))) {
-				ring->csum_ok++;
-				ip_summed = CHECKSUM_UNNECESSARY;
+			if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
+						      MLX4_CQE_STATUS_UDP)) {
+				if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+				    cqe->checksum == cpu_to_be16(0xffff)) {
+					ip_summed = CHECKSUM_UNNECESSARY;
+					ring->csum_ok++;
+				} else {
+					ip_summed = CHECKSUM_NONE;
+					ring->csum_none++;
+				}
 			} else {
-				ip_summed = CHECKSUM_NONE;
-				ring->csum_none++;
+				if (priv->flags & MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP &&
+				    (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+							       MLX4_CQE_STATUS_IPV6))) {
+					ip_summed = CHECKSUM_COMPLETE;
+					ring->csum_complete++;
+				} else {
+					ip_summed = CHECKSUM_NONE;
+					ring->csum_none++;
+				}
 			}
 		} else {
 			ip_summed = CHECKSUM_NONE;
@@ -776,6 +873,15 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			if (!nr)
 				goto next;
 
+			if (ip_summed == CHECKSUM_COMPLETE) {
+				void *va = skb_frag_address(skb_shinfo(gro_skb)->frags);
+				if (check_csum(cqe, gro_skb, va, ring->hwtstamp_rx_filter)) {
+					ip_summed = CHECKSUM_NONE;
+					ring->csum_none++;
+					ring->csum_complete--;
+				}
+			}
+
 			skb_shinfo(gro_skb)->nr_frags = nr;
 			gro_skb->len = length;
 			gro_skb->data_len = length;
@@ -822,6 +928,14 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			goto next;
 		}
 
+		if (ip_summed == CHECKSUM_COMPLETE) {
+			if (check_csum(cqe, skb, skb->data, ring->hwtstamp_rx_filter)) {
+				ip_summed = CHECKSUM_NONE;
+				ring->csum_complete--;
+				ring->csum_none++;
+			}
+		}
+
 		skb->ip_summed = ip_summed;
 		skb->protocol = eth_type_trans(skb, dev);
 		skb_record_rx_queue(skb, cq->ring);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 9f821964a1b9..2f6ba420ac03 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1629,6 +1629,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 	struct mlx4_init_hca_param init_hca;
 	u64 icm_size;
 	int err;
+	struct mlx4_config_dev_params params;
 
 	if (!mlx4_is_slave(dev)) {
 		err = mlx4_QUERY_FW(dev);
@@ -1762,6 +1763,14 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 		goto unmap_bf;
 	}
 
+	/* Query CONFIG_DEV parameters */
+	err = mlx4_config_dev_retrieval(dev, &params);
+	if (err && err != -ENOTSUPP) {
+		mlx4_err(dev, "Failed to query CONFIG_DEV parameters\n");
+	} else if (!err) {
+		dev->caps.rx_checksum_flags_port[1] = params.rx_csum_flags_port_1;
+		dev->caps.rx_checksum_flags_port[2] = params.rx_csum_flags_port_2;
+	}
 	priv->eq_table.inta_pin = adapter.inta_pin;
 	memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index ef83d127f406..de456749ffae 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -326,6 +326,7 @@ struct mlx4_en_rx_ring {
 #endif
 	unsigned long csum_ok;
 	unsigned long csum_none;
+	unsigned long csum_complete;
 	int hwtstamp_rx_filter;
 	cpumask_var_t affinity_mask;
 };
@@ -449,6 +450,7 @@ struct mlx4_en_port_stats {
 	unsigned long rx_alloc_failed;
 	unsigned long rx_chksum_good;
 	unsigned long rx_chksum_none;
+	unsigned long rx_chksum_complete;
 	unsigned long tx_chksum_offload;
 #define NUM_PORT_STATS		9
 };
@@ -507,7 +509,8 @@ enum {
 	MLX4_EN_FLAG_ENABLE_HW_LOOPBACK	= (1 << 2),
 	/* whether we need to drop packets that hardware loopback-ed */
 	MLX4_EN_FLAG_RX_FILTER_NEEDED	= (1 << 3),
-	MLX4_EN_FLAG_FORCE_PROMISC	= (1 << 4)
+	MLX4_EN_FLAG_FORCE_PROMISC	= (1 << 4),
+	MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP	= (1 << 5),
 };
 
 #define MLX4_EN_MAC_HASH_SIZE (1 << BITS_PER_BYTE)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 5cc5eac47d1b..3d9bff00f24a 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -497,6 +497,7 @@ struct mlx4_caps {
 	u16			hca_core_clock;
 	u64			phys_port_id[MLX4_MAX_PORTS + 1];
 	int			tunnel_offload_mode;
+	u8			rx_checksum_flags_port[MLX4_MAX_PORTS + 1];
 };
 
 struct mlx4_buf_list {
-- 
cgit v1.2.3-58-ga151


From 09626e9d153326ca82568e4e27f2daa53713992e Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 11 Nov 2014 13:29:42 -0800
Subject: net: kill netif_copy_real_num_queues()

vlan was the only user of netif_copy_real_num_queues(),
but it no longer calls it after
commit 4af429d29b341bb1735f04c2fb960178 ("vlan: lockless transmit path").
So we can just remove it.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 888d5513fa4a..4a6f770377d3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2762,23 +2762,6 @@ static inline int netif_set_real_num_rx_queues(struct net_device *dev,
 }
 #endif
 
-static inline int netif_copy_real_num_queues(struct net_device *to_dev,
-					     const struct net_device *from_dev)
-{
-	int err;
-
-	err = netif_set_real_num_tx_queues(to_dev,
-					   from_dev->real_num_tx_queues);
-	if (err)
-		return err;
-#ifdef CONFIG_SYSFS
-	return netif_set_real_num_rx_queues(to_dev,
-					    from_dev->real_num_rx_queues);
-#else
-	return 0;
-#endif
-}
-
 #ifdef CONFIG_SYSFS
 static inline unsigned int get_netdev_rx_queue_index(
 		struct netdev_rx_queue *queue)
-- 
cgit v1.2.3-58-ga151


From 85eb92e81801d64686eb78928d500a4c83ee9623 Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Sat, 1 Nov 2014 16:54:55 +0100
Subject: bcma: make it possible to specify a IRQ num in bcma_core_irq()

This moves bcma_core_irq() to main.c and add a extra parameter with a
number so that we can return different irq number for devices with more
than one.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/bcma/driver_chipcommon.c      |  2 +-
 drivers/bcma/driver_gpio.c            |  4 ++--
 drivers/bcma/driver_mips.c            | 11 ++---------
 drivers/bcma/driver_pci_host.c        |  4 ++--
 drivers/bcma/main.c                   | 22 ++++++++++++++++++++++
 include/linux/bcma/bcma.h             |  2 ++
 include/linux/bcma/bcma_driver_mips.h |  4 ++--
 7 files changed, 33 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bcma/driver_chipcommon.c b/drivers/bcma/driver_chipcommon.c
index b068f98920a8..19f679667ca4 100644
--- a/drivers/bcma/driver_chipcommon.c
+++ b/drivers/bcma/driver_chipcommon.c
@@ -339,7 +339,7 @@ void bcma_chipco_serial_init(struct bcma_drv_cc *cc)
 		return;
 	}
 
-	irq = bcma_core_irq(cc->core);
+	irq = bcma_core_irq(cc->core, 0);
 
 	/* Determine the registers of the UARTs */
 	cc->nr_serial_ports = (cc->capabilities & BCMA_CC_CAP_NRUART);
diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c
index 706b9ae0dcfb..598a6cd9028a 100644
--- a/drivers/bcma/driver_gpio.c
+++ b/drivers/bcma/driver_gpio.c
@@ -152,7 +152,7 @@ static int bcma_gpio_irq_domain_init(struct bcma_drv_cc *cc)
 					 handle_simple_irq);
 	}
 
-	hwirq = bcma_core_irq(cc->core);
+	hwirq = bcma_core_irq(cc->core, 0);
 	err = request_irq(hwirq, bcma_gpio_irq_handler, IRQF_SHARED, "gpio",
 			  cc);
 	if (err)
@@ -183,7 +183,7 @@ static void bcma_gpio_irq_domain_exit(struct bcma_drv_cc *cc)
 		return;
 
 	bcma_cc_mask32(cc, BCMA_CC_IRQMASK, ~BCMA_CC_IRQ_GPIO);
-	free_irq(bcma_core_irq(cc->core), cc);
+	free_irq(bcma_core_irq(cc->core, 0), cc);
 	for (gpio = 0; gpio < chip->ngpio; gpio++) {
 		int irq = irq_find_mapping(cc->irq_domain, gpio);
 
diff --git a/drivers/bcma/driver_mips.c b/drivers/bcma/driver_mips.c
index 004d6aa671ce..5ec69c3d409d 100644
--- a/drivers/bcma/driver_mips.c
+++ b/drivers/bcma/driver_mips.c
@@ -115,7 +115,7 @@ static u32 bcma_core_mips_irqflag(struct bcma_device *dev)
  * If disabled, 5 is returned.
  * If not supported, 6 is returned.
  */
-static unsigned int bcma_core_mips_irq(struct bcma_device *dev)
+unsigned int bcma_core_mips_irq(struct bcma_device *dev)
 {
 	struct bcma_device *mdev = dev->bus->drv_mips.core;
 	u32 irqflag;
@@ -133,13 +133,6 @@ static unsigned int bcma_core_mips_irq(struct bcma_device *dev)
 	return 5;
 }
 
-unsigned int bcma_core_irq(struct bcma_device *dev)
-{
-	unsigned int mips_irq = bcma_core_mips_irq(dev);
-	return mips_irq <= 4 ? mips_irq + 2 : 0;
-}
-EXPORT_SYMBOL(bcma_core_irq);
-
 static void bcma_core_mips_set_irq(struct bcma_device *dev, unsigned int irq)
 {
 	unsigned int oldirq = bcma_core_mips_irq(dev);
@@ -423,7 +416,7 @@ void bcma_core_mips_init(struct bcma_drv_mips *mcore)
 		break;
 	default:
 		list_for_each_entry(core, &bus->cores, list) {
-			core->irq = bcma_core_irq(core);
+			core->irq = bcma_core_irq(core, 0);
 		}
 		bcma_err(bus,
 			 "Unknown device (0x%x) found, can not configure IRQs\n",
diff --git a/drivers/bcma/driver_pci_host.c b/drivers/bcma/driver_pci_host.c
index c3d7b03c2fdc..c8a6b741967b 100644
--- a/drivers/bcma/driver_pci_host.c
+++ b/drivers/bcma/driver_pci_host.c
@@ -593,7 +593,7 @@ int bcma_core_pci_plat_dev_init(struct pci_dev *dev)
 	pr_info("PCI: Fixing up device %s\n", pci_name(dev));
 
 	/* Fix up interrupt lines */
-	dev->irq = bcma_core_irq(pc_host->pdev->core);
+	dev->irq = bcma_core_irq(pc_host->pdev->core, 0);
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
 
 	readrq = pcie_get_readrq(dev);
@@ -617,6 +617,6 @@ int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev)
 
 	pc_host = container_of(dev->bus->ops, struct bcma_drv_pci_host,
 			       pci_ops);
-	return bcma_core_irq(pc_host->pdev->core);
+	return bcma_core_irq(pc_host->pdev->core, 0);
 }
 EXPORT_SYMBOL(bcma_core_pci_pcibios_map_irq);
diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c
index 9b229c9c35e5..6d1cf5701452 100644
--- a/drivers/bcma/main.c
+++ b/drivers/bcma/main.c
@@ -169,6 +169,28 @@ static void bcma_of_fill_device(struct platform_device *parent,
 }
 #endif /* CONFIG_OF */
 
+unsigned int bcma_core_irq(struct bcma_device *core, int num)
+{
+	struct bcma_bus *bus = core->bus;
+	unsigned int mips_irq;
+
+	switch (bus->hosttype) {
+	case BCMA_HOSTTYPE_PCI:
+		return bus->host_pci->irq;
+	case BCMA_HOSTTYPE_SOC:
+		if (bus->drv_mips.core && num == 0) {
+			mips_irq = bcma_core_mips_irq(core);
+			return mips_irq <= 4 ? mips_irq + 2 : 0;
+		}
+		break;
+	case BCMA_HOSTTYPE_SDIO:
+		return 0;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bcma_core_irq);
+
 void bcma_prepare_core(struct bcma_bus *bus, struct bcma_device *core)
 {
 	core->dev.release = bcma_release_core_dev;
diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index 729f48e6b20b..eb1c6a47b67f 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -447,4 +447,6 @@ extern u32 bcma_chipco_pll_read(struct bcma_drv_cc *cc, u32 offset);
 #define  BCMA_DMA_TRANSLATION_DMA64_CMT	0x80000000 /* Client Mode Translation for 64-bit DMA */
 extern u32 bcma_core_dma_translation(struct bcma_device *core);
 
+extern unsigned int bcma_core_irq(struct bcma_device *core, int num);
+
 #endif /* LINUX_BCMA_H_ */
diff --git a/include/linux/bcma/bcma_driver_mips.h b/include/linux/bcma/bcma_driver_mips.h
index fb61f3fb4ddb..0b3b32aeeb8a 100644
--- a/include/linux/bcma/bcma_driver_mips.h
+++ b/include/linux/bcma/bcma_driver_mips.h
@@ -43,12 +43,12 @@ struct bcma_drv_mips {
 extern void bcma_core_mips_init(struct bcma_drv_mips *mcore);
 extern void bcma_core_mips_early_init(struct bcma_drv_mips *mcore);
 
-extern unsigned int bcma_core_irq(struct bcma_device *core);
+extern unsigned int bcma_core_mips_irq(struct bcma_device *dev);
 #else
 static inline void bcma_core_mips_init(struct bcma_drv_mips *mcore) { }
 static inline void bcma_core_mips_early_init(struct bcma_drv_mips *mcore) { }
 
-static inline unsigned int bcma_core_irq(struct bcma_device *core)
+static inline unsigned int bcma_core_mips_irq(struct bcma_device *dev)
 {
 	return 0;
 }
-- 
cgit v1.2.3-58-ga151


From 702bf371282f5912fe53f0b247fa2d7df9d7951f Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Wed, 12 Nov 2014 03:36:57 +0100
Subject: ieee820154: add pan_id setting support

This patch adds support for setting pan_id via nl802154 framework.
Adding a comment because setting 0xffff as pan_id seems to be valid
setting. The pan_id 0xffff as source pan is invalid. I am not sure now
about this setting but for the current netlink interface this is an
invalid setting, so we do the same now. Maybe we need to change that
when we have coordinator support and association support.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h |  2 ++
 include/net/cfg802154.h    |  2 ++
 net/ieee802154/nl802154.c  | 31 +++++++++++++++++++++++++++++++
 net/ieee802154/rdev-ops.h  |  7 +++++++
 net/mac802154/cfg.c        | 20 ++++++++++++++++++++
 5 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index d043449a079d..d40379876b84 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -30,6 +30,8 @@
 #define IEEE802154_MTU			127
 #define IEEE802154_MIN_PSDU_LEN		5
 
+#define IEEE802154_PAN_ID_BROADCAST	0xffff
+
 #define IEEE802154_EXTENDED_ADDR_LEN	8
 
 #define IEEE802154_FC_TYPE_BEACON	0x0	/* Frame is beacon */
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 391fdb37208d..d07b0726b285 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -38,6 +38,8 @@ struct cfg802154_ops {
 	void	(*del_virtual_intf_deprecated)(struct wpan_phy *wpan_phy,
 					       struct net_device *dev);
 	int	(*set_channel)(struct wpan_phy *wpan_phy, u8 page, u8 channel);
+	int	(*set_pan_id)(struct wpan_phy *wpan_phy,
+			      struct wpan_dev *wpan_dev, u16 pan_id);
 };
 
 struct wpan_phy {
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index d8ef2c8a182f..88cd1293283a 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -570,6 +570,29 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info)
 	return rdev_set_channel(rdev, page, channel);
 }
 
+static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg802154_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+	u16 pan_id;
+
+	/* conflict here while tx/rx calls */
+	if (netif_running(dev))
+		return -EBUSY;
+
+	/* don't change address fields on monitor */
+	if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+		return -EINVAL;
+
+	if (!info->attrs[NL802154_ATTR_PAN_ID])
+		return -EINVAL;
+
+	pan_id = nla_get_u16(info->attrs[NL802154_ATTR_PAN_ID]);
+
+	return rdev_set_pan_id(rdev, wpan_dev, pan_id);
+}
+
 #define NL802154_FLAG_NEED_WPAN_PHY	0x01
 #define NL802154_FLAG_NEED_NETDEV	0x02
 #define NL802154_FLAG_NEED_RTNL		0x04
@@ -688,6 +711,14 @@ static const struct genl_ops nl802154_ops[] = {
 		.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
 				  NL802154_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL802154_CMD_SET_PAN_ID,
+		.doit = nl802154_set_pan_id,
+		.policy = nl802154_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL802154_FLAG_NEED_NETDEV |
+				  NL802154_FLAG_NEED_RTNL,
+	},
 };
 
 /* initialisation/exit functions */
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 8a3b0eb4e026..4115ea264fd5 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -27,4 +27,11 @@ rdev_set_channel(struct cfg802154_registered_device *rdev, const u8 page,
 	return rdev->ops->set_channel(&rdev->wpan_phy, page, channel);
 }
 
+static inline int
+rdev_set_pan_id(struct cfg802154_registered_device *rdev,
+		struct wpan_dev *wpan_dev, u16 pan_id)
+{
+	return rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
+}
+
 #endif /* __CFG802154_RDEV_OPS */
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index 9d5b1895c752..db6e5e981a83 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -64,8 +64,28 @@ ieee802154_set_channel(struct wpan_phy *wpan_phy, const u8 page,
 	return ret;
 }
 
+static int ieee802154_set_pan_id(struct wpan_phy *wpan_phy,
+				 struct wpan_dev *wpan_dev, const u16 pan_id)
+{
+	ASSERT_RTNL();
+
+	/* TODO
+	 * I am not sure about to check here on broadcast pan_id.
+	 * Broadcast is a valid setting, comment from 802.15.4:
+	 * If this value is 0xffff, the device is not associated.
+	 *
+	 * This could useful to simple deassociate an device.
+	 */
+	if (pan_id == IEEE802154_PAN_ID_BROADCAST)
+		return -EINVAL;
+
+	wpan_dev->pan_id = cpu_to_le16(pan_id);
+	return 0;
+}
+
 const struct cfg802154_ops mac802154_config_ops = {
 	.add_virtual_intf_deprecated = ieee802154_add_iface_deprecated,
 	.del_virtual_intf_deprecated = ieee802154_del_iface_deprecated,
 	.set_channel = ieee802154_set_channel,
+	.set_pan_id = ieee802154_set_pan_id,
 };
-- 
cgit v1.2.3-58-ga151


From 9830c62a0b3d57d9d00880989cfe987f581bc03f Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Wed, 12 Nov 2014 03:36:58 +0100
Subject: ieee820154: add short_addr setting support

This patch adds support for setting short address via nl802154 framework.
Also added a comment because a 0xffff seems to be valid address that we
don't have a short address. This is a valid setting but we need
more checks in upper layers to don't allow this address as source address.
Also the current netlink interface doesn't allow to set the short_addr
to 0xffff. Same for the 0xfffe short address which describes a not
allocated short address.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h |  2 ++
 include/net/cfg802154.h    |  2 ++
 net/ieee802154/nl802154.c  | 31 +++++++++++++++++++++++++++++++
 net/ieee802154/rdev-ops.h  |  7 +++++++
 net/mac802154/cfg.c        | 26 ++++++++++++++++++++++++++
 5 files changed, 68 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index d40379876b84..ce0f96a55976 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -31,6 +31,8 @@
 #define IEEE802154_MIN_PSDU_LEN		5
 
 #define IEEE802154_PAN_ID_BROADCAST	0xffff
+#define IEEE802154_ADDR_SHORT_BROADCAST	0xffff
+#define IEEE802154_ADDR_SHORT_UNSPEC	0xfffe
 
 #define IEEE802154_EXTENDED_ADDR_LEN	8
 
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index d07b0726b285..e8a4c2b70720 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -40,6 +40,8 @@ struct cfg802154_ops {
 	int	(*set_channel)(struct wpan_phy *wpan_phy, u8 page, u8 channel);
 	int	(*set_pan_id)(struct wpan_phy *wpan_phy,
 			      struct wpan_dev *wpan_dev, u16 pan_id);
+	int	(*set_short_addr)(struct wpan_phy *wpan_phy,
+				  struct wpan_dev *wpan_dev, u16 short_addr);
 };
 
 struct wpan_phy {
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 88cd1293283a..2978c1a78017 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -593,6 +593,29 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
 	return rdev_set_pan_id(rdev, wpan_dev, pan_id);
 }
 
+static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg802154_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+	u16 short_addr;
+
+	/* conflict here while tx/rx calls */
+	if (netif_running(dev))
+		return -EBUSY;
+
+	/* don't change address fields on monitor */
+	if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+		return -EINVAL;
+
+	if (!info->attrs[NL802154_ATTR_SHORT_ADDR])
+		return -EINVAL;
+
+	short_addr = nla_get_u16(info->attrs[NL802154_ATTR_SHORT_ADDR]);
+
+	return rdev_set_short_addr(rdev, wpan_dev, short_addr);
+}
+
 #define NL802154_FLAG_NEED_WPAN_PHY	0x01
 #define NL802154_FLAG_NEED_NETDEV	0x02
 #define NL802154_FLAG_NEED_RTNL		0x04
@@ -719,6 +742,14 @@ static const struct genl_ops nl802154_ops[] = {
 		.internal_flags = NL802154_FLAG_NEED_NETDEV |
 				  NL802154_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL802154_CMD_SET_SHORT_ADDR,
+		.doit = nl802154_set_short_addr,
+		.policy = nl802154_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL802154_FLAG_NEED_NETDEV |
+				  NL802154_FLAG_NEED_RTNL,
+	},
 };
 
 /* initialisation/exit functions */
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 4115ea264fd5..16b0de06c3af 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -34,4 +34,11 @@ rdev_set_pan_id(struct cfg802154_registered_device *rdev,
 	return rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
 }
 
+static inline int
+rdev_set_short_addr(struct cfg802154_registered_device *rdev,
+		    struct wpan_dev *wpan_dev, u16 short_addr)
+{
+	return rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
+}
+
 #endif /* __CFG802154_RDEV_OPS */
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index db6e5e981a83..df29976d1321 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -83,9 +83,35 @@ static int ieee802154_set_pan_id(struct wpan_phy *wpan_phy,
 	return 0;
 }
 
+static int
+ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+			  const u16 short_addr)
+{
+	ASSERT_RTNL();
+
+	/* TODO
+	 * I am not sure about to check here on broadcast short_addr.
+	 * Broadcast is a valid setting, comment from 802.15.4:
+	 * A value of 0xfffe indicates that the device has
+	 * associated but has not been allocated an address. A
+	 * value of 0xffff indicates that the device does not
+	 * have a short address.
+	 *
+	 * I think we should allow to set these settings but
+	 * don't allow to allow socket communication with it.
+	 */
+	if (short_addr == IEEE802154_ADDR_SHORT_UNSPEC ||
+	    short_addr == IEEE802154_ADDR_SHORT_BROADCAST)
+		return -EINVAL;
+
+	wpan_dev->short_addr = cpu_to_le16(short_addr);
+	return 0;
+}
+
 const struct cfg802154_ops mac802154_config_ops = {
 	.add_virtual_intf_deprecated = ieee802154_add_iface_deprecated,
 	.del_virtual_intf_deprecated = ieee802154_del_iface_deprecated,
 	.set_channel = ieee802154_set_channel,
 	.set_pan_id = ieee802154_set_pan_id,
+	.set_short_addr = ieee802154_set_short_addr,
 };
-- 
cgit v1.2.3-58-ga151


From 71dfda58aaaf4bf6b1bc59f9d8afa635fa1337d4 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Tue, 11 Nov 2014 09:26:34 -0800
Subject: net: Add device Rx page allocation function

This patch implements __dev_alloc_pages and __dev_alloc_page.  These are
meant to replace the __skb_alloc_pages and __skb_alloc_page functions.  The
reason for doing this is that it occurred to me that __skb_alloc_page is
supposed to be passed an sk_buff pointer, but it is NULL in all cases where
it is used.  Worse is that in the case of ixgbe it is passed NULL via the
sk_buff pointer in the rx_buffer info structure which means the compiler is
not correctly stripping it out.

The naming for these functions is based on dev_alloc_skb and __dev_alloc_skb.
There was originally a netdev_alloc_page, however that was passed a
net_device pointer and this function is not so I thought it best to follow
that naming scheme since that is the same difference between dev_alloc_skb
and netdev_alloc_skb.

In the case of anything greater than order 0 it is assumed that we want a
compound page so __GFP_COMP is set for all allocations as we expect a
compound page when assigning a page frag.

The other change in this patch is to exploit the behaviors of the page
allocator in how it handles flags.  So for example we can always set
__GFP_COMP and __GFP_MEMALLOC since they are ignored if they are not
applicable or are overridden by another flag.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 103fbe8113f8..2e5221f1ec72 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2184,6 +2184,54 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 	return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
 }
 
+/**
+ * __dev_alloc_pages - allocate page for network Rx
+ * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
+ * @order: size of the allocation
+ *
+ * Allocate a new page.
+ *
+ * %NULL is returned if there is no free memory.
+*/
+static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
+					     unsigned int order)
+{
+	/* This piece of code contains several assumptions.
+	 * 1.  This is for device Rx, therefor a cold page is preferred.
+	 * 2.  The expectation is the user wants a compound page.
+	 * 3.  If requesting a order 0 page it will not be compound
+	 *     due to the check to see if order has a value in prep_new_page
+	 * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
+	 *     code in gfp_to_alloc_flags that should be enforcing this.
+	 */
+	gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC;
+
+	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+}
+
+static inline struct page *dev_alloc_pages(unsigned int order)
+{
+	return __dev_alloc_pages(GFP_ATOMIC, order);
+}
+
+/**
+ * __dev_alloc_page - allocate a page for network Rx
+ * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
+ *
+ * Allocate a new page.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+static inline struct page *__dev_alloc_page(gfp_t gfp_mask)
+{
+	return __dev_alloc_pages(gfp_mask, 0);
+}
+
+static inline struct page *dev_alloc_page(void)
+{
+	return __dev_alloc_page(GFP_ATOMIC);
+}
+
 /**
  *	__skb_alloc_pages - allocate pages for ps-rx on a skb and preserve pfmemalloc data
  *	@gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
-- 
cgit v1.2.3-58-ga151


From 160d2aba550b23c6a538158511d5adccc400f04c Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Tue, 11 Nov 2014 09:27:05 -0800
Subject: net: Remove __skb_alloc_page and __skb_alloc_pages

Remove the two functions which are now dead code.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2e5221f1ec72..73c370e615de 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2232,49 +2232,6 @@ static inline struct page *dev_alloc_page(void)
 	return __dev_alloc_page(GFP_ATOMIC);
 }
 
-/**
- *	__skb_alloc_pages - allocate pages for ps-rx on a skb and preserve pfmemalloc data
- *	@gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
- *	@skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used
- *	@order: size of the allocation
- *
- * 	Allocate a new page.
- *
- * 	%NULL is returned if there is no free memory.
-*/
-static inline struct page *__skb_alloc_pages(gfp_t gfp_mask,
-					      struct sk_buff *skb,
-					      unsigned int order)
-{
-	struct page *page;
-
-	gfp_mask |= __GFP_COLD;
-
-	if (!(gfp_mask & __GFP_NOMEMALLOC))
-		gfp_mask |= __GFP_MEMALLOC;
-
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
-	if (skb && page && page->pfmemalloc)
-		skb->pfmemalloc = true;
-
-	return page;
-}
-
-/**
- *	__skb_alloc_page - allocate a page for ps-rx for a given skb and preserve pfmemalloc data
- *	@gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
- *	@skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used
- *
- * 	Allocate a new page.
- *
- * 	%NULL is returned if there is no free memory.
- */
-static inline struct page *__skb_alloc_page(gfp_t gfp_mask,
-					     struct sk_buff *skb)
-{
-	return __skb_alloc_pages(gfp_mask, skb, 0);
-}
-
 /**
  *	skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
  *	@page: The page that was allocated from skb_alloc_page
-- 
cgit v1.2.3-58-ga151


From c31accd159a6477b91de61ae237dce38e3f3ee4d Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Tue, 11 Nov 2014 19:45:57 +0100
Subject: net: phy: add module_phy_driver macro

Add helper macro for PHY drivers which do not do anything special in
module init/exit. This will allow us to eliminate a lot of boilerplate
code.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index d090cfcaa167..07794e720139 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -772,4 +772,28 @@ int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
 
 extern struct bus_type mdio_bus_type;
+
+/**
+ * module_phy_driver() - Helper macro for registering PHY drivers
+ * @__phy_drivers: array of PHY drivers to register
+ *
+ * Helper macro for PHY drivers which do not do anything special in module
+ * init/exit. Each module may only use this macro once, and calling it
+ * replaces module_init() and module_exit().
+ */
+#define phy_module_driver(__phy_drivers, __count)			\
+static int __init phy_module_init(void)					\
+{									\
+	return phy_drivers_register(__phy_drivers, __count);		\
+}									\
+module_init(phy_module_init);						\
+static void __exit phy_module_exit(void)				\
+{									\
+	phy_drivers_unregister(__phy_drivers, __count);			\
+}									\
+module_exit(phy_module_exit)
+
+#define module_phy_driver(__phy_drivers)				\
+	phy_module_driver(__phy_drivers, ARRAY_SIZE(__phy_drivers))
+
 #endif /* __PHY_H */
-- 
cgit v1.2.3-58-ga151


From 61f2dcba9a03d4fd9342f0d6821af0a46c7098e9 Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Wed, 12 Nov 2014 19:51:56 +0100
Subject: mac802154: add interframe spacing time handling

This patch adds a new interframe spacing time handling into mac802154
layer. Interframe spacing time is a time period between each transmit.
This patch adds a high resolution timer into mac802154 and starts on
xmit complete with corresponding interframe spacing expire time if
ifs_handling is true. We make it variable because it depends if
interframe spacing time is handled by transceiver or mac802154. At the
timer complete function we wake the netdev queue again. This avoids
new frame transmit in range of interframe spacing time.

For synced driver we add no handling of interframe spacing time. This
is currently a lack of support in all synced xmit drivers. I suppose
it's working because the latency of workqueue which is needed to call
spi_sync.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/at86rf230.c |  2 +-
 include/linux/ieee802154.h         |  3 +++
 include/net/cfg802154.h            |  8 ++++++++
 include/net/mac802154.h            |  3 ++-
 net/mac802154/ieee802154_i.h       |  4 ++++
 net/mac802154/iface.c              |  2 ++
 net/mac802154/main.c               | 17 +++++++++++++++++
 net/mac802154/tx.c                 |  2 +-
 net/mac802154/util.c               | 32 +++++++++++++++++++++++++++++---
 9 files changed, 67 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c
index 31d62f9c6ce8..46e50295710a 100644
--- a/drivers/net/ieee802154/at86rf230.c
+++ b/drivers/net/ieee802154/at86rf230.c
@@ -731,7 +731,7 @@ at86rf230_tx_complete(void *context)
 			udelay(lp->data->t_sifs);
 	}
 
-	ieee802154_xmit_complete(lp->hw, skb);
+	ieee802154_xmit_complete(lp->hw, skb, false);
 }
 
 static void
diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index ce0f96a55976..5a40c0418438 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -36,6 +36,9 @@
 
 #define IEEE802154_EXTENDED_ADDR_LEN	8
 
+#define IEEE802154_LIFS_PERIOD		40
+#define IEEE802154_SIFS_PERIOD		12
+
 #define IEEE802154_FC_TYPE_BEACON	0x0	/* Frame is beacon */
 #define	IEEE802154_FC_TYPE_DATA		0x1	/* Frame is data */
 #define IEEE802154_FC_TYPE_ACK		0x2	/* Frame is acknowledgment */
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index fa0a9e519523..17b4fc0705b2 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -81,6 +81,14 @@ struct wpan_phy {
 
 	s32 cca_ed_level;
 
+	/* PHY depended MAC PIB values */
+
+	/* 802.15.4 acronym: Tdsym in usec */
+	u8 symbol_duration;
+	/* lifs and sifs periods timing */
+	u16 lifs_period;
+	u16 sifs_period;
+
 	struct device dev;
 
 	char priv[0] __aligned(NETDEV_ALIGN);
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 632f6566adb5..c823d910b46c 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -260,6 +260,7 @@ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb,
 
 void ieee802154_wake_queue(struct ieee802154_hw *hw);
 void ieee802154_stop_queue(struct ieee802154_hw *hw);
-void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb);
+void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
+			      bool ifs_handling);
 
 #endif /* NET_MAC802154_H */
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index 69cb585e162f..c5b231047b60 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -20,6 +20,7 @@
 #define __IEEE802154_I_H
 
 #include <linux/mutex.h>
+#include <linux/hrtimer.h>
 #include <net/cfg802154.h>
 #include <net/mac802154.h>
 #include <net/ieee802154_netdev.h>
@@ -51,6 +52,8 @@ struct ieee802154_local {
 	 */
 	struct workqueue_struct	*workqueue;
 
+	struct hrtimer ifs_timer;
+
 	bool started;
 
 	struct tasklet_struct tasklet;
@@ -127,6 +130,7 @@ ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
 void mac802154_wpan_setup(struct net_device *dev);
 netdev_tx_t
 ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev);
+enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer);
 
 /* MIB callbacks */
 void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val);
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index ec92b48d1b0b..feb064715d1f 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -246,6 +246,8 @@ static int mac802154_slave_close(struct net_device *dev)
 
 	ASSERT_RTNL();
 
+	hrtimer_cancel(&local->ifs_timer);
+
 	netif_stop_queue(dev);
 	local->open_count--;
 
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index 46c76e005446..0af1be64e8ad 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -125,6 +125,18 @@ void ieee802154_free_hw(struct ieee802154_hw *hw)
 }
 EXPORT_SYMBOL(ieee802154_free_hw);
 
+static void ieee802154_setup_wpan_phy_pib(struct wpan_phy *wpan_phy)
+{
+	/* TODO warn on empty symbol_duration
+	 * Should be done when all drivers sets this value.
+	 */
+
+	wpan_phy->lifs_period = IEEE802154_LIFS_PERIOD *
+				wpan_phy->symbol_duration;
+	wpan_phy->sifs_period = IEEE802154_SIFS_PERIOD *
+				wpan_phy->symbol_duration;
+}
+
 int ieee802154_register_hw(struct ieee802154_hw *hw)
 {
 	struct ieee802154_local *local = hw_to_local(hw);
@@ -138,8 +150,13 @@ int ieee802154_register_hw(struct ieee802154_hw *hw)
 		goto out;
 	}
 
+	hrtimer_init(&local->ifs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	local->ifs_timer.function = ieee802154_xmit_ifs_timer;
+
 	wpan_phy_set_dev(local->phy, local->hw.parent);
 
+	ieee802154_setup_wpan_phy_pib(local->phy);
+
 	rc = wpan_phy_register(local->phy);
 	if (rc < 0)
 		goto out_wq;
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index cc37b77f2632..c62e95695c78 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -60,7 +60,7 @@ static void ieee802154_xmit_worker(struct work_struct *work)
 	if (res)
 		goto err_tx;
 
-	ieee802154_xmit_complete(&local->hw, skb);
+	ieee802154_xmit_complete(&local->hw, skb, false);
 
 	dev->stats.tx_packets++;
 	dev->stats.tx_bytes += skb->len;
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index 9a04e4a8e50f..5fc979027919 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -50,9 +50,35 @@ void ieee802154_stop_queue(struct ieee802154_hw *hw)
 }
 EXPORT_SYMBOL(ieee802154_stop_queue);
 
-void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb)
+enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer)
 {
-	ieee802154_wake_queue(hw);
-	consume_skb(skb);
+	struct ieee802154_local *local =
+		container_of(timer, struct ieee802154_local, ifs_timer);
+
+	ieee802154_wake_queue(&local->hw);
+
+	return HRTIMER_NORESTART;
+}
+
+void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
+			      bool ifs_handling)
+{
+	if (ifs_handling) {
+		struct ieee802154_local *local = hw_to_local(hw);
+
+		if (skb->len > 18)
+			hrtimer_start(&local->ifs_timer,
+				      ktime_set(0, hw->phy->lifs_period * NSEC_PER_USEC),
+				      HRTIMER_MODE_REL);
+		else
+			hrtimer_start(&local->ifs_timer,
+				      ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC),
+				      HRTIMER_MODE_REL);
+
+		consume_skb(skb);
+	} else {
+		ieee802154_wake_queue(hw);
+		consume_skb(skb);
+	}
 }
 EXPORT_SYMBOL(ieee802154_xmit_complete);
-- 
cgit v1.2.3-58-ga151


From 1b2f309d70daf04b6a97b3753e375654532f6207 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 13 Nov 2014 18:11:20 +0800
Subject: rhashtable: Move mutex_is_held under PROVE_LOCKING

The rhashtable function mutex_is_held is only used when PROVE_LOCKING
is enabled.  This patch makes the mutex_is_held field in rhashtable
optional depending on PROVE_LOCKING.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 2 ++
 lib/rhashtable.c           | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index fb298e9d6d3a..96ce8ceff554 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -65,7 +65,9 @@ struct rhashtable_params {
 						 size_t new_size);
 	bool			(*shrink_decision)(const struct rhashtable *ht,
 						   size_t new_size);
+#ifdef CONFIG_PROVE_LOCKING
 	int			(*mutex_is_held)(void);
+#endif
 };
 
 /**
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 081be3ba9ea8..c7654b6f5f64 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -532,7 +532,9 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
  *	.key_offset = offsetof(struct test_obj, key),
  *	.key_len = sizeof(int),
  *	.hashfn = arch_fast_hash,
+ * #ifdef CONFIG_PROVE_LOCKING
  *	.mutex_is_held = &my_mutex_is_held,
+ * #endif
  * };
  *
  * Configuration Example 2: Variable length keys
@@ -552,7 +554,9 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
  *	.head_offset = offsetof(struct test_obj, node),
  *	.hashfn = arch_fast_hash,
  *	.obj_hashfn = my_hash_fn,
+ * #ifdef CONFIG_PROVE_LOCKING
  *	.mutex_is_held = &my_mutex_is_held,
+ * #endif
  * };
  */
 int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
@@ -613,10 +617,12 @@ EXPORT_SYMBOL_GPL(rhashtable_destroy);
 #define TEST_PTR	((void *) 0xdeadbeef)
 #define TEST_NEXPANDS	4
 
+#ifdef CONFIG_PROVE_LOCKING
 static int test_mutex_is_held(void)
 {
 	return 1;
 }
+#endif
 
 struct test_obj {
 	void			*ptr;
@@ -767,7 +773,9 @@ static int __init test_rht_init(void)
 		.key_offset = offsetof(struct test_obj, value),
 		.key_len = sizeof(int),
 		.hashfn = arch_fast_hash,
+#ifdef CONFIG_PROVE_LOCKING
 		.mutex_is_held = &test_mutex_is_held,
+#endif
 		.grow_decision = rht_grow_above_75,
 		.shrink_decision = rht_shrink_below_30,
 	};
-- 
cgit v1.2.3-58-ga151


From 7b4ce2353467fdab6e003be7a3129fb09b09deac Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 13 Nov 2014 18:11:22 +0800
Subject: rhashtable: Add parent argument to mutex_is_held

Currently mutex_is_held can only test locks in the that are global
since it takes no arguments.  This prevents rhashtable from being
used in places where locks are lock, e.g., per-namespace locks.

This patch adds a parent field to mutex_is_held and rhashtable_params
so that local locks can be used (and tested).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 3 ++-
 lib/rhashtable.c           | 4 ++--
 net/netfilter/nft_hash.c   | 2 +-
 net/netlink/af_netlink.c   | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 96ce8ceff554..473e26bdb91d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -66,7 +66,8 @@ struct rhashtable_params {
 	bool			(*shrink_decision)(const struct rhashtable *ht,
 						   size_t new_size);
 #ifdef CONFIG_PROVE_LOCKING
-	int			(*mutex_is_held)(void);
+	int			(*mutex_is_held)(void *parent);
+	void			*parent;
 #endif
 };
 
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index c7654b6f5f64..4b4b53bfa08b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -32,7 +32,7 @@
 #ifdef CONFIG_PROVE_LOCKING
 int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
 {
-	return ht->p.mutex_is_held();
+	return ht->p.mutex_is_held(ht->p.parent);
 }
 EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
 #endif
@@ -618,7 +618,7 @@ EXPORT_SYMBOL_GPL(rhashtable_destroy);
 #define TEST_NEXPANDS	4
 
 #ifdef CONFIG_PROVE_LOCKING
-static int test_mutex_is_held(void)
+static int test_mutex_is_held(void *parent)
 {
 	return 1;
 }
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index b86305c86048..3f75aaaf9d06 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -154,7 +154,7 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
 }
 
 #ifdef CONFIG_PROVE_LOCKING
-static int lockdep_nfnl_lock_is_held(void)
+static int lockdep_nfnl_lock_is_held(void *parent)
 {
 	return lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES);
 }
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 53b8ea793191..9e0628cfdf67 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -115,7 +115,7 @@ DEFINE_MUTEX(nl_sk_hash_lock);
 EXPORT_SYMBOL_GPL(nl_sk_hash_lock);
 
 #ifdef CONFIG_PROVE_LOCKING
-static int lockdep_nl_sk_hash_is_held(void)
+static int lockdep_nl_sk_hash_is_held(void *parent)
 {
 	if (debug_locks)
 		return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock);
-- 
cgit v1.2.3-58-ga151


From 7ae0e400cd9396c41fe596d35dcc34feaa89a04f Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 13 Nov 2014 14:45:32 +0200
Subject: net/mlx4_core: Flexible (asymmetric) allocation of EQs and MSI-X
 vectors for PF/VFs

Previously, the driver queried the firmware in order to get the number
of supported EQs. Under SRIOV, since this was done before the driver
notified the firmware how many VFs it actually needs, the firmware had
to take into account a worst case scenario and always allocated four EQs
per VF, where one was used for events while the others were used for completions.

Now, when the firmware supports the asymmetric allocation scheme, denoted
by exposing num_sys_eqs > 0 (--> MLX4_DEV_CAP_FLAG2_SYS_EQS), we use the
QUERY_FUNC command to query the firmware before enabling SRIOV. Thus we
can get more EQs and MSI-X vectors per function.

Moreover, when running in the new firmware/driver mode, the limitation
that the number of EQs should be a power of two is lifted.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/main.c            |   3 +-
 drivers/net/ethernet/mellanox/mlx4/eq.c      |   8 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c      |  52 +++++++---
 drivers/net/ethernet/mellanox/mlx4/fw.h      |   2 +
 drivers/net/ethernet/mellanox/mlx4/main.c    | 144 +++++++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx4/profile.c |  19 ++--
 include/linux/mlx4/device.h                  |   4 +-
 7 files changed, 190 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 8b72cf392b34..0c3375524a64 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1975,8 +1975,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
 	    dev->caps.num_ports > dev->caps.comp_pool)
 		return;
 
-	eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/
-					dev->caps.num_ports);
+	eq_per_port = dev->caps.comp_pool / dev->caps.num_ports;
 
 	/* Init eq table */
 	added_eqs = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
index 49290a405903..d68b264cee4d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -1123,8 +1123,12 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
 		goto err_out_free;
 	}
 
-	err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
-			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0);
+	err = mlx4_bitmap_init(&priv->eq_table.bitmap,
+			       roundup_pow_of_two(dev->caps.num_eqs),
+			       dev->caps.num_eqs - 1,
+			       dev->caps.reserved_eqs,
+			       roundup_pow_of_two(dev->caps.num_eqs) -
+			       dev->caps.num_eqs);
 	if (err)
 		goto err_out_free;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index b3bbeb97da14..d2f594fadfbf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -142,7 +142,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[13] = "Large cache line (>64B) EQE stride support",
 		[14] = "Ethernet protocol control support",
 		[15] = "Ethernet Backplane autoneg support",
-		[16] = "CONFIG DEV support"
+		[16] = "CONFIG DEV support",
+		[17] = "Asymmetric EQs support"
 	};
 	int i;
 
@@ -200,7 +201,6 @@ int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave)
 	outbox = mailbox->buf;
 
 	in_modifier = slave;
-	mlx4_dbg(dev, "%s for VF %d\n", __func__, in_modifier);
 
 	err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, 0,
 			   MLX4_CMD_QUERY_FUNC,
@@ -243,6 +243,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 	u8	field, port;
 	u32	size, proxy_qp, qkey;
 	int	err = 0;
+	struct mlx4_func func;
 
 #define QUERY_FUNC_CAP_FLAGS_OFFSET		0x0
 #define QUERY_FUNC_CAP_NUM_PORTS_OFFSET		0x1
@@ -287,6 +288,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 #define QUERY_FUNC_CAP_VF_ENABLE_QP0		0x08
 
 #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80
+#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31)
 
 	if (vhcr->op_modifier == 1) {
 		struct mlx4_active_ports actv_ports =
@@ -365,11 +367,24 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 		size = dev->caps.num_cqs;
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP);
 
-		size = dev->caps.num_eqs;
-		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
-
-		size = dev->caps.reserved_eqs;
-		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+		if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) ||
+		    mlx4_QUERY_FUNC(dev, &func, slave)) {
+			size = vhcr->in_modifier &
+				QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS ?
+				dev->caps.num_eqs :
+				rounddown_pow_of_two(dev->caps.num_eqs);
+			MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+			size = dev->caps.reserved_eqs;
+			MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+		} else {
+			size = vhcr->in_modifier &
+				QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS ?
+				func.max_eq :
+				rounddown_pow_of_two(func.max_eq);
+			MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+			size = func.rsvd_eqs;
+			MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+		}
 
 		size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave];
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET);
@@ -399,14 +414,17 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 	u8			field, op_modifier;
 	u32			size, qkey;
 	int			err = 0, quotas = 0;
+	u32                     in_modifier;
 
 	op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */
+	in_modifier = op_modifier ? gen_or_port :
+		QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS;
 
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
 	if (IS_ERR(mailbox))
 		return PTR_ERR(mailbox);
 
-	err = mlx4_cmd_box(dev, 0, mailbox->dma, gen_or_port, op_modifier,
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, op_modifier,
 			   MLX4_CMD_QUERY_FUNC_CAP,
 			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
 	if (err)
@@ -578,6 +596,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET		0x21
 #define QUERY_DEV_CAP_RSVD_MRW_OFFSET		0x22
 #define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET	0x23
+#define QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET		0x26
 #define QUERY_DEV_CAP_MAX_AV_OFFSET		0x27
 #define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET		0x29
 #define QUERY_DEV_CAP_MAX_RES_QP_OFFSET		0x2b
@@ -678,6 +697,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev_cap->reserved_mrws = 1 << (field & 0xf);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET);
 	dev_cap->max_mtt_seg = 1 << (field & 0x3f);
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET);
+	dev_cap->num_sys_eqs = size & 0xfff;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET);
 	dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
@@ -905,8 +926,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	 * we can't use any EQs whose doorbell falls on that page,
 	 * even if the EQ itself isn't reserved.
 	 */
-	dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
-				    dev_cap->reserved_eqs);
+	if (dev_cap->num_sys_eqs == 0)
+		dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
+					    dev_cap->reserved_eqs);
+	else
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SYS_EQS;
 
 	mlx4_dbg(dev, "Max ICM size %lld MB\n",
 		 (unsigned long long) dev_cap->max_icm_sz >> 20);
@@ -916,8 +940,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		 dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz);
 	mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
 		 dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz);
-	mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
-		 dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz);
+	mlx4_dbg(dev, "Num sys EQs: %d, max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		 dev_cap->num_sys_eqs, dev_cap->max_eqs, dev_cap->reserved_eqs,
+		 dev_cap->eqc_entry_sz);
 	mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
 		 dev_cap->reserved_mrws, dev_cap->reserved_mtts);
 	mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
@@ -1463,6 +1488,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 #define	 INIT_HCA_AUXC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x50)
 #define	 INIT_HCA_EQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x60)
 #define	 INIT_HCA_LOG_EQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x67)
+#define	INIT_HCA_NUM_SYS_EQS_OFFSET	(INIT_HCA_QPC_OFFSET + 0x6a)
 #define	 INIT_HCA_RDMARC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x70)
 #define	 INIT_HCA_LOG_RD_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x77)
 #define INIT_HCA_MCAST_OFFSET		 0x0c0
@@ -1566,6 +1592,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	MLX4_PUT(inbox, param->auxc_base,     INIT_HCA_AUXC_BASE_OFFSET);
 	MLX4_PUT(inbox, param->eqc_base,      INIT_HCA_EQC_BASE_OFFSET);
 	MLX4_PUT(inbox, param->log_num_eqs,   INIT_HCA_LOG_EQ_OFFSET);
+	MLX4_PUT(inbox, param->num_sys_eqs,   INIT_HCA_NUM_SYS_EQS_OFFSET);
 	MLX4_PUT(inbox, param->rdmarc_base,   INIT_HCA_RDMARC_BASE_OFFSET);
 	MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
 
@@ -1676,6 +1703,7 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 	MLX4_GET(param->auxc_base,     outbox, INIT_HCA_AUXC_BASE_OFFSET);
 	MLX4_GET(param->eqc_base,      outbox, INIT_HCA_EQC_BASE_OFFSET);
 	MLX4_GET(param->log_num_eqs,   outbox, INIT_HCA_LOG_EQ_OFFSET);
+	MLX4_GET(param->num_sys_eqs,   outbox, INIT_HCA_NUM_SYS_EQS_OFFSET);
 	MLX4_GET(param->rdmarc_base,   outbox, INIT_HCA_RDMARC_BASE_OFFSET);
 	MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 48c11b5e73e7..475215ee370f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -56,6 +56,7 @@ struct mlx4_dev_cap {
 	int max_mpts;
 	int reserved_eqs;
 	int max_eqs;
+	int num_sys_eqs;
 	int reserved_mtts;
 	int max_mrw_sz;
 	int reserved_mrws;
@@ -180,6 +181,7 @@ struct mlx4_init_hca_param {
 	u8  log_num_srqs;
 	u8  log_num_cqs;
 	u8  log_num_eqs;
+	u16 num_sys_eqs;
 	u8  log_rd_per_qp;
 	u8  log_mc_table_sz;
 	u8  log_mpt_sz;
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 43047b2a2aac..ebb279060a25 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -197,6 +197,29 @@ static void mlx4_set_port_mask(struct mlx4_dev *dev)
 		dev->caps.port_mask[i] = dev->caps.port_type[i];
 }
 
+enum {
+	MLX4_QUERY_FUNC_NUM_SYS_EQS = 1 << 0,
+};
+
+static int mlx4_query_func(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	int err = 0;
+	struct mlx4_func func;
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
+		err = mlx4_QUERY_FUNC(dev, &func, 0);
+		if (err) {
+			mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+			return err;
+		}
+		dev_cap->max_eqs = func.max_eq;
+		dev_cap->reserved_eqs = func.rsvd_eqs;
+		dev_cap->reserved_uars = func.rsvd_uars;
+		err |= MLX4_QUERY_FUNC_NUM_SYS_EQS;
+	}
+	return err;
+}
+
 static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev)
 {
 	struct mlx4_caps *dev_cap = &dev->caps;
@@ -261,7 +284,10 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	}
 
 	dev->caps.num_ports	     = dev_cap->num_ports;
-	dev->phys_caps.num_phys_eqs  = MLX4_MAX_EQ_NUM;
+	dev->caps.num_sys_eqs = dev_cap->num_sys_eqs;
+	dev->phys_caps.num_phys_eqs = dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS ?
+				      dev->caps.num_sys_eqs :
+				      MLX4_MAX_EQ_NUM;
 	for (i = 1; i <= dev->caps.num_ports; ++i) {
 		dev->caps.vl_cap[i]	    = dev_cap->max_vl[i];
 		dev->caps.ib_mtu_cap[i]	    = dev_cap->ib_mtu[i];
@@ -1130,8 +1156,7 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
 	if (err)
 		goto err_srq;
 
-	num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs :
-		  dev->caps.num_eqs;
+	num_eqs = dev->phys_caps.num_phys_eqs;
 	err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
 				  cmpt_base +
 				  ((u64) (MLX4_CMPT_TYPE_EQ *
@@ -1193,8 +1218,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
 	}
 
 
-	num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs :
-		   dev->caps.num_eqs;
+	num_eqs = dev->phys_caps.num_phys_eqs;
 	err = mlx4_init_icm_table(dev, &priv->eq_table.table,
 				  init_hca->eqc_base, dev_cap->eqc_entry_sz,
 				  num_eqs, num_eqs, 0, 0);
@@ -1719,6 +1743,19 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 			mlx4_err(dev, "INIT_HCA command failed, aborting\n");
 			goto err_free_icm;
 		}
+
+		if (dev_cap.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
+			err = mlx4_query_func(dev, &dev_cap);
+			if (err < 0) {
+				mlx4_err(dev, "QUERY_FUNC command failed, aborting.\n");
+				goto err_stop_fw;
+			} else if (err & MLX4_QUERY_FUNC_NUM_SYS_EQS) {
+				dev->caps.num_eqs = dev_cap.max_eqs;
+				dev->caps.reserved_eqs = dev_cap.reserved_eqs;
+				dev->caps.reserved_uars = dev_cap.reserved_uars;
+			}
+		}
+
 		/*
 		 * If TS is supported by FW
 		 * read HCA frequency by QUERY_HCA command
@@ -2085,12 +2122,11 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct msix_entry *entries;
-	int nreq = min_t(int, dev->caps.num_ports *
-			 min_t(int, num_online_cpus() + 1,
-			       MAX_MSIX_P_PORT) + MSIX_LEGACY_SZ, MAX_MSIX);
 	int i;
 
 	if (msi_x) {
+		int nreq = dev->caps.num_ports * num_online_cpus() + MSIX_LEGACY_SZ;
+
 		nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs,
 			     nreq);
 
@@ -2345,6 +2381,7 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
 	int err;
 	int port;
 	int i;
+	struct mlx4_dev_cap *dev_cap = NULL;
 	int existing_vfs = 0;
 
 	dev = &priv->dev;
@@ -2381,15 +2418,6 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
 			}
 		}
 
-		if (total_vfs) {
-			existing_vfs = pci_num_vf(pdev);
-			dev->flags = MLX4_FLAG_MASTER;
-			dev->flags = mlx4_enable_sriov(dev, pdev, total_vfs,
-						       existing_vfs);
-			if (!SRIOV_VALID_STATE(dev->flags))
-				goto err_sriov;
-		}
-
 		atomic_set(&priv->opreq_count, 0);
 		INIT_WORK(&priv->opreq_task, mlx4_opreq_action);
 
@@ -2403,6 +2431,12 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
 			mlx4_err(dev, "Failed to reset HCA, aborting\n");
 			goto err_sriov;
 		}
+
+		if (total_vfs) {
+			existing_vfs = pci_num_vf(pdev);
+			dev->flags = MLX4_FLAG_MASTER;
+			dev->num_vfs = total_vfs;
+		}
 	}
 
 slave_start:
@@ -2416,9 +2450,10 @@ slave_start:
 	 * before posting commands. Also, init num_slaves before calling
 	 * mlx4_init_hca */
 	if (mlx4_is_mfunc(dev)) {
-		if (mlx4_is_master(dev))
+		if (mlx4_is_master(dev)) {
 			dev->num_slaves = MLX4_MAX_NUM_SLAVES;
-		else {
+
+		} else {
 			dev->num_slaves = 0;
 			err = mlx4_multi_func_init(dev);
 			if (err) {
@@ -2434,6 +2469,52 @@ slave_start:
 		goto err_mfunc;
 	}
 
+	if (mlx4_is_master(dev)) {
+		if (!dev_cap) {
+			dev_cap = kzalloc(sizeof(*dev_cap), GFP_KERNEL);
+
+			if (!dev_cap) {
+				err = -ENOMEM;
+				goto err_fw;
+			}
+
+			err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+			if (err) {
+				mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+				goto err_fw;
+			}
+
+			if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
+				u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs,
+								  existing_vfs);
+
+				mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
+				dev->flags = dev_flags;
+				if (!SRIOV_VALID_STATE(dev->flags)) {
+					mlx4_err(dev, "Invalid SRIOV state\n");
+					goto err_sriov;
+				}
+				err = mlx4_reset(dev);
+				if (err) {
+					mlx4_err(dev, "Failed to reset HCA, aborting.\n");
+					goto err_sriov;
+				}
+				goto slave_start;
+			}
+		} else {
+			/* Legacy mode FW requires SRIOV to be enabled before
+			 * doing QUERY_DEV_CAP, since max_eq's value is different if
+			 * SRIOV is enabled.
+			 */
+			memset(dev_cap, 0, sizeof(*dev_cap));
+			err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+			if (err) {
+				mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+				goto err_fw;
+			}
+		}
+	}
+
 	err = mlx4_init_hca(dev);
 	if (err) {
 		if (err == -EACCES) {
@@ -2457,6 +2538,30 @@ slave_start:
 			goto err_fw;
 	}
 
+	if (mlx4_is_master(dev) && (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
+		u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, existing_vfs);
+
+		if ((dev->flags ^ dev_flags) & (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE)) {
+			mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_VHCR);
+			dev->flags = dev_flags;
+			err = mlx4_cmd_init(dev);
+			if (err) {
+				/* Only VHCR is cleaned up, so could still
+				 * send FW commands
+				 */
+				mlx4_err(dev, "Failed to init VHCR command interface, aborting\n");
+				goto err_close;
+			}
+		} else {
+			dev->flags = dev_flags;
+		}
+
+		if (!SRIOV_VALID_STATE(dev->flags)) {
+			mlx4_err(dev, "Invalid SRIOV state\n");
+			goto err_close;
+		}
+	}
+
 	/* check if the device is functioning at its maximum possible speed.
 	 * No return code for this call, just warn the user in case of PCI
 	 * express device capabilities are under-satisfied by the bus.
@@ -2631,6 +2736,7 @@ err_sriov:
 	if (!mlx4_is_slave(dev))
 		mlx4_free_ownership(dev);
 
+	kfree(dev_cap);
 	return err;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/profile.c b/drivers/net/ethernet/mellanox/mlx4/profile.c
index 14089d9e1667..2bf437aafc53 100644
--- a/drivers/net/ethernet/mellanox/mlx4/profile.c
+++ b/drivers/net/ethernet/mellanox/mlx4/profile.c
@@ -126,8 +126,7 @@ u64 mlx4_make_profile(struct mlx4_dev *dev,
 	profile[MLX4_RES_AUXC].num    = request->num_qp;
 	profile[MLX4_RES_SRQ].num     = request->num_srq;
 	profile[MLX4_RES_CQ].num      = request->num_cq;
-	profile[MLX4_RES_EQ].num      = mlx4_is_mfunc(dev) ?
-					dev->phys_caps.num_phys_eqs :
+	profile[MLX4_RES_EQ].num = mlx4_is_mfunc(dev) ? dev->phys_caps.num_phys_eqs :
 					min_t(unsigned, dev_cap->max_eqs, MAX_MSIX);
 	profile[MLX4_RES_DMPT].num    = request->num_mpt;
 	profile[MLX4_RES_CMPT].num    = MLX4_NUM_CMPTS;
@@ -216,10 +215,18 @@ u64 mlx4_make_profile(struct mlx4_dev *dev,
 			init_hca->log_num_cqs = profile[i].log_num;
 			break;
 		case MLX4_RES_EQ:
-			dev->caps.num_eqs     = roundup_pow_of_two(min_t(unsigned, dev_cap->max_eqs,
-									 MAX_MSIX));
-			init_hca->eqc_base    = profile[i].start;
-			init_hca->log_num_eqs = ilog2(dev->caps.num_eqs);
+			if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
+				init_hca->log_num_eqs = 0x1f;
+				init_hca->eqc_base    = profile[i].start;
+				init_hca->num_sys_eqs = dev_cap->num_sys_eqs;
+			} else {
+				dev->caps.num_eqs     = roundup_pow_of_two(
+								min_t(unsigned,
+								      dev_cap->max_eqs,
+								      MAX_MSIX));
+				init_hca->eqc_base    = profile[i].start;
+				init_hca->log_num_eqs = ilog2(dev->caps.num_eqs);
+			}
 			break;
 		case MLX4_RES_DMPT:
 			dev->caps.num_mpts	= profile[i].num;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 3d9bff00f24a..1c560eb870ad 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -189,7 +189,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_EQE_STRIDE		= 1LL <<  13,
 	MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL        = 1LL <<  14,
 	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15,
-	MLX4_DEV_CAP_FLAG2_CONFIG_DEV		= 1LL <<  16
+	MLX4_DEV_CAP_FLAG2_CONFIG_DEV		= 1LL <<  16,
+	MLX4_DEV_CAP_FLAG2_SYS_EQS		= 1LL <<  17
 };
 
 enum {
@@ -443,6 +444,7 @@ struct mlx4_caps {
 	int			num_cqs;
 	int			max_cqes;
 	int			reserved_cqs;
+	int			num_sys_eqs;
 	int			num_eqs;
 	int			reserved_eqs;
 	int			num_comp_vectors;
-- 
cgit v1.2.3-58-ga151


From de966c5928026b100a989c8cef761d306310a184 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 13 Nov 2014 14:45:33 +0200
Subject: net/mlx4_core: Support more than 64 VFs

We now allow up to 126 VFs. Note though that certain firmware
versions only allow up to 80 VFs. Moreover, old HCAs only support 64 VFs.
In these cases, we limit the maximum number of VFs to 64.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c   |  5 ++++-
 drivers/net/ethernet/mellanox/mlx4/main.c | 24 ++++++++++++++++++++++++
 include/linux/mlx4/device.h               |  5 +++--
 3 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index d2f594fadfbf..4251f81a0275 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -143,7 +143,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[14] = "Ethernet protocol control support",
 		[15] = "Ethernet Backplane autoneg support",
 		[16] = "CONFIG DEV support",
-		[17] = "Asymmetric EQs support"
+		[17] = "Asymmetric EQs support",
+		[18] = "More than 80 VFs support"
 	};
 	int i;
 
@@ -860,6 +861,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VLAN_CONTROL;
 	if (field32 & (1 << 20))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM;
+	if (field32 & (1 << 21))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_80_VFS;
 
 	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
 		for (i = 1; i <= dev_cap->num_ports; ++i) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index ebb279060a25..3044f9e623cb 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -2373,6 +2373,24 @@ disable_sriov:
 	return dev_flags & ~MLX4_FLAG_MASTER;
 }
 
+enum {
+	MLX4_DEV_CAP_CHECK_NUM_VFS_ABOVE_64 = -1,
+};
+
+static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
+			      int *nvfs)
+{
+	int requested_vfs = nvfs[0] + nvfs[1] + nvfs[2];
+	/* Checking for 64 VFs as a limitation of CX2 */
+	if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_80_VFS) &&
+	    requested_vfs >= 64) {
+		mlx4_err(dev, "Requested %d VFs, but FW does not support more than 64\n",
+			 requested_vfs);
+		return MLX4_DEV_CAP_CHECK_NUM_VFS_ABOVE_64;
+	}
+	return 0;
+}
+
 static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
 			 int total_vfs, int *nvfs, struct mlx4_priv *priv)
 {
@@ -2484,6 +2502,9 @@ slave_start:
 				goto err_fw;
 			}
 
+			if (mlx4_check_dev_cap(dev, dev_cap, nvfs))
+				goto err_fw;
+
 			if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
 				u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs,
 								  existing_vfs);
@@ -2512,6 +2533,9 @@ slave_start:
 				mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
 				goto err_fw;
 			}
+
+			if (mlx4_check_dev_cap(dev, dev_cap, nvfs))
+				goto err_fw;
 		}
 	}
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 1c560eb870ad..cf09e65c2901 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -95,7 +95,7 @@ enum {
 
 enum {
 	MLX4_MAX_NUM_PF		= 16,
-	MLX4_MAX_NUM_VF		= 64,
+	MLX4_MAX_NUM_VF		= 126,
 	MLX4_MAX_NUM_VF_P_PORT  = 64,
 	MLX4_MFUNC_MAX		= 80,
 	MLX4_MAX_EQ_NUM		= 1024,
@@ -190,7 +190,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL        = 1LL <<  14,
 	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15,
 	MLX4_DEV_CAP_FLAG2_CONFIG_DEV		= 1LL <<  16,
-	MLX4_DEV_CAP_FLAG2_SYS_EQS		= 1LL <<  17
+	MLX4_DEV_CAP_FLAG2_SYS_EQS		= 1LL <<  17,
+	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18
 };
 
 enum {
-- 
cgit v1.2.3-58-ga151


From 6eba82248ef47fd478f940a418429e3ec95cb3db Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 13 Nov 2014 13:45:46 +0100
Subject: rhashtable: Drop gfp_flags arg in insert/remove functions

Reallocation is only required for shrinking and expanding and both rely
on a mutex for synchronization and callers of rhashtable_init() are in
non atomic context. Therefore, no reason to continue passing allocation
hints through the API.

Instead, use GFP_KERNEL and add __GFP_NOWARN | __GFP_NORETRY to allow
for silent fall back to vzalloc() without the OOM killer jumping in as
pointed out by Eric Dumazet and Eric W. Biederman.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 10 +++++-----
 lib/rhashtable.c           | 41 +++++++++++++++++------------------------
 net/netfilter/nft_hash.c   |  4 ++--
 net/netlink/af_netlink.c   |  4 ++--
 4 files changed, 26 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 473e26bdb91d..b93fd89b2e5e 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -99,16 +99,16 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params);
 u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len);
 u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr);
 
-void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node, gfp_t);
-bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node, gfp_t);
+void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node);
+bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node);
 void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
-			     struct rhash_head __rcu **pprev, gfp_t flags);
+			     struct rhash_head __rcu **pprev);
 
 bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size);
 bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size);
 
-int rhashtable_expand(struct rhashtable *ht, gfp_t flags);
-int rhashtable_shrink(struct rhashtable *ht, gfp_t flags);
+int rhashtable_expand(struct rhashtable *ht);
+int rhashtable_shrink(struct rhashtable *ht);
 
 void *rhashtable_lookup(const struct rhashtable *ht, const void *key);
 void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 4b4b53bfa08b..25e4c213b08a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -107,13 +107,13 @@ static u32 head_hashfn(const struct rhashtable *ht,
 	return obj_hashfn(ht, rht_obj(ht, he), hsize);
 }
 
-static struct bucket_table *bucket_table_alloc(size_t nbuckets, gfp_t flags)
+static struct bucket_table *bucket_table_alloc(size_t nbuckets)
 {
 	struct bucket_table *tbl;
 	size_t size;
 
 	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
-	tbl = kzalloc(size, flags);
+	tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
 	if (tbl == NULL)
 		tbl = vzalloc(size);
 
@@ -200,7 +200,6 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
 /**
  * rhashtable_expand - Expand hash table while allowing concurrent lookups
  * @ht:		the hash table to expand
- * @flags:	allocation flags
  *
  * A secondary bucket array is allocated and the hash entries are migrated
  * while keeping them on both lists until the end of the RCU grace period.
@@ -211,7 +210,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
  * The caller must ensure that no concurrent table mutations take place.
  * It is however valid to have concurrent lookups if they are RCU protected.
  */
-int rhashtable_expand(struct rhashtable *ht, gfp_t flags)
+int rhashtable_expand(struct rhashtable *ht)
 {
 	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 	struct rhash_head *he;
@@ -223,7 +222,7 @@ int rhashtable_expand(struct rhashtable *ht, gfp_t flags)
 	if (ht->p.max_shift && ht->shift >= ht->p.max_shift)
 		return 0;
 
-	new_tbl = bucket_table_alloc(old_tbl->size * 2, flags);
+	new_tbl = bucket_table_alloc(old_tbl->size * 2);
 	if (new_tbl == NULL)
 		return -ENOMEM;
 
@@ -281,7 +280,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
 /**
  * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
  * @ht:		the hash table to shrink
- * @flags:	allocation flags
  *
  * This function may only be called in a context where it is safe to call
  * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
@@ -289,7 +287,7 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
  * The caller must ensure that no concurrent table mutations take place.
  * It is however valid to have concurrent lookups if they are RCU protected.
  */
-int rhashtable_shrink(struct rhashtable *ht, gfp_t flags)
+int rhashtable_shrink(struct rhashtable *ht)
 {
 	struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht);
 	struct rhash_head __rcu **pprev;
@@ -300,7 +298,7 @@ int rhashtable_shrink(struct rhashtable *ht, gfp_t flags)
 	if (ht->shift <= ht->p.min_shift)
 		return 0;
 
-	ntbl = bucket_table_alloc(tbl->size / 2, flags);
+	ntbl = bucket_table_alloc(tbl->size / 2);
 	if (ntbl == NULL)
 		return -ENOMEM;
 
@@ -341,7 +339,6 @@ EXPORT_SYMBOL_GPL(rhashtable_shrink);
  * rhashtable_insert - insert object into hash hash table
  * @ht:		hash table
  * @obj:	pointer to hash head inside object
- * @flags:	allocation flags (table expansion)
  *
  * Will automatically grow the table via rhashtable_expand() if the the
  * grow_decision function specified at rhashtable_init() returns true.
@@ -349,8 +346,7 @@ EXPORT_SYMBOL_GPL(rhashtable_shrink);
  * The caller must ensure that no concurrent table mutations occur. It is
  * however valid to have concurrent lookups if they are RCU protected.
  */
-void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
-		       gfp_t flags)
+void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
 	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
 	u32 hash;
@@ -363,7 +359,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
 	ht->nelems++;
 
 	if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
-		rhashtable_expand(ht, flags);
+		rhashtable_expand(ht);
 }
 EXPORT_SYMBOL_GPL(rhashtable_insert);
 
@@ -372,14 +368,13 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  * @ht:		hash table
  * @obj:	pointer to hash head inside object
  * @pprev:	pointer to previous element
- * @flags:	allocation flags (table expansion)
  *
  * Identical to rhashtable_remove() but caller is alreayd aware of the element
  * in front of the element to be deleted. This is in particular useful for
  * deletion when combined with walking or lookup.
  */
 void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
-			     struct rhash_head __rcu **pprev, gfp_t flags)
+			     struct rhash_head __rcu **pprev)
 {
 	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
 
@@ -390,7 +385,7 @@ void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
 
 	if (ht->p.shrink_decision &&
 	    ht->p.shrink_decision(ht, tbl->size))
-		rhashtable_shrink(ht, flags);
+		rhashtable_shrink(ht);
 }
 EXPORT_SYMBOL_GPL(rhashtable_remove_pprev);
 
@@ -398,7 +393,6 @@ EXPORT_SYMBOL_GPL(rhashtable_remove_pprev);
  * rhashtable_remove - remove object from hash table
  * @ht:		hash table
  * @obj:	pointer to hash head inside object
- * @flags:	allocation flags (table expansion)
  *
  * Since the hash chain is single linked, the removal operation needs to
  * walk the bucket chain upon removal. The removal operation is thus
@@ -410,8 +404,7 @@ EXPORT_SYMBOL_GPL(rhashtable_remove_pprev);
  * The caller must ensure that no concurrent table mutations occur. It is
  * however valid to have concurrent lookups if they are RCU protected.
  */
-bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj,
-		       gfp_t flags)
+bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 {
 	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
 	struct rhash_head __rcu **pprev;
@@ -429,7 +422,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj,
 			continue;
 		}
 
-		rhashtable_remove_pprev(ht, he, pprev, flags);
+		rhashtable_remove_pprev(ht, he, pprev);
 		return true;
 	}
 
@@ -576,7 +569,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
 	if (params->nelem_hint)
 		size = rounded_hashtable_size(params);
 
-	tbl = bucket_table_alloc(size, GFP_KERNEL);
+	tbl = bucket_table_alloc(size);
 	if (tbl == NULL)
 		return -ENOMEM;
 
@@ -713,7 +706,7 @@ static int __init test_rhashtable(struct rhashtable *ht)
 		obj->ptr = TEST_PTR;
 		obj->value = i * 2;
 
-		rhashtable_insert(ht, &obj->node, GFP_KERNEL);
+		rhashtable_insert(ht, &obj->node);
 	}
 
 	rcu_read_lock();
@@ -724,7 +717,7 @@ static int __init test_rhashtable(struct rhashtable *ht)
 
 	for (i = 0; i < TEST_NEXPANDS; i++) {
 		pr_info("  Table expansion iteration %u...\n", i);
-		rhashtable_expand(ht, GFP_KERNEL);
+		rhashtable_expand(ht);
 
 		rcu_read_lock();
 		pr_info("  Verifying lookups...\n");
@@ -734,7 +727,7 @@ static int __init test_rhashtable(struct rhashtable *ht)
 
 	for (i = 0; i < TEST_NEXPANDS; i++) {
 		pr_info("  Table shrinkage iteration %u...\n", i);
-		rhashtable_shrink(ht, GFP_KERNEL);
+		rhashtable_shrink(ht);
 
 		rcu_read_lock();
 		pr_info("  Verifying lookups...\n");
@@ -749,7 +742,7 @@ static int __init test_rhashtable(struct rhashtable *ht)
 		obj = rhashtable_lookup(ht, &key);
 		BUG_ON(!obj);
 
-		rhashtable_remove(ht, &obj->node, GFP_KERNEL);
+		rhashtable_remove(ht, &obj->node);
 		kfree(obj);
 	}
 
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3f75aaaf9d06..1e316ce4cb5d 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -65,7 +65,7 @@ static int nft_hash_insert(const struct nft_set *set,
 	if (set->flags & NFT_SET_MAP)
 		nft_data_copy(he->data, &elem->data);
 
-	rhashtable_insert(priv, &he->node, GFP_KERNEL);
+	rhashtable_insert(priv, &he->node);
 
 	return 0;
 }
@@ -88,7 +88,7 @@ static void nft_hash_remove(const struct nft_set *set,
 	pprev = elem->cookie;
 	he = rht_dereference((*pprev), priv);
 
-	rhashtable_remove_pprev(priv, he, pprev, GFP_KERNEL);
+	rhashtable_remove_pprev(priv, he, pprev);
 
 	synchronize_rcu();
 	kfree(he);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 9e0628cfdf67..a491c1a4861f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1092,7 +1092,7 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
 
 	nlk_sk(sk)->portid = portid;
 	sock_hold(sk);
-	rhashtable_insert(&table->hash, &nlk_sk(sk)->node, GFP_KERNEL);
+	rhashtable_insert(&table->hash, &nlk_sk(sk)->node);
 	err = 0;
 err:
 	mutex_unlock(&nl_sk_hash_lock);
@@ -1105,7 +1105,7 @@ static void netlink_remove(struct sock *sk)
 
 	mutex_lock(&nl_sk_hash_lock);
 	table = &nl_table[sk->sk_protocol];
-	if (rhashtable_remove(&table->hash, &nlk_sk(sk)->node, GFP_KERNEL)) {
+	if (rhashtable_remove(&table->hash, &nlk_sk(sk)->node)) {
 		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
 		__sock_put(sk);
 	}
-- 
cgit v1.2.3-58-ga151


From a77f9c5dcdf8480a93332792c336fa2bf9d31229 Mon Sep 17 00:00:00 2001
From: Jay Vosburgh <jay.vosburgh@canonical.com>
Date: Fri, 14 Nov 2014 11:05:06 -0800
Subject: Revert "fast_hash: avoid indirect function calls"

This reverts commit e5a2c899957659cd1a9f789bc462f9c0b35f5150.

	Commit e5a2c899 introduced an alternative_call, arch_fast_hash2,
that selects between __jhash2 and __intel_crc4_2_hash based on the
X86_FEATURE_XMM4_2.

	Unfortunately, the alternative_call system does not appear to be
suitable for use with C functions, as register usage is not handled
properly for the called functions.  The __jhash2 function in particular
clobbers registers that are not preserved when called via
alternative_call, resulting in a panic for direct callers of
arch_fast_hash2 on older CPUs lacking sse4_2.  It is possible that
__intel_crc4_2_hash works merely by chance because it uses fewer
registers.

	This commit was suggested as the source of the problem by Jesse
Gross <jesse@nicira.com>.

Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/include/asm/hash.h | 51 +++++----------------------------------------
 arch/x86/lib/hash.c         | 29 +++++++++++---------------
 include/asm-generic/hash.h  | 36 ++------------------------------
 include/linux/hash.h        | 34 ++++++++++++++++++++++++++++++
 lib/Makefile                |  2 +-
 lib/hash.c                  | 39 ++++++++++++++++++++++++++++++++++
 6 files changed, 93 insertions(+), 98 deletions(-)
 create mode 100644 lib/hash.c

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
index a881d784f044..e8c58f88b1d4 100644
--- a/arch/x86/include/asm/hash.h
+++ b/arch/x86/include/asm/hash.h
@@ -1,48 +1,7 @@
-#ifndef __ASM_X86_HASH_H
-#define __ASM_X86_HASH_H
+#ifndef _ASM_X86_HASH_H
+#define _ASM_X86_HASH_H
 
-#include <linux/cpufeature.h>
-#include <asm/alternative.h>
+struct fast_hash_ops;
+extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
 
-u32 __intel_crc4_2_hash(const void *data, u32 len, u32 seed);
-u32 __intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed);
-
-/*
- * non-inline versions of jhash so gcc does not need to generate
- * duplicate code in every object file
- */
-u32 __jhash(const void *data, u32 len, u32 seed);
-u32 __jhash2(const u32 *data, u32 len, u32 seed);
-
-/*
- * for documentation of these functions please look into
- * <include/asm-generic/hash.h>
- */
-
-static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
-{
-	u32 hash;
-
-	alternative_call(__jhash, __intel_crc4_2_hash, X86_FEATURE_XMM4_2,
-#ifdef CONFIG_X86_64
-			 "=a" (hash), "D" (data), "S" (len), "d" (seed));
-#else
-			 "=a" (hash), "a" (data), "d" (len), "c" (seed));
-#endif
-	return hash;
-}
-
-static inline u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
-{
-	u32 hash;
-
-	alternative_call(__jhash2, __intel_crc4_2_hash2, X86_FEATURE_XMM4_2,
-#ifdef CONFIG_X86_64
-			 "=a" (hash), "D" (data), "S" (len), "d" (seed));
-#else
-			 "=a" (hash), "a" (data), "d" (len), "c" (seed));
-#endif
-	return hash;
-}
-
-#endif /* __ASM_X86_HASH_H */
+#endif /* _ASM_X86_HASH_H */
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
index e14327198835..ff4fa51a5b1f 100644
--- a/arch/x86/lib/hash.c
+++ b/arch/x86/lib/hash.c
@@ -31,13 +31,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/hash.h>
+#include <linux/init.h>
+
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
 #include <asm/hash.h>
 
-#include <linux/hash.h>
-#include <linux/jhash.h>
-
 static inline u32 crc32_u32(u32 crc, u32 val)
 {
 #ifdef CONFIG_AS_CRC32
@@ -48,7 +48,7 @@ static inline u32 crc32_u32(u32 crc, u32 val)
 	return crc;
 }
 
-u32 __intel_crc4_2_hash(const void *data, u32 len, u32 seed)
+static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
 {
 	const u32 *p32 = (const u32 *) data;
 	u32 i, tmp = 0;
@@ -71,27 +71,22 @@ u32 __intel_crc4_2_hash(const void *data, u32 len, u32 seed)
 
 	return seed;
 }
-EXPORT_SYMBOL(__intel_crc4_2_hash);
 
-u32 __intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
+static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
 {
+	const u32 *p32 = (const u32 *) data;
 	u32 i;
 
 	for (i = 0; i < len; i++)
-		seed = crc32_u32(seed, *data++);
+		seed = crc32_u32(seed, *p32++);
 
 	return seed;
 }
-EXPORT_SYMBOL(__intel_crc4_2_hash2);
 
-u32 __jhash(const void *data, u32 len, u32 seed)
+void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
 {
-	return jhash(data, len, seed);
-}
-EXPORT_SYMBOL(__jhash);
-
-u32 __jhash2(const u32 *data, u32 len, u32 seed)
-{
-	return jhash2(data, len, seed);
+	if (cpu_has_xmm4_2) {
+		ops->hash  = intel_crc4_2_hash;
+		ops->hash2 = intel_crc4_2_hash2;
+	}
 }
-EXPORT_SYMBOL(__jhash2);
diff --git a/include/asm-generic/hash.h b/include/asm-generic/hash.h
index 3c82760ff2a4..b6312843dbd9 100644
--- a/include/asm-generic/hash.h
+++ b/include/asm-generic/hash.h
@@ -1,41 +1,9 @@
 #ifndef __ASM_GENERIC_HASH_H
 #define __ASM_GENERIC_HASH_H
 
-#include <linux/jhash.h>
-
-/**
- *	arch_fast_hash - Caclulates a hash over a given buffer that can have
- *			 arbitrary size. This function will eventually use an
- *			 architecture-optimized hashing implementation if
- *			 available, and trades off distribution for speed.
- *
- *	@data: buffer to hash
- *	@len: length of buffer in bytes
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
-{
-	return jhash(data, len, seed);
-}
-
-/**
- *	arch_fast_hash2 - Caclulates a hash over a given buffer that has a
- *			  size that is of a multiple of 32bit words. This
- *			  function will eventually use an architecture-
- *			  optimized hashing implementation if available,
- *			  and trades off distribution for speed.
- *
- *	@data: buffer to hash (must be 32bit padded)
- *	@len: number of 32bit words
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-static inline u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
+struct fast_hash_ops;
+static inline void setup_arch_fast_hash(struct fast_hash_ops *ops)
 {
-	return jhash2(data, len, seed);
 }
 
 #endif /* __ASM_GENERIC_HASH_H */
diff --git a/include/linux/hash.h b/include/linux/hash.h
index 6e8fb028848c..d0494c399392 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -84,4 +84,38 @@ static inline u32 hash32_ptr(const void *ptr)
 	return (u32)val;
 }
 
+struct fast_hash_ops {
+	u32 (*hash)(const void *data, u32 len, u32 seed);
+	u32 (*hash2)(const u32 *data, u32 len, u32 seed);
+};
+
+/**
+ *	arch_fast_hash - Caclulates a hash over a given buffer that can have
+ *			 arbitrary size. This function will eventually use an
+ *			 architecture-optimized hashing implementation if
+ *			 available, and trades off distribution for speed.
+ *
+ *	@data: buffer to hash
+ *	@len: length of buffer in bytes
+ *	@seed: start seed
+ *
+ *	Returns 32bit hash.
+ */
+extern u32 arch_fast_hash(const void *data, u32 len, u32 seed);
+
+/**
+ *	arch_fast_hash2 - Caclulates a hash over a given buffer that has a
+ *			  size that is of a multiple of 32bit words. This
+ *			  function will eventually use an architecture-
+ *			  optimized hashing implementation if available,
+ *			  and trades off distribution for speed.
+ *
+ *	@data: buffer to hash (must be 32bit padded)
+ *	@len: number of 32bit words
+ *	@seed: start seed
+ *
+ *	Returns 32bit hash.
+ */
+extern u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed);
+
 #endif /* _LINUX_HASH_H */
diff --git a/lib/Makefile b/lib/Makefile
index 04e53dd16070..7512dc978f18 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o percpu_ida.o rhashtable.o
+	 percpu-refcount.o percpu_ida.o hash.o rhashtable.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
diff --git a/lib/hash.c b/lib/hash.c
new file mode 100644
index 000000000000..fea973f4bd57
--- /dev/null
+++ b/lib/hash.c
@@ -0,0 +1,39 @@
+/* General purpose hashing library
+ *
+ * That's a start of a kernel hashing library, which can be extended
+ * with further algorithms in future. arch_fast_hash{2,}() will
+ * eventually resolve to an architecture optimized implementation.
+ *
+ * Copyright 2013 Francesco Fusco <ffusco@redhat.com>
+ * Copyright 2013 Daniel Borkmann <dborkman@redhat.com>
+ * Copyright 2013 Thomas Graf <tgraf@redhat.com>
+ * Licensed under the GNU General Public License, version 2.0 (GPLv2)
+ */
+
+#include <linux/jhash.h>
+#include <linux/hash.h>
+#include <linux/cache.h>
+
+static struct fast_hash_ops arch_hash_ops __read_mostly = {
+	.hash  = jhash,
+	.hash2 = jhash2,
+};
+
+u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	return arch_hash_ops.hash(data, len, seed);
+}
+EXPORT_SYMBOL_GPL(arch_fast_hash);
+
+u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
+{
+	return arch_hash_ops.hash2(data, len, seed);
+}
+EXPORT_SYMBOL_GPL(arch_fast_hash2);
+
+static int __init hashlib_init(void)
+{
+	setup_arch_fast_hash(&arch_hash_ops);
+	return 0;
+}
+early_initcall(hashlib_init);
-- 
cgit v1.2.3-58-ga151


From 960fb622f85180f36d3aff82af53e2be3db2f888 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 16 Nov 2014 06:23:05 -0800
Subject: net: provide a per host RSS key generic infrastructure

RSS (Receive Side Scaling) typically uses Toeplitz hash and a 40 or 52 bytes
RSS key.

Some drivers use a constant (and well known key), some drivers use a random
key per port, making bonding setups hard to tune. Well known keys increase
attack surface, considering that number of queues is usually a power of two.

This patch provides infrastructure to help drivers doing the right thing.

netdev_rss_key_fill() should be used by drivers to initialize their RSS key,
even if they provide ethtool -X support to let user redefine the key later.

A new /proc/sys/net/core/netdev_rss_key file can be used to get the host
RSS key even for drivers not providing ethtool -x support, in case some
applications want to precisely setup flows to match some RX queues.

Tested:

myhost:~# cat /proc/sys/net/core/netdev_rss_key
11:63:99:bb:79:fb:a5:a7:07:45:b2:20:bf:02:42:2d:08:1a:dd:19:2b:6b:23:ac:56:28:9d:70:c3:ac:e8:16:4b:b7:c1:10:53:a4:78:41:36:40:74:b6:15:ca:27:44:aa:b3:4d:72

myhost:~# ethtool -x eth0
RX flow hash indirection table for eth0 with 8 RX ring(s):
    0:      0     1     2     3     4     5     6     7
RSS hash key:
11:63:99:bb:79:fb:a5:a7:07:45:b2:20:bf:02:42:2d:08:1a:dd:19:2b:6b:23:ac:56:28:9d:70:c3:ac:e8:16:4b:b7:c1:10:53:a4:78:41

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 22 ++++++++++++++++++++++
 include/linux/netdevice.h    |  6 ++++++
 net/core/ethtool.c           | 11 +++++++++++
 net/core/sysctl_net_core.c   | 19 +++++++++++++++++++
 4 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index e26c607468a6..666594b43cff 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -142,6 +142,28 @@ netdev_max_backlog
 Maximum number  of  packets,  queued  on  the  INPUT  side, when the interface
 receives packets faster than kernel can process them.
 
+netdev_rss_key
+--------------
+
+RSS (Receive Side Scaling) enabled drivers use a 40 bytes host key that is
+randomly generated.
+Some user space might need to gather its content even if drivers do not
+provide ethtool -x support yet.
+
+myhost:~# cat /proc/sys/net/core/netdev_rss_key
+84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8: ... (52 bytes total)
+
+File contains nul bytes if no driver ever called netdev_rss_key_fill() function.
+Note:
+/proc/sys/net/core/netdev_rss_key contains 52 bytes of key,
+but most drivers only use 40 bytes of it.
+
+myhost:~# ethtool -x eth0
+RX flow hash indirection table for eth0 with 8 RX ring(s):
+    0:    0     1     2     3     4     5     6     7
+RSS hash key:
+84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8:43:e3:c9:0c:fd:17:55:c2:3a:4d:69:ed:f1:42:89
+
 netdev_tstamp_prequeue
 ----------------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4a6f770377d3..db63cf459ba1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3422,6 +3422,12 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
 void *netdev_lower_dev_get_private(struct net_device *dev,
 				   struct net_device *lower_dev);
+
+/* RSS keys are 40 or 52 bytes long */
+#define NETDEV_RSS_KEY_LEN 52
+extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+void netdev_rss_key_fill(void *buffer, size_t len);
+
 int dev_get_nest_level(struct net_device *dev,
 		       bool (*type_check)(struct net_device *dev));
 int skb_checksum_help(struct sk_buff *skb);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index b0f84f5ddda8..715f51f321e9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/rtnetlink.h>
 #include <linux/sched.h>
+#include <linux/net.h>
 
 /*
  * Some useful ethtool_ops methods that're device independent.
@@ -573,6 +574,16 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
 	return 0;
 }
 
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+
+void netdev_rss_key_fill(void *buffer, size_t len)
+{
+	BUG_ON(len > sizeof(netdev_rss_key));
+	net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key));
+	memcpy(buffer, netdev_rss_key, len);
+}
+EXPORT_SYMBOL(netdev_rss_key_fill);
+
 static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
 						     void __user *useraddr)
 {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index f93f092fe226..31baba2a71ce 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -217,6 +217,18 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 }
 #endif
 
+static int proc_do_rss_key(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table fake_table;
+	char buf[NETDEV_RSS_KEY_LEN * 3];
+
+	snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
+	fake_table.data = buf;
+	fake_table.maxlen = sizeof(buf);
+	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
+}
+
 static struct ctl_table net_core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -265,6 +277,13 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "netdev_rss_key",
+		.data		= &netdev_rss_key,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= proc_do_rss_key,
+	},
 #ifdef CONFIG_BPF_JIT
 	{
 		.procname	= "bpf_jit_enable",
-- 
cgit v1.2.3-58-ga151


From b9d1ab7eb42ede51ffbb6cafffd0a521b30c12e1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 16 Nov 2014 06:23:16 -0800
Subject: mlx4: use netdev_rss_key_fill() helper

Use of well known RSS key increases attack surface.
Switch to a random one, using generic helper so that all
ports share a common key.

Also provide ethtool -x support to fetch RSS key

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 9 ++++++++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c      | 6 +-----
 include/linux/mlx4/qp.h                         | 4 +++-
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 6c643230a5ed..710cf309962a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -973,6 +973,11 @@ static u32 mlx4_en_get_rxfh_indir_size(struct net_device *dev)
 	return priv->rx_ring_num;
 }
 
+static u32 mlx4_en_get_rxfh_key_size(struct net_device *netdev)
+{
+	return MLX4_EN_RSS_KEY_SIZE;
+}
+
 static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -988,7 +993,8 @@ static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key)
 		ring_index[n] = rss_map->qps[n % rss_rings].qpn -
 			rss_map->base_qpn;
 	}
-
+	if (key)
+		netdev_rss_key_fill(key, MLX4_EN_RSS_KEY_SIZE);
 	return err;
 }
 
@@ -1799,6 +1805,7 @@ const struct ethtool_ops mlx4_en_ethtool_ops = {
 	.get_rxnfc = mlx4_en_get_rxnfc,
 	.set_rxnfc = mlx4_en_set_rxnfc,
 	.get_rxfh_indir_size = mlx4_en_get_rxfh_indir_size,
+	.get_rxfh_key_size = mlx4_en_get_rxfh_key_size,
 	.get_rxfh = mlx4_en_get_rxfh,
 	.set_rxfh = mlx4_en_set_rxfh,
 	.get_channels = mlx4_en_get_channels,
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index ccd95177ea7c..b7bda8956011 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1169,9 +1169,6 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
 	int i, qpn;
 	int err = 0;
 	int good_qps = 0;
-	static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC,
-				0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD,
-				0x593D56D9, 0xF3253C06, 0x2ADC1FFC};
 
 	en_dbg(DRV, priv, "Configuring rss steering\n");
 	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
@@ -1226,8 +1223,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
 
 	rss_context->flags = rss_mask;
 	rss_context->hash_fn = MLX4_RSS_HASH_TOP;
-	for (i = 0; i < 10; i++)
-		rss_context->rss_key[i] = cpu_to_be32(rsskey[i]);
+	netdev_rss_key_fill(rss_context->rss_key, MLX4_EN_RSS_KEY_SIZE);
 
 	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
 			       &rss_map->indir_qp, &rss_map->indir_state);
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 5f4e36cf0091..467ccdf94c98 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -120,13 +120,15 @@ enum {
 	MLX4_RSS_QPC_FLAG_OFFSET		= 13,
 };
 
+#define MLX4_EN_RSS_KEY_SIZE 40
+
 struct mlx4_rss_context {
 	__be32			base_qpn;
 	__be32			default_qpn;
 	u16			reserved;
 	u8			hash_fn;
 	u8			flags;
-	__be32			rss_key[10];
+	__be32			rss_key[MLX4_EN_RSS_KEY_SIZE / sizeof(__be32)];
 	__be32			base_qpn_udp;
 };
 
-- 
cgit v1.2.3-58-ga151


From cb41c8dd01d74d091618f72e28f0282f064a9f0a Mon Sep 17 00:00:00 2001
From: Alexander Aring <alex.aring@gmail.com>
Date: Mon, 17 Nov 2014 08:20:54 +0100
Subject: ieee802154: rename and move WPAN_NUM_ defines

This patch moves the 802.15.4 constraints WPAN_NUM_ defines into
"net/ieee802154.h" which should contain all necessary 802.15.4 related
information. Also rename these defines to a common name which is
IEEE802154_MAX_CHANNEL and IEEE802154_MAX_PAGE.

Signed-off-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h |  3 +++
 include/net/cfg802154.h    | 11 ++---------
 net/ieee802154/nl802154.c  |  4 ++--
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 5a40c0418438..6e82d888287c 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -39,6 +39,9 @@
 #define IEEE802154_LIFS_PERIOD		40
 #define IEEE802154_SIFS_PERIOD		12
 
+#define IEEE802154_MAX_CHANNEL		26
+#define IEEE802154_MAX_PAGE		31
+
 #define IEEE802154_FC_TYPE_BEACON	0x0	/* Frame is beacon */
 #define	IEEE802154_FC_TYPE_DATA		0x1	/* Frame is data */
 #define IEEE802154_FC_TYPE_ACK		0x2	/* Frame is acknowledgment */
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 507ac9d3d38a..228f1f7668f7 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -17,20 +17,13 @@
 #ifndef __NET_CFG802154_H
 #define __NET_CFG802154_H
 
+#include <linux/ieee802154.h>
 #include <linux/netdevice.h>
 #include <linux/mutex.h>
 #include <linux/bug.h>
 
 #include <net/nl802154.h>
 
-/* According to the IEEE 802.15.4 stadard the upper most significant bits of
- * the 32-bit channel bitmaps shall be used as an integer value to specify 32
- * possible channel pages. The lower 27 bits of the channel bit map shall be
- * used as a bit mask to specify channel numbers within a channel page.
- */
-#define WPAN_NUM_CHANNELS	27
-#define WPAN_NUM_PAGES		32
-
 struct wpan_phy;
 
 struct cfg802154_ops {
@@ -81,7 +74,7 @@ struct wpan_phy {
 	 */
 	u8 current_channel;
 	u8 current_page;
-	u32 channels_supported[32];
+	u32 channels_supported[IEEE802154_MAX_PAGE + 1];
 	s8 transmit_power;
 	u8 cca_mode;
 
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 6b9bc93944a6..b82b01669a67 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -245,7 +245,7 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
 	if (!nl_page)
 		return -ENOBUFS;
 
-	for (page = 0; page < WPAN_NUM_PAGES; page++) {
+	for (page = 0; page <= IEEE802154_MAX_PAGE; page++) {
 		if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL,
 				rdev->wpan_phy.channels_supported[page]))
 			return -ENOBUFS;
@@ -616,7 +616,7 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info)
 	channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]);
 
 	/* check 802.15.4 constraints */
-	if (page >= WPAN_NUM_PAGES || channel >= WPAN_NUM_CHANNELS)
+	if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL)
 		return -EINVAL;
 
 	return rdev_set_channel(rdev, page, channel);
-- 
cgit v1.2.3-58-ga151


From 3274f52073d88b62f3c5ace82ae9d48546232e72 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 13 Nov 2014 17:36:44 -0800
Subject: bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command

the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
either update existing map element or create a new one.
Initially the plan was to add a new command to handle the case of
'create new element if it didn't exist', but 'flags' style looks
cleaner and overall diff is much smaller (more code reused), so add 'flags'
attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
 #define BPF_ANY	0 /* create new element or update existing */
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */

bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST
if element already exists.

bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT
if element doesn't exist.

Userspace will call it as:
int bpf_update_elem(int fd, void *key, void *value, __u64 flags)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key = ptr_to_u64(key),
        .value = ptr_to_u64(value),
        .flags = flags;
    };

    return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

First two bits of 'flags' are used to encode style of bpf_update_elem() command.
Bits 2-63 are reserved for future use.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 2 +-
 include/uapi/linux/bpf.h | 8 +++++++-
 kernel/bpf/syscall.c     | 4 ++--
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3cf91754a957..51e9242e4803 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,7 +22,7 @@ struct bpf_map_ops {
 
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
-	int (*map_update_elem)(struct bpf_map *map, void *key, void *value);
+	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d18316f9e9c4..3e9e1b77f29d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -82,7 +82,7 @@ enum bpf_cmd {
 
 	/* create or update key/value pair in a given map
 	 * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
-	 * Using attr->map_fd, attr->key, attr->value
+	 * Using attr->map_fd, attr->key, attr->value, attr->flags
 	 * returns zero or negative error
 	 */
 	BPF_MAP_UPDATE_ELEM,
@@ -117,6 +117,11 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 };
 
+/* flags for BPF_MAP_UPDATE_ELEM command */
+#define BPF_ANY		0 /* create new element or update existing */
+#define BPF_NOEXIST	1 /* create new element if it didn't exist */
+#define BPF_EXIST	2 /* update existing element */
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -132,6 +137,7 @@ union bpf_attr {
 			__aligned_u64 value;
 			__aligned_u64 next_key;
 		};
+		__u64		flags;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..c0d03bf317a2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -190,7 +190,7 @@ err_put:
 	return err;
 }
 
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
 {
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
 	 * therefore all map accessors rely on this fact, so do the same here
 	 */
 	rcu_read_lock();
-	err = map->ops->map_update_elem(map, key, value);
+	err = map->ops->map_update_elem(map, key, value, attr->flags);
 	rcu_read_unlock();
 
 free_value:
-- 
cgit v1.2.3-58-ga151


From d0003ec01c667b731c139e23de3306a8b328ccf5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 13 Nov 2014 17:36:49 -0800
Subject: bpf: allow eBPF programs to use maps

expose bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
map accessors to eBPF programs

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  5 +++
 include/uapi/linux/bpf.h |  3 ++
 kernel/bpf/Makefile      |  2 +-
 kernel/bpf/helpers.c     | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/helpers.c

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 51e9242e4803..75e94eaa228b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -133,4 +133,9 @@ struct bpf_prog *bpf_prog_get(u32 ufd);
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
+/* verifier prototypes for helper functions called from eBPF programs */
+extern struct bpf_func_proto bpf_map_lookup_elem_proto;
+extern struct bpf_func_proto bpf_map_update_elem_proto;
+extern struct bpf_func_proto bpf_map_delete_elem_proto;
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0d662fe75df5..4a3d0f84f178 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -158,6 +158,9 @@ union bpf_attr {
  */
 enum bpf_func_id {
 	BPF_FUNC_unspec,
+	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
+	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
+	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 72ec98ba2d42..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* verifier checked that R1 contains a valid pointer to bpf_map
+	 * and R2 points to a program stack and map->key_size bytes were
+	 * initialized
+	 */
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	value = map->ops->map_lookup_elem(map, key);
+
+	/* lookup() returns either pointer to element value or NULL
+	 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+	 */
+	return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+	.func = bpf_map_lookup_elem,
+	.gpl_only = false,
+	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value = (void *) (unsigned long) r3;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+	.func = bpf_map_update_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+	.arg3_type = ARG_PTR_TO_MAP_VALUE,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+	.func = bpf_map_delete_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
-- 
cgit v1.2.3-58-ga151


From 1277b4a9f531e84e26f9e0210c1801b0c0bf81ca Mon Sep 17 00:00:00 2001
From: Liad Kaufman <liad.kaufman@intel.com>
Date: Sun, 9 Nov 2014 18:50:08 +0200
Subject: mac80211: retransmit TDLS teardown packet through AP if not ACKed

Since the TDLS peer station might not receive the teardown
packet (e.g., when in PS), this makes sure the packet is
retransmitted - this time through the AP - if the TDLS peer
didn't ACK the packet.

Signed-off-by: Liad Kaufman <liad.kaufman@intel.com>
Signed-off-by: Arik Nemtsov <arik@wizery.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  | 25 ++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  4 ++++
 net/mac80211/mlme.c        | 12 ++++++++++
 net/mac80211/status.c      | 55 ++++++++++++++++++++++++++++++++++++++++--
 net/mac80211/tdls.c        | 59 +++++++++++++++++++++++++++++++++++++---------
 5 files changed, 142 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f65b5446d983..4e2bb9107878 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -19,6 +19,7 @@
 #include <linux/types.h>
 #include <linux/if_ether.h>
 #include <asm/byteorder.h>
+#include <asm/unaligned.h>
 
 /*
  * DS bit usage
@@ -2418,6 +2419,30 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
 	return !!(tim->virtual_map[index] & mask);
 }
 
+/**
+ * ieee80211_get_tdls_action - get tdls packet action (or -1, if not tdls packet)
+ * @skb: the skb containing the frame, length will not be checked
+ * @hdr_size: the size of the ieee80211_hdr that starts at skb->data
+ *
+ * This function assumes the frame is a data frame, and that the network header
+ * is in the correct place.
+ */
+static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size)
+{
+	if (!skb_is_nonlinear(skb) &&
+	    skb->len > (skb_network_offset(skb) + 2)) {
+		/* Point to where the indication of TDLS should start */
+		const u8 *tdls_data = skb_network_header(skb) - 2;
+
+		if (get_unaligned_be16(tdls_data) == ETH_P_TDLS &&
+		    tdls_data[2] == WLAN_TDLS_SNAP_RFTYPE &&
+		    tdls_data[3] == WLAN_CATEGORY_TDLS)
+			return tdls_data[4];
+	}
+
+	return -1;
+}
+
 /* convert time units */
 #define TU_TO_JIFFIES(x)	(usecs_to_jiffies((x) * 1024))
 #define TU_TO_EXP_TIME(x)	(jiffies + TU_TO_JIFFIES(x))
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 208953d1d028..bc6f12ff1f61 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -525,8 +525,12 @@ struct ieee80211_if_managed {
 	struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */
 	struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */
 
+	/* TDLS support */
 	u8 tdls_peer[ETH_ALEN] __aligned(2);
 	struct delayed_work tdls_peer_del_work;
+	struct sk_buff *orig_teardown_skb; /* The original teardown skb */
+	struct sk_buff *teardown_skb; /* A copy to send through the AP */
+	spinlock_t teardown_lock; /* To lock changing teardown_skb */
 
 	/* WMM-AC TSPEC support */
 	struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS];
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 243539878991..11a937f3fdeb 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4003,6 +4003,11 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
 		ifmgd->req_smps = IEEE80211_SMPS_AUTOMATIC;
 	else
 		ifmgd->req_smps = IEEE80211_SMPS_OFF;
+
+	/* Setup TDLS data */
+	spin_lock_init(&ifmgd->teardown_lock);
+	ifmgd->teardown_skb = NULL;
+	ifmgd->orig_teardown_skb = NULL;
 }
 
 /* scan finished notification */
@@ -4865,6 +4870,13 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
 	}
 	if (ifmgd->auth_data)
 		ieee80211_destroy_auth_data(sdata, false);
+	spin_lock_bh(&ifmgd->teardown_lock);
+	if (ifmgd->teardown_skb) {
+		kfree_skb(ifmgd->teardown_skb);
+		ifmgd->teardown_skb = NULL;
+		ifmgd->orig_teardown_skb = NULL;
+	}
+	spin_unlock_bh(&ifmgd->teardown_lock);
 	del_timer_sync(&ifmgd->timer);
 	sdata_unlock(sdata);
 }
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 9612d89fad56..71de2d3866cc 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -390,6 +390,46 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local,
 	}
 }
 
+/*
+ * Handles the tx for TDLS teardown frames.
+ * If the frame wasn't ACKed by the peer - it will be re-sent through the AP
+ */
+static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local,
+					struct ieee80211_sub_if_data *sdata,
+					struct sk_buff *skb, u32 flags)
+{
+	struct sk_buff *teardown_skb;
+	struct sk_buff *orig_teardown_skb;
+	bool is_teardown = false;
+
+	/* Get the teardown data we need and free the lock */
+	spin_lock(&sdata->u.mgd.teardown_lock);
+	teardown_skb = sdata->u.mgd.teardown_skb;
+	orig_teardown_skb = sdata->u.mgd.orig_teardown_skb;
+	if ((skb == orig_teardown_skb) && teardown_skb) {
+		sdata->u.mgd.teardown_skb = NULL;
+		sdata->u.mgd.orig_teardown_skb = NULL;
+		is_teardown = true;
+	}
+	spin_unlock(&sdata->u.mgd.teardown_lock);
+
+	if (is_teardown) {
+		/* This mechanism relies on being able to get ACKs */
+		WARN_ON(!(local->hw.flags &
+			  IEEE80211_HW_REPORTS_TX_ACK_STATUS));
+
+		/* Check if peer has ACKed */
+		if (flags & IEEE80211_TX_STAT_ACK) {
+			dev_kfree_skb_any(teardown_skb);
+		} else {
+			tdls_dbg(sdata,
+				 "TDLS Resending teardown through AP\n");
+
+			ieee80211_subif_start_xmit(teardown_skb, skb->dev);
+		}
+	}
+}
+
 static void ieee80211_report_used_skb(struct ieee80211_local *local,
 				      struct sk_buff *skb, bool dropped)
 {
@@ -426,8 +466,19 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 		if (!sdata) {
 			skb->dev = NULL;
 		} else if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) {
-			ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control,
-						     acked);
+			unsigned int hdr_size =
+				ieee80211_hdrlen(hdr->frame_control);
+
+			/* Check to see if packet is a TDLS teardown packet */
+			if (ieee80211_is_data(hdr->frame_control) &&
+			    (ieee80211_get_tdls_action(skb, hdr_size) ==
+			     WLAN_TDLS_TEARDOWN))
+				ieee80211_tdls_td_tx_handle(local, sdata, skb,
+							    info->flags);
+			else
+				ieee80211_mgd_conn_tx_status(sdata,
+							     hdr->frame_control,
+							     acked);
 		} else if (ieee80211_is_nullfunc(hdr->frame_control) ||
 			   ieee80211_is_qos_nullfunc(hdr->frame_control)) {
 			cfg80211_probe_status(sdata->dev, hdr->addr1,
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index b4f368e2cb3b..d4fe091fd98a 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -512,20 +512,22 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev,
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb = NULL;
+	u32 flags = 0;
 	bool send_direct;
 	struct sta_info *sta;
 	int ret;
 
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
-			    max(sizeof(struct ieee80211_mgmt),
-				sizeof(struct ieee80211_tdls_data)) +
-			    50 + /* supported rates */
-			    7 + /* ext capab */
-			    26 + /* max(WMM-info, WMM-param) */
-			    2 + max(sizeof(struct ieee80211_ht_cap),
-				    sizeof(struct ieee80211_ht_operation)) +
-			    extra_ies_len +
-			    sizeof(struct ieee80211_tdls_lnkie));
+	skb = netdev_alloc_skb(dev,
+			       local->hw.extra_tx_headroom +
+			       max(sizeof(struct ieee80211_mgmt),
+				   sizeof(struct ieee80211_tdls_data)) +
+			       50 + /* supported rates */
+			       7 + /* ext capab */
+			       26 + /* max(WMM-info, WMM-param) */
+			       2 + max(sizeof(struct ieee80211_ht_cap),
+				       sizeof(struct ieee80211_ht_operation)) +
+			       extra_ies_len +
+			       sizeof(struct ieee80211_tdls_lnkie));
 	if (!skb)
 		return -ENOMEM;
 
@@ -623,9 +625,44 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev,
 		break;
 	}
 
+	/*
+	 * Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress.
+	 * Later, if no ACK is returned from peer, we will re-send the teardown
+	 * packet through the AP.
+	 */
+	if ((action_code == WLAN_TDLS_TEARDOWN) &&
+	    (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) {
+		struct sta_info *sta = NULL;
+		bool try_resend; /* Should we keep skb for possible resend */
+
+		/* If not sending directly to peer - no point in keeping skb */
+		rcu_read_lock();
+		sta = sta_info_get(sdata, peer);
+		try_resend = sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
+		rcu_read_unlock();
+
+		spin_lock_bh(&sdata->u.mgd.teardown_lock);
+		if (try_resend && !sdata->u.mgd.teardown_skb) {
+			/* Mark it as requiring TX status callback  */
+			flags |= IEEE80211_TX_CTL_REQ_TX_STATUS |
+				 IEEE80211_TX_INTFL_MLME_CONN_TX;
+
+			/*
+			 * skb is copied since mac80211 will later set
+			 * properties that might not be the same as the AP,
+			 * such as encryption, QoS, addresses, etc.
+			 *
+			 * No problem if skb_copy() fails, so no need to check.
+			 */
+			sdata->u.mgd.teardown_skb = skb_copy(skb, GFP_ATOMIC);
+			sdata->u.mgd.orig_teardown_skb = skb;
+		}
+		spin_unlock_bh(&sdata->u.mgd.teardown_lock);
+	}
+
 	/* disable bottom halves when entering the Tx path */
 	local_bh_disable();
-	ret = ieee80211_subif_start_xmit(skb, dev);
+	__ieee80211_subif_start_xmit(skb, dev, flags);
 	local_bh_enable();
 
 	return ret;
-- 
cgit v1.2.3-58-ga151


From 2cedd87960a809dd9bf683f72123b7dce6736f07 Mon Sep 17 00:00:00 2001
From: Arik Nemtsov <arik@wizery.com>
Date: Sun, 9 Nov 2014 18:50:13 +0200
Subject: mac80211: add BSS coex IE to TDLS setup frames

Add the BSS coex IE in case we support HT40 channels, as mandated by
section 8.5.13 in IEEE802.11 2012.

Signed-off-by: Arik Nemtsov <arikx.nemtsov@intel.com>
Signed-off-by: Arik Nemtsov <arik@wizery.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  3 +++
 net/mac80211/tdls.c       | 15 +++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 4e2bb9107878..adac1be67387 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2037,6 +2037,9 @@ enum ieee80211_tdls_actioncode {
 /* TDLS specific payload type in the LLC/SNAP header */
 #define WLAN_TDLS_SNAP_RFTYPE	0x2
 
+/* BSS Coex IE information field bits */
+#define WLAN_BSS_COEX_INFORMATION_REQUEST	BIT(0)
+
 /**
  * enum - mesh synchronization method identifier
  *
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 8fb314b182e2..30a4c1004010 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -117,6 +117,16 @@ ieee80211_tdls_add_supp_channels(struct ieee80211_sub_if_data *sdata,
 	*pos = 2 * subband_cnt;
 }
 
+static void ieee80211_tdls_add_bss_coex_ie(struct sk_buff *skb)
+{
+	u8 *pos = (void *)skb_put(skb, 3);
+
+	*pos++ = WLAN_EID_BSS_COEX_2040;
+	*pos++ = 1; /* len */
+
+	*pos++ = WLAN_BSS_COEX_INFORMATION_REQUEST;
+}
+
 static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata,
 					u16 status_code)
 {
@@ -341,6 +351,10 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
 
 	rcu_read_unlock();
 
+	if (ht_cap.ht_supported &&
+	    (ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40))
+		ieee80211_tdls_add_bss_coex_ie(skb);
+
 	/* add any remaining IEs */
 	if (extra_ies_len) {
 		noffset = extra_ies_len;
@@ -597,6 +611,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev,
 			       2 + max(sizeof(struct ieee80211_ht_cap),
 				       sizeof(struct ieee80211_ht_operation)) +
 			       50 + /* supported channels */
+			       3 + /* 40/20 BSS coex */
 			       extra_ies_len +
 			       sizeof(struct ieee80211_tdls_lnkie));
 	if (!skb)
-- 
cgit v1.2.3-58-ga151


From 78632a17eaa7a5abdc22aac8ca5932d6cad59984 Mon Sep 17 00:00:00 2001
From: Arik Nemtsov <arik@wizery.com>
Date: Sun, 9 Nov 2014 18:50:14 +0200
Subject: cfg/mac80211: define TDLS channel switch feature bit

Define some related TDLS protocol constants and advertise channel switch
support in the extended-capabilities IE when the feature bit is defined.

Actually supporting TDLS channel-switching also requires support for
some new nl80211 commands, to be introduced by future patches.

Signed-off-by: Arik Nemtsov <arikx.nemtsov@intel.com>
Signed-off-by: Arik Nemtsov <arik@wizery.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    | 6 ++++++
 include/uapi/linux/nl80211.h | 3 +++
 net/mac80211/tdls.c          | 9 ++++++---
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index adac1be67387..fbb02d240658 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2019,6 +2019,11 @@ enum ieee80211_tdls_actioncode {
  */
 #define WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING	BIT(2)
 
+/* TDLS capabilities in the the 4th byte of @WLAN_EID_EXT_CAPABILITY */
+#define WLAN_EXT_CAPA4_TDLS_BUFFER_STA		BIT(4)
+#define WLAN_EXT_CAPA4_TDLS_PEER_PSM		BIT(5)
+#define WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH		BIT(6)
+
 /* Interworking capabilities are set in 7th bit of 4th byte of the
  * @WLAN_EID_EXT_CAPABILITY information element
  */
@@ -2030,6 +2035,7 @@ enum ieee80211_tdls_actioncode {
  */
 #define WLAN_EXT_CAPA5_TDLS_ENABLED	BIT(5)
 #define WLAN_EXT_CAPA5_TDLS_PROHIBITED	BIT(6)
+#define WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED	BIT(7)
 
 #define WLAN_EXT_CAPA8_OPMODE_NOTIF	BIT(6)
 #define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED	BIT(7)
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 442369f69b4f..ccdeef28d672 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4095,6 +4095,8 @@ enum nl80211_ap_sme_features {
  * @NL80211_FEATURE_MAC_ON_CREATE: Device supports configuring
  *	the vif's MAC address upon creation.
  *	See 'macaddr' field in the vif_params (cfg80211.h).
+ * @NL80211_FEATURE_TDLS_CHANNEL_SWITCH: Driver supports channel switching when
+ *	operating as a TDLS peer.
  */
 enum nl80211_feature_flags {
 	NL80211_FEATURE_SK_TX_STATUS			= 1 << 0,
@@ -4125,6 +4127,7 @@ enum nl80211_feature_flags {
 	NL80211_FEATURE_DYNAMIC_SMPS			= 1 << 25,
 	NL80211_FEATURE_SUPPORTS_WMM_ADMISSION		= 1 << 26,
 	NL80211_FEATURE_MAC_ON_CREATE			= 1 << 27,
+	NL80211_FEATURE_TDLS_CHANNEL_SWITCH		= 1 << 28,
 };
 
 /**
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 30a4c1004010..4554bdc72c91 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -35,16 +35,19 @@ void ieee80211_tdls_peer_del_work(struct work_struct *wk)
 	mutex_unlock(&local->mtx);
 }
 
-static void ieee80211_tdls_add_ext_capab(struct sk_buff *skb)
+static void ieee80211_tdls_add_ext_capab(struct ieee80211_local *local,
+					 struct sk_buff *skb)
 {
 	u8 *pos = (void *)skb_put(skb, 7);
+	bool chan_switch = local->hw.wiphy->features &
+			   NL80211_FEATURE_TDLS_CHANNEL_SWITCH;
 
 	*pos++ = WLAN_EID_EXT_CAPABILITY;
 	*pos++ = 5; /* len */
 	*pos++ = 0x0;
 	*pos++ = 0x0;
 	*pos++ = 0x0;
-	*pos++ = 0x0;
+	*pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0;
 	*pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED;
 }
 
@@ -289,7 +292,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
 		offset = noffset;
 	}
 
-	ieee80211_tdls_add_ext_capab(skb);
+	ieee80211_tdls_add_ext_capab(local, skb);
 
 	/* add the QoS element if we support it */
 	if (local->hw.queues >= IEEE80211_NUM_ACS &&
-- 
cgit v1.2.3-58-ga151


From 53837584438f8899e061ada4663ae1d09b49b96a Mon Sep 17 00:00:00 2001
From: Arik Nemtsov <arik@wizery.com>
Date: Sun, 9 Nov 2014 18:50:18 +0200
Subject: mac80211: add parsing of TDLS specific IEs

These are used in TDLS channel switching code.

Signed-off-by: Arik Nemtsov <arikx.nemtsov@intel.com>
Signed-off-by: Arik Nemtsov <arik@wizery.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  | 15 +++++++++++++++
 net/mac80211/ieee80211_i.h |  2 ++
 net/mac80211/util.c        | 16 ++++++++++++++++
 3 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index fbb02d240658..4f4eea8a6288 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1067,6 +1067,12 @@ struct ieee80211_pspoll {
 
 /* TDLS */
 
+/* Channel switch timing */
+struct ieee80211_ch_switch_timing {
+	__le16 switch_time;
+	__le16 switch_timeout;
+} __packed;
+
 /* Link-id information element */
 struct ieee80211_tdls_lnkie {
 	u8 ie_type; /* Link Identifier IE */
@@ -1108,6 +1114,15 @@ struct ieee80211_tdls_data {
 			u8 dialog_token;
 			u8 variable[0];
 		} __packed discover_req;
+		struct {
+			u8 target_channel;
+			u8 oper_class;
+			u8 variable[0];
+		} __packed chan_switch_req;
+		struct {
+			__le16 status_code;
+			u8 variable[0];
+		} __packed chan_switch_resp;
 	} u;
 } __packed;
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 4b3a7e7ec2a0..e786ab6bc72c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1400,6 +1400,8 @@ struct ieee802_11_elems {
 	size_t total_len;
 
 	/* pointers to IEs */
+	const struct ieee80211_tdls_lnkie *lnk_id;
+	const struct ieee80211_ch_switch_timing *ch_sw_timing;
 	const u8 *ext_capab;
 	const u8 *ssid;
 	const u8 *supp_rates;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 3ca0c2e725ff..9e5bfd614856 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -832,6 +832,8 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
 		case WLAN_EID_WIDE_BW_CHANNEL_SWITCH:
 		case WLAN_EID_CHAN_SWITCH_PARAM:
 		case WLAN_EID_EXT_CAPABILITY:
+		case WLAN_EID_CHAN_SWITCH_TIMING:
+		case WLAN_EID_LINK_ID:
 		/*
 		 * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible
 		 * that if the content gets bigger it might be needed more than once
@@ -851,6 +853,20 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
 		elem_parse_failed = false;
 
 		switch (id) {
+		case WLAN_EID_LINK_ID:
+			if (elen + 2 != sizeof(struct ieee80211_tdls_lnkie)) {
+				elem_parse_failed = true;
+				break;
+			}
+			elems->lnk_id = (void *)(pos - 2);
+			break;
+		case WLAN_EID_CHAN_SWITCH_TIMING:
+			if (elen != sizeof(struct ieee80211_ch_switch_timing)) {
+				elem_parse_failed = true;
+				break;
+			}
+			elems->ch_sw_timing = (void *)pos;
+			break;
 		case WLAN_EID_EXT_CAPABILITY:
 			elems->ext_capab = pos;
 			elems->ext_capab_len = elen;
-- 
cgit v1.2.3-58-ga151


From 666547ff591cebdedc4679bf6b1b3f3383a8dea3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 Apr 2014 14:03:05 -0400
Subject: separate kernel- and userland-side msghdr

Kernel-side struct msghdr is (currently) using the same layout as
userland one, but it's not a one-to-one copy - even without considering
32bit compat issues, we have msg_iov, msg_name and msg_control copied
to kernel[1].  It's fairly localized, so we get away with a few functions
where that knowledge is needed (and we could shrink that set even
more).  Pretty much everything deals with the kernel-side variant and
the few places that want userland one just use a bunch of force-casts
to paper over the differences.

The thing is, kernel-side definition of struct msghdr is *not* exposed
in include/uapi - libc doesn't see it, etc.  So we can add struct user_msghdr,
with proper annotations and let the few places that ever deal with those
beasts use it for userland pointers.  Saner typechecking aside, that will
allow to change the layout of kernel-side msghdr - e.g. replace
msg_iov/msg_iovlen there with struct iov_iter, getting rid of the need
to modify the iovec as we copy data to/from it, etc.

We could introduce kernel_msghdr instead, but that would create much more
noise - the absolute majority of the instances would need to have the
type switched to kernel_msghdr and definition of struct msghdr in
include/linux/socket.h is not going to be seen by userland anyway.

This commit just introduces user_msghdr and switches the few places that
are dealing with userland-side msghdr to it.

[1] actually, it's even trickier than that - we copy msg_control for
sendmsg, but keep the userland address on recvmsg.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/arm/kernel/sys_oabi-compat.c |  4 ++--
 include/linux/socket.h            | 16 +++++++++++++---
 include/linux/syscalls.h          |  6 +++---
 net/compat.c                      |  4 ++--
 net/socket.c                      | 29 ++++++++++++++++-------------
 5 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index e90a3148f385..b83f3b7737fb 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -400,7 +400,7 @@ asmlinkage long sys_oabi_sendto(int fd, void __user *buff,
 	return sys_sendto(fd, buff, len, flags, addr, addrlen);
 }
 
-asmlinkage long sys_oabi_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
+asmlinkage long sys_oabi_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
 {
 	struct sockaddr __user *addr;
 	int msg_namelen;
@@ -446,7 +446,7 @@ asmlinkage long sys_oabi_socketcall(int call, unsigned long __user *args)
 		break;
 	case SYS_SENDMSG:
 		if (copy_from_user(a, args, 3 * sizeof(long)) == 0)
-			r = sys_oabi_sendmsg(a[0], (struct msghdr __user *)a[1], a[2]);
+			r = sys_oabi_sendmsg(a[0], (struct user_msghdr __user *)a[1], a[2]);
 		break;
 	default:
 		r = sys_socketcall(call, args);
diff --git a/include/linux/socket.h b/include/linux/socket.h
index bb9b83640070..51bd6668b80e 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -53,10 +53,20 @@ struct msghdr {
 	__kernel_size_t	msg_controllen;	/* ancillary data buffer length */
 	unsigned int	msg_flags;	/* flags on received message */
 };
+ 
+struct user_msghdr {
+	void		__user *msg_name;	/* ptr to socket address structure */
+	int		msg_namelen;		/* size of socket address structure */
+	struct iovec	__user *msg_iov;	/* scatter/gather array */
+	__kernel_size_t	msg_iovlen;		/* # elements in msg_iov */
+	void		__user *msg_control;	/* ancillary data */
+	__kernel_size_t	msg_controllen;		/* ancillary data buffer length */
+	unsigned int	msg_flags;		/* flags on received message */
+};
 
 /* For recvmmsg/sendmmsg */
 struct mmsghdr {
-	struct msghdr   msg_hdr;
+	struct user_msghdr  msg_hdr;
 	unsigned int        msg_len;
 };
 
@@ -319,8 +329,8 @@ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 struct timespec;
 
 /* The __sys_...msg variants allow MSG_CMSG_COMPAT */
-extern long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
-extern long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
+extern long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags);
+extern long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags);
 extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 			  unsigned int flags, struct timespec *timeout);
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bda9b81357cc..c9afdc7a7f84 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -25,7 +25,7 @@ struct linux_dirent64;
 struct list_head;
 struct mmap_arg_struct;
 struct msgbuf;
-struct msghdr;
+struct user_msghdr;
 struct mmsghdr;
 struct msqid_ds;
 struct new_utsname;
@@ -601,13 +601,13 @@ asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *);
 asmlinkage long sys_send(int, void __user *, size_t, unsigned);
 asmlinkage long sys_sendto(int, void __user *, size_t, unsigned,
 				struct sockaddr __user *, int);
-asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
+asmlinkage long sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags);
 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
 			     unsigned int vlen, unsigned flags);
 asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
 				struct sockaddr __user *, int __user *);
-asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
+asmlinkage long sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags);
 asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
 			     unsigned int vlen, unsigned flags,
 			     struct timespec __user *timeout);
diff --git a/net/compat.c b/net/compat.c
index bc8aeefddf3f..562e920b07f0 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -740,7 +740,7 @@ COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg, uns
 {
 	if (flags & MSG_CMSG_COMPAT)
 		return -EINVAL;
-	return __sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
+	return __sys_sendmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
 }
 
 COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
@@ -756,7 +756,7 @@ COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, uns
 {
 	if (flags & MSG_CMSG_COMPAT)
 		return -EINVAL;
-	return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
+	return __sys_recvmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
 }
 
 COMPAT_SYSCALL_DEFINE4(recv, int, fd, void __user *, buf, compat_size_t, len, unsigned int, flags)
diff --git a/net/socket.c b/net/socket.c
index fe20c319a0bb..0ae8147e3acc 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1989,8 +1989,11 @@ struct used_address {
 };
 
 static int copy_msghdr_from_user(struct msghdr *kmsg,
-				 struct msghdr __user *umsg)
+				 struct user_msghdr __user *umsg)
 {
+	/* We are relying on the (currently) identical layouts.  Once
+	 * the kernel-side changes, this place will need to be updated
+	 */
 	if (copy_from_user(kmsg, umsg, sizeof(struct msghdr)))
 		return -EFAULT;
 
@@ -2005,7 +2008,7 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 	return 0;
 }
 
-static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
+static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 			 struct msghdr *msg_sys, unsigned int flags,
 			 struct used_address *used_address)
 {
@@ -2123,7 +2126,7 @@ out:
  *	BSD sendmsg interface
  */
 
-long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
+long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
 {
 	int fput_needed, err;
 	struct msghdr msg_sys;
@@ -2140,7 +2143,7 @@ out:
 	return err;
 }
 
-SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags)
+SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags)
 {
 	if (flags & MSG_CMSG_COMPAT)
 		return -EINVAL;
@@ -2177,7 +2180,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 
 	while (datagrams < vlen) {
 		if (MSG_CMSG_COMPAT & flags) {
-			err = ___sys_sendmsg(sock, (struct msghdr __user *)compat_entry,
+			err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
 					     &msg_sys, flags, &used_address);
 			if (err < 0)
 				break;
@@ -2185,7 +2188,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 			++compat_entry;
 		} else {
 			err = ___sys_sendmsg(sock,
-					     (struct msghdr __user *)entry,
+					     (struct user_msghdr __user *)entry,
 					     &msg_sys, flags, &used_address);
 			if (err < 0)
 				break;
@@ -2215,7 +2218,7 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
 	return __sys_sendmmsg(fd, mmsg, vlen, flags);
 }
 
-static int ___sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
+static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 			 struct msghdr *msg_sys, unsigned int flags, int nosec)
 {
 	struct compat_msghdr __user *msg_compat =
@@ -2311,7 +2314,7 @@ out:
  *	BSD recvmsg interface
  */
 
-long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags)
+long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
 {
 	int fput_needed, err;
 	struct msghdr msg_sys;
@@ -2328,7 +2331,7 @@ out:
 	return err;
 }
 
-SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
+SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
 		unsigned int, flags)
 {
 	if (flags & MSG_CMSG_COMPAT)
@@ -2373,7 +2376,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 		 * No need to ask LSM for more than the first datagram.
 		 */
 		if (MSG_CMSG_COMPAT & flags) {
-			err = ___sys_recvmsg(sock, (struct msghdr __user *)compat_entry,
+			err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry,
 					     &msg_sys, flags & ~MSG_WAITFORONE,
 					     datagrams);
 			if (err < 0)
@@ -2382,7 +2385,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 			++compat_entry;
 		} else {
 			err = ___sys_recvmsg(sock,
-					     (struct msghdr __user *)entry,
+					     (struct user_msghdr __user *)entry,
 					     &msg_sys, flags & ~MSG_WAITFORONE,
 					     datagrams);
 			if (err < 0)
@@ -2571,13 +2574,13 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 				   (int __user *)a[4]);
 		break;
 	case SYS_SENDMSG:
-		err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
+		err = sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2]);
 		break;
 	case SYS_SENDMMSG:
 		err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
 		break;
 	case SYS_RECVMSG:
-		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
+		err = sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2]);
 		break;
 	case SYS_RECVMMSG:
 		err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
-- 
cgit v1.2.3-58-ga151


From 08adb7dabd4874cc5666b4490653b26534702ce0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 10 Nov 2014 20:23:13 -0500
Subject: fold verify_iovec() into copy_msghdr_from_user()

... and do the same on the compat side of things.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/socket.h |  1 -
 include/net/compat.h   |  5 ++-
 net/compat.c           | 52 ++++++++++++----------------
 net/core/iovec.c       | 38 ---------------------
 net/socket.c           | 93 ++++++++++++++++++++++++++++----------------------
 5 files changed, 77 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 51bd6668b80e..de5222832be4 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -322,7 +322,6 @@ extern int csum_partial_copy_fromiovecend(unsigned char *kdata,
 extern unsigned long iov_pages(const struct iovec *iov, int offset,
 			       unsigned long nr_segs);
 
-extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode);
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
diff --git a/include/net/compat.h b/include/net/compat.h
index 3b603b199c01..42a9c8431177 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -40,9 +40,8 @@ int compat_sock_get_timestampns(struct sock *, struct timespec __user *);
 #define compat_mmsghdr	mmsghdr
 #endif /* defined(CONFIG_COMPAT) */
 
-int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *);
-int verify_compat_iovec(struct msghdr *, struct iovec *,
-			struct sockaddr_storage *, int);
+ssize_t get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
+		      struct sockaddr __user **, struct iovec **);
 asmlinkage long compat_sys_sendmsg(int, struct compat_msghdr __user *,
 				   unsigned int);
 asmlinkage long compat_sys_sendmmsg(int, struct compat_mmsghdr __user *,
diff --git a/net/compat.c b/net/compat.c
index 7b4b6ad13235..062f157d2a6b 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -31,14 +31,18 @@
 #include <asm/uaccess.h>
 #include <net/compat.h>
 
-int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg)
+ssize_t get_compat_msghdr(struct msghdr *kmsg,
+			  struct compat_msghdr __user *umsg,
+			  struct sockaddr __user **save_addr,
+			  struct iovec **iov)
 {
-	compat_uptr_t tmp1, tmp2, tmp3;
+	compat_uptr_t uaddr, uiov, tmp3;
+	ssize_t err;
 
 	if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
-	    __get_user(tmp1, &umsg->msg_name) ||
+	    __get_user(uaddr, &umsg->msg_name) ||
 	    __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
-	    __get_user(tmp2, &umsg->msg_iov) ||
+	    __get_user(uiov, &umsg->msg_iov) ||
 	    __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
 	    __get_user(tmp3, &umsg->msg_control) ||
 	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
@@ -46,44 +50,32 @@ int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg)
 		return -EFAULT;
 	if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
 		kmsg->msg_namelen = sizeof(struct sockaddr_storage);
-	kmsg->msg_name = compat_ptr(tmp1);
-	kmsg->msg_iov = compat_ptr(tmp2);
 	kmsg->msg_control = compat_ptr(tmp3);
-	return 0;
-}
 
-/* I've named the args so it is easy to tell whose space the pointers are in. */
-int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *iov,
-		   struct sockaddr_storage *kern_address, int mode)
-{
-	struct compat_iovec __user *p;
-	struct iovec *res;
-	int err;
+	if (save_addr)
+		*save_addr = compat_ptr(uaddr);
 
-	if (kern_msg->msg_name && kern_msg->msg_namelen) {
-		if (mode == WRITE) {
-			int err = move_addr_to_kernel(kern_msg->msg_name,
-						      kern_msg->msg_namelen,
-						      kern_address);
+	if (uaddr && kmsg->msg_namelen) {
+		if (!save_addr) {
+			err = move_addr_to_kernel(compat_ptr(uaddr),
+						  kmsg->msg_namelen,
+						  kmsg->msg_name);
 			if (err < 0)
 				return err;
 		}
-		kern_msg->msg_name = kern_address;
 	} else {
-		kern_msg->msg_name = NULL;
-		kern_msg->msg_namelen = 0;
+		kmsg->msg_name = NULL;
+		kmsg->msg_namelen = 0;
 	}
 
-	if (kern_msg->msg_iovlen > UIO_MAXIOV)
+	if (kmsg->msg_iovlen > UIO_MAXIOV)
 		return -EMSGSIZE;
 
-	p = (struct compat_iovec __user *)kern_msg->msg_iov;
-	err = compat_rw_copy_check_uvector(mode, p, kern_msg->msg_iovlen,
-					   UIO_FASTIOV, iov, &res);
+	err = compat_rw_copy_check_uvector(save_addr ? READ : WRITE,
+					   compat_ptr(uiov), kmsg->msg_iovlen,
+					   UIO_FASTIOV, *iov, iov);
 	if (err >= 0)
-		kern_msg->msg_iov = res;
-	else if (res != iov)
-		kfree(res);
+		kmsg->msg_iov = *iov;
 	return err;
 }
 
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 86beeea61d72..dcbe98b3726a 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -27,44 +27,6 @@
 #include <net/checksum.h>
 #include <net/sock.h>
 
-/*
- *	Verify iovec. The caller must ensure that the iovec is big enough
- *	to hold the message iovec.
- *
- *	Save time not doing access_ok. copy_*_user will make this work
- *	in any case.
- */
-
-int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)
-{
-	struct iovec *res;
-	int err;
-
-	if (m->msg_name && m->msg_namelen) {
-		if (mode == WRITE) {
-			void __user *namep = (void __user __force *)m->msg_name;
-			int err = move_addr_to_kernel(namep, m->msg_namelen,
-						  address);
-			if (err < 0)
-				return err;
-		}
-		m->msg_name = address;
-	} else {
-		m->msg_name = NULL;
-		m->msg_namelen = 0;
-	}
-	if (m->msg_iovlen > UIO_MAXIOV)
-		return -EMSGSIZE;
-
-	err = rw_copy_check_uvector(mode, (void __user __force *)m->msg_iov,
-				    m->msg_iovlen, UIO_FASTIOV, iov, &res);
-	if (err >= 0)
-		m->msg_iov = res;
-	else if (res != iov)
-		kfree(res);
-	return err;
-}
-
 /*
  *	And now for the all-in-one: copy and checksum from a user iovec
  *	directly to a datagram
diff --git a/net/socket.c b/net/socket.c
index 59020f0b583b..ee3ee39eefa5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1988,16 +1988,26 @@ struct used_address {
 	unsigned int name_len;
 };
 
-static int copy_msghdr_from_user(struct msghdr *kmsg,
-				 struct user_msghdr __user *umsg)
+static ssize_t copy_msghdr_from_user(struct msghdr *kmsg,
+				     struct user_msghdr __user *umsg,
+				     struct sockaddr __user **save_addr,
+				     struct iovec **iov)
 {
-	/* We are relying on the (currently) identical layouts.  Once
-	 * the kernel-side changes, this place will need to be updated
-	 */
-	if (copy_from_user(kmsg, umsg, sizeof(struct msghdr)))
+	struct sockaddr __user *uaddr;
+	struct iovec __user *uiov;
+	ssize_t err;
+
+	if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
+	    __get_user(uaddr, &umsg->msg_name) ||
+	    __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
+	    __get_user(uiov, &umsg->msg_iov) ||
+	    __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
+	    __get_user(kmsg->msg_control, &umsg->msg_control) ||
+	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
+	    __get_user(kmsg->msg_flags, &umsg->msg_flags))
 		return -EFAULT;
 
-	if (kmsg->msg_name == NULL)
+	if (!uaddr)
 		kmsg->msg_namelen = 0;
 
 	if (kmsg->msg_namelen < 0)
@@ -2005,7 +2015,31 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 
 	if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
 		kmsg->msg_namelen = sizeof(struct sockaddr_storage);
-	return 0;
+
+	if (save_addr)
+		*save_addr = uaddr;
+
+	if (uaddr && kmsg->msg_namelen) {
+		if (!save_addr) {
+			err = move_addr_to_kernel(uaddr, kmsg->msg_namelen,
+						  kmsg->msg_name);
+			if (err < 0)
+				return err;
+		}
+	} else {
+		kmsg->msg_name = NULL;
+		kmsg->msg_namelen = 0;
+	}
+
+	if (kmsg->msg_iovlen > UIO_MAXIOV)
+		return -EMSGSIZE;
+
+	err = rw_copy_check_uvector(save_addr ? READ : WRITE,
+				    uiov, kmsg->msg_iovlen,
+				    UIO_FASTIOV, *iov, iov);
+	if (err >= 0)
+		kmsg->msg_iov = *iov;
+	return err;
 }
 
 static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
@@ -2020,26 +2054,17 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 	    __attribute__ ((aligned(sizeof(__kernel_size_t))));
 	/* 20 is size of ipv6_pktinfo */
 	unsigned char *ctl_buf = ctl;
-	int err, ctl_len, total_len;
+	int ctl_len, total_len;
+	ssize_t err;
 
-	err = -EFAULT;
-	if (MSG_CMSG_COMPAT & flags) {
-		if (get_compat_msghdr(msg_sys, msg_compat))
-			return -EFAULT;
-	} else {
-		err = copy_msghdr_from_user(msg_sys, msg);
-		if (err)
-			return err;
-	}
+	msg_sys->msg_name = &address;
 
-	/* This will also move the address data into kernel space */
 	if (MSG_CMSG_COMPAT & flags)
-		err = verify_compat_iovec(msg_sys, iovstack, &address, WRITE);
+		err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov);
 	else
-		err = verify_iovec(msg_sys, iovstack, &address, WRITE);
+		err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov);
 	if (err < 0)
 		goto out_freeiov;
-	iov = msg_sys->msg_iov;
 	total_len = err;
 
 	err = -ENOBUFS;
@@ -2215,36 +2240,24 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	unsigned long cmsg_ptr;
-	int err, total_len, len;
+	int total_len, len;
+	ssize_t err;
 
 	/* kernel mode address */
 	struct sockaddr_storage addr;
 
 	/* user mode address pointers */
 	struct sockaddr __user *uaddr;
-	int __user *uaddr_len;
+	int __user *uaddr_len = COMPAT_NAMELEN(msg);
 
-	if (MSG_CMSG_COMPAT & flags) {
-		if (get_compat_msghdr(msg_sys, msg_compat))
-			return -EFAULT;
-	} else {
-		err = copy_msghdr_from_user(msg_sys, msg);
-		if (err)
-			return err;
-	}
+	msg_sys->msg_name = &addr;
 
-	/* Save the user-mode address (verify_iovec will change the
-	 * kernel msghdr to use the kernel address space)
-	 */
-	uaddr = (__force void __user *)msg_sys->msg_name;
-	uaddr_len = COMPAT_NAMELEN(msg);
 	if (MSG_CMSG_COMPAT & flags)
-		err = verify_compat_iovec(msg_sys, iovstack, &addr, READ);
+		err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov);
 	else
-		err = verify_iovec(msg_sys, iovstack, &addr, READ);
+		err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
 	if (err < 0)
 		goto out_freeiov;
-	iov = msg_sys->msg_iov;
 	total_len = err;
 
 	cmsg_ptr = (unsigned long)msg_sys->msg_control;
-- 
cgit v1.2.3-58-ga151


From 860f6e9eb780443381a76e3766a9698afbc5e2e5 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 19 Nov 2014 12:59:14 +0100
Subject: net: phy: add static data field to struct phy_driver

Add static driver-data field to struct phy_driver, which can be used to
store structured device-type information.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 07794e720139..22af8f8f5802 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -433,6 +433,7 @@ struct phy_device {
  *   by this PHY
  * flags: A bitfield defining certain other features this PHY
  *   supports (like interrupts)
+ * driver_data: static driver data
  *
  * The drivers must implement config_aneg and read_status.  All
  * other functions are optional. Note that none of these
@@ -448,6 +449,7 @@ struct phy_driver {
 	unsigned int phy_id_mask;
 	u32 features;
 	u32 flags;
+	const void *driver_data;
 
 	/*
 	 * Called to issue a PHY software reset
-- 
cgit v1.2.3-58-ga151


From 63f44b2bfccdd98193bbd602747f780c0fae0f02 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 19 Nov 2014 12:59:18 +0100
Subject: net: phy: micrel: add generic clock-mode-select support

Add generic RMII-Reference-Clock-Select support.

Several Micrel PHY have an RMII-Reference-Clock-Select bit to select
25 MHz or 50 MHz clock mode. Recently, support for configuring this
through device tree for KSZ8021 and KSZ8031 was added.

Generalise this support so that it can be configured for other PHY types
as well.

Note that some PHY revisions (of the same type) has this bit inverted.
This should be either configurable through a new device-tree property,
or preferably, determined based on PHY ID if possible.

Also note that this removes support for setting 25 MHz mode from board
files which was also added by the above mentioned commit 45f56cb82e45
("net/phy: micrel: Add clock support for KSZ8021/KSZ8031").

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/micrel.c   | 93 +++++++++++++++++++++++++---------------------
 include/linux/micrel_phy.h |  1 -
 2 files changed, 50 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index a0e944099020..d2e790cd3651 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -57,7 +57,7 @@
 #define KSZPHY_CTRL_INT_ACTIVE_HIGH		BIT(9)
 #define KSZ9021_CTRL_INT_ACTIVE_HIGH		BIT(14)
 #define KS8737_CTRL_INT_ACTIVE_HIGH		BIT(14)
-#define KSZ8051_RMII_50MHZ_CLK			BIT(7)
+#define KSZPHY_RMII_REF_CLK_SEL			BIT(7)
 
 /* Write/read to/from extended registers */
 #define MII_KSZPHY_EXTREG                       0x0b
@@ -76,15 +76,19 @@
 struct kszphy_type {
 	u32 led_mode_reg;
 	bool has_broadcast_disable;
+	bool has_rmii_ref_clk_sel;
 };
 
 struct kszphy_priv {
 	const struct kszphy_type *type;
 	int led_mode;
+	bool rmii_ref_clk_sel;
+	bool rmii_ref_clk_sel_val;
 };
 
 static const struct kszphy_type ksz8021_type = {
 	.led_mode_reg		= MII_KSZPHY_CTRL_2,
+	.has_rmii_ref_clk_sel	= true,
 };
 
 static const struct kszphy_type ksz8041_type = {
@@ -100,21 +104,6 @@ static const struct kszphy_type ksz8081_type = {
 	.has_broadcast_disable	= true,
 };
 
-static int ksz_config_flags(struct phy_device *phydev)
-{
-	int regval;
-
-	if (phydev->dev_flags & (MICREL_PHY_50MHZ_CLK | MICREL_PHY_25MHZ_CLK)) {
-		regval = phy_read(phydev, MII_KSZPHY_CTRL);
-		if (phydev->dev_flags & MICREL_PHY_50MHZ_CLK)
-			regval |= KSZ8051_RMII_50MHZ_CLK;
-		else
-			regval &= ~KSZ8051_RMII_50MHZ_CLK;
-		return phy_write(phydev, MII_KSZPHY_CTRL, regval);
-	}
-	return 0;
-}
-
 static int kszphy_extended_write(struct phy_device *phydev,
 				u32 regnum, u16 val)
 {
@@ -189,6 +178,22 @@ static int ks8737_config_intr(struct phy_device *phydev)
 	return rc < 0 ? rc : 0;
 }
 
+static int kszphy_rmii_clk_sel(struct phy_device *phydev, bool val)
+{
+	int ctrl;
+
+	ctrl = phy_read(phydev, MII_KSZPHY_CTRL);
+	if (ctrl < 0)
+		return ctrl;
+
+	if (val)
+		ctrl |= KSZPHY_RMII_REF_CLK_SEL;
+	else
+		ctrl &= ~KSZPHY_RMII_REF_CLK_SEL;
+
+	return phy_write(phydev, MII_KSZPHY_CTRL, ctrl);
+}
+
 static int kszphy_setup_led(struct phy_device *phydev, u32 reg, int val)
 {
 	int rc, temp, shift;
@@ -243,6 +248,7 @@ static int kszphy_config_init(struct phy_device *phydev)
 {
 	struct kszphy_priv *priv = phydev->priv;
 	const struct kszphy_type *type;
+	int ret;
 
 	if (!priv)
 		return 0;
@@ -252,6 +258,14 @@ static int kszphy_config_init(struct phy_device *phydev)
 	if (type->has_broadcast_disable)
 		kszphy_broadcast_disable(phydev);
 
+	if (priv->rmii_ref_clk_sel) {
+		ret = kszphy_rmii_clk_sel(phydev, priv->rmii_ref_clk_sel_val);
+		if (ret) {
+			dev_err(&phydev->dev, "failed to set rmii reference clock\n");
+			return ret;
+		}
+	}
+
 	if (priv->led_mode >= 0)
 		kszphy_setup_led(phydev, type->led_mode_reg, priv->led_mode);
 
@@ -262,10 +276,8 @@ static int ksz8021_config_init(struct phy_device *phydev)
 {
 	int rc;
 
-	kszphy_config_init(phydev);
-
-	rc = ksz_config_flags(phydev);
-	if (rc < 0)
+	rc = kszphy_config_init(phydev);
+	if (rc)
 		return rc;
 
 	rc = kszphy_broadcast_disable(phydev);
@@ -273,16 +285,6 @@ static int ksz8021_config_init(struct phy_device *phydev)
 	return rc < 0 ? rc : 0;
 }
 
-static int ks8051_config_init(struct phy_device *phydev)
-{
-	int rc;
-
-	kszphy_config_init(phydev);
-
-	rc = ksz_config_flags(phydev);
-	return rc < 0 ? rc : 0;
-}
-
 static int ksz9021_load_values_from_of(struct phy_device *phydev,
 				       struct device_node *of_node, u16 reg,
 				       char *field1, char *field2,
@@ -517,6 +519,7 @@ static int kszphy_probe(struct phy_device *phydev)
 	const struct kszphy_type *type = phydev->drv->driver_data;
 	struct device_node *np = phydev->dev.of_node;
 	struct kszphy_priv *priv;
+	struct clk *clk;
 	int ret;
 
 	priv = devm_kzalloc(&phydev->dev, sizeof(*priv), GFP_KERNEL);
@@ -542,28 +545,32 @@ static int kszphy_probe(struct phy_device *phydev)
 		priv->led_mode = -1;
 	}
 
-	return 0;
-}
-
-static int ksz8021_probe(struct phy_device *phydev)
-{
-	struct clk *clk;
-
 	clk = devm_clk_get(&phydev->dev, "rmii-ref");
 	if (!IS_ERR(clk)) {
 		unsigned long rate = clk_get_rate(clk);
 
+		priv->rmii_ref_clk_sel = type->has_rmii_ref_clk_sel;
+
+		/* FIXME: add support for PHY revisions that have this bit
+		 * inverted (e.g. through new property or based on PHY ID).
+		 */
 		if (rate > 24500000 && rate < 25500000) {
-			phydev->dev_flags |= MICREL_PHY_25MHZ_CLK;
+			priv->rmii_ref_clk_sel_val = false;
 		} else if (rate > 49500000 && rate < 50500000) {
-			phydev->dev_flags |= MICREL_PHY_50MHZ_CLK;
+			priv->rmii_ref_clk_sel_val = true;
 		} else {
 			dev_err(&phydev->dev, "Clock rate out of range: %ld\n", rate);
 			return -EINVAL;
 		}
 	}
 
-	return kszphy_probe(phydev);
+	/* Support legacy board-file configuration */
+	if (phydev->dev_flags & MICREL_PHY_50MHZ_CLK) {
+		priv->rmii_ref_clk_sel = true;
+		priv->rmii_ref_clk_sel_val = true;
+	}
+
+	return 0;
 }
 
 static struct phy_driver ksphy_driver[] = {
@@ -589,7 +596,7 @@ static struct phy_driver ksphy_driver[] = {
 			   SUPPORTED_Asym_Pause),
 	.flags		= PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
 	.driver_data	= &ksz8021_type,
-	.probe		= ksz8021_probe,
+	.probe		= kszphy_probe,
 	.config_init	= ksz8021_config_init,
 	.config_aneg	= genphy_config_aneg,
 	.read_status	= genphy_read_status,
@@ -606,7 +613,7 @@ static struct phy_driver ksphy_driver[] = {
 			   SUPPORTED_Asym_Pause),
 	.flags		= PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
 	.driver_data	= &ksz8021_type,
-	.probe		= ksz8021_probe,
+	.probe		= kszphy_probe,
 	.config_init	= ksz8021_config_init,
 	.config_aneg	= genphy_config_aneg,
 	.read_status	= genphy_read_status,
@@ -658,7 +665,7 @@ static struct phy_driver ksphy_driver[] = {
 	.flags		= PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
 	.driver_data	= &ksz8051_type,
 	.probe		= kszphy_probe,
-	.config_init	= ks8051_config_init,
+	.config_init	= kszphy_config_init,
 	.config_aneg	= genphy_config_aneg,
 	.read_status	= genphy_read_status,
 	.ack_interrupt	= kszphy_ack_interrupt,
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 53d33dee70e1..2e5b194b9b19 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -37,7 +37,6 @@
 
 /* struct phy_device dev_flags definitions */
 #define MICREL_PHY_50MHZ_CLK	0x00000001
-#define MICREL_PHY_25MHZ_CLK	0x00000002
 
 #define MICREL_KSZ9021_EXTREG_CTRL	0xB
 #define MICREL_KSZ9021_EXTREG_DATA_WRITE	0xC
-- 
cgit v1.2.3-58-ga151


From b960a0ac6939ef4962c5abbf33e80d1382b45fc1 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 19 Nov 2014 14:04:56 +0100
Subject: vlan: make __vlan_hwaccel_put_tag return void

Always returns the same skb it gets, so change to void.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/fcoe/fcoe.c | 6 ++----
 include/linux/if_vlan.h  | 9 ++++-----
 net/8021q/vlan_dev.c     | 2 +-
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 4a8ac7d8c76b..73a8cc485f87 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -1669,10 +1669,8 @@ static int fcoe_xmit(struct fc_lport *lport, struct fc_frame *fp)
 	    fcoe->realdev->features & NETIF_F_HW_VLAN_CTAG_TX) {
 		/* must set skb->dev before calling vlan_put_tag */
 		skb->dev = fcoe->realdev;
-		skb = __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
-					     vlan_dev_vlan_id(fcoe->netdev));
-		if (!skb)
-			return -ENOMEM;
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+				       vlan_dev_vlan_id(fcoe->netdev));
 	} else
 		skb->dev = fcoe->netdev;
 
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index d69f0577a319..1b5dbc2711f9 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -347,13 +347,11 @@ static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb,
  *
  * Puts the VLAN TCI in @skb->vlan_tci and lets the device do the rest
  */
-static inline struct sk_buff *__vlan_hwaccel_put_tag(struct sk_buff *skb,
-						     __be16 vlan_proto,
-						     u16 vlan_tci)
+static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
+					  __be16 vlan_proto, u16 vlan_tci)
 {
 	skb->vlan_proto = vlan_proto;
 	skb->vlan_tci = VLAN_TAG_PRESENT | vlan_tci;
-	return skb;
 }
 
 /**
@@ -368,7 +366,8 @@ static inline struct sk_buff *vlan_put_tag(struct sk_buff *skb,
 					   __be16 vlan_proto, u16 vlan_tci)
 {
 	if (vlan_hw_offload_capable(skb->dev->features, vlan_proto)) {
-		return __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+		__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+		return skb;
 	} else {
 		return __vlan_put_tag(skb, vlan_proto, vlan_tci);
 	}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 0d441ec8763e..d6524b2b206a 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -150,7 +150,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 		u16 vlan_tci;
 		vlan_tci = vlan->vlan_id;
 		vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority);
-		skb = __vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci);
+		__vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci);
 	}
 
 	skb->dev = vlan->real_dev;
-- 
cgit v1.2.3-58-ga151


From b4bef1b57544b18899eb15569e3bafd8d2eeeff6 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 19 Nov 2014 14:04:57 +0100
Subject: vlan: kill vlan_put_tag helper

Since both tx and rx paths work with skb->vlan_tci, there's no need for
this function anymore. Switch users directly to __vlan_hwaccel_put_tag.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_alb.c  | 17 ++++-------------
 drivers/net/bonding/bond_main.c |  8 ++------
 drivers/net/usb/cdc_mbim.c      |  2 +-
 include/linux/if_vlan.h         | 19 -------------------
 4 files changed, 7 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index e1f1a006af85..bb9e9fc45e1b 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -475,12 +475,8 @@ static void rlb_update_client(struct rlb_client_info *client_info)
 		skb->dev = client_info->slave->dev;
 
 		if (client_info->vlan_id) {
-			skb = vlan_put_tag(skb, htons(ETH_P_8021Q), client_info->vlan_id);
-			if (!skb) {
-				netdev_err(client_info->slave->bond->dev,
-					   "failed to insert VLAN tag\n");
-				continue;
-			}
+			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+					       client_info->vlan_id);
 		}
 
 		arp_xmit(skb);
@@ -951,13 +947,8 @@ static void alb_send_lp_vid(struct slave *slave, u8 mac_addr[],
 	skb->priority = TC_PRIO_CONTROL;
 	skb->dev = slave->dev;
 
-	if (vid) {
-		skb = vlan_put_tag(skb, vlan_proto, vid);
-		if (!skb) {
-			netdev_err(slave->bond->dev, "failed to insert VLAN tag\n");
-			return;
-		}
-	}
+	if (vid)
+		__vlan_hwaccel_put_tag(skb, vlan_proto, vid);
 
 	dev_queue_xmit(skb);
 }
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8575fee8b359..e26c68232032 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2159,12 +2159,8 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op,
 	if (outer_tag->vlan_id) {
 		netdev_dbg(slave_dev, "outer tag: proto %X vid %X\n",
 			   ntohs(outer_tag->vlan_proto), outer_tag->vlan_id);
-		skb = vlan_put_tag(skb, outer_tag->vlan_proto,
-				   outer_tag->vlan_id);
-		if (!skb) {
-			net_err_ratelimited("failed to insert outer VLAN tag\n");
-			return;
-		}
+		__vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto,
+				       outer_tag->vlan_id);
 	}
 
 xmit:
diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c
index 5ee7a1dbc023..96fc8a5bde84 100644
--- a/drivers/net/usb/cdc_mbim.c
+++ b/drivers/net/usb/cdc_mbim.c
@@ -402,7 +402,7 @@ static struct sk_buff *cdc_mbim_process_dgram(struct usbnet *dev, u8 *buf, size_
 
 	/* map MBIM session to VLAN */
 	if (tci)
-		vlan_put_tag(skb, htons(ETH_P_8021Q), tci);
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), tci);
 err:
 	return skb;
 }
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 1b5dbc2711f9..75b70a5e4a6d 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -354,25 +354,6 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
 	skb->vlan_tci = VLAN_TAG_PRESENT | vlan_tci;
 }
 
-/**
- * vlan_put_tag - inserts VLAN tag according to device features
- * @skb: skbuff to tag
- * @vlan_tci: VLAN TCI to insert
- *
- * Assumes skb->dev is the target that will xmit this frame.
- * Returns a VLAN tagged skb.
- */
-static inline struct sk_buff *vlan_put_tag(struct sk_buff *skb,
-					   __be16 vlan_proto, u16 vlan_tci)
-{
-	if (vlan_hw_offload_capable(skb->dev->features, vlan_proto)) {
-		__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
-		return skb;
-	} else {
-		return __vlan_put_tag(skb, vlan_proto, vlan_tci);
-	}
-}
-
 /**
  * __vlan_get_tag - get the VLAN ID that is part of the payload
  * @skb: skbuff to query
-- 
cgit v1.2.3-58-ga151


From 62749e2cb3c4a7da3eaa5c01a7e787aebeff8536 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 19 Nov 2014 14:04:58 +0100
Subject: vlan: rename __vlan_put_tag to vlan_insert_tag_set_proto

Name fits better. Plus there's going to be introduced
__vlan_insert_tag later on.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c             |  4 ++--
 drivers/net/ethernet/emulex/benet/be_main.c |  6 ++++--
 drivers/net/vxlan.c                         | 12 ++++++------
 include/linux/if_vlan.h                     |  8 +++++---
 net/bridge/br_vlan.c                        |  4 ++--
 net/core/dev.c                              |  4 ++--
 net/core/netpoll.c                          |  4 ++--
 net/ipv4/geneve.c                           | 11 +++++------
 net/openvswitch/actions.c                   |  4 +++-
 net/openvswitch/datapath.c                  |  3 ++-
 net/openvswitch/vport-gre.c                 |  6 +++---
 11 files changed, 36 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e26c68232032..c1d7da427a3e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2146,8 +2146,8 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op,
 
 		netdev_dbg(slave_dev, "inner tag: proto %X vid %X\n",
 			   ntohs(outer_tag->vlan_proto), tags->vlan_id);
-		skb = __vlan_put_tag(skb, tags->vlan_proto,
-				     tags->vlan_id);
+		skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto,
+						tags->vlan_id);
 		if (!skb) {
 			net_err_ratelimited("failed to insert inner VLAN tag\n");
 			return;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 54160cc62656..d02fbc7694ea 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -887,7 +887,8 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter,
 	}
 
 	if (vlan_tag) {
-		skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
+		skb = vlan_insert_tag_set_proto(skb, htons(ETH_P_8021Q),
+						vlan_tag);
 		if (unlikely(!skb))
 			return skb;
 		skb->vlan_tci = 0;
@@ -896,7 +897,8 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter,
 	/* Insert the outer VLAN, if any */
 	if (adapter->qnq_vid) {
 		vlan_tag = adapter->qnq_vid;
-		skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
+		skb = vlan_insert_tag_set_proto(skb, htons(ETH_P_8021Q),
+						vlan_tag);
 		if (unlikely(!skb))
 			return skb;
 		if (skip_hw_vlan)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 23b1e8c0d547..bb8fbab438e8 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1600,9 +1600,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
 		return err;
 
 	if (vlan_tx_tag_present(skb)) {
-		if (WARN_ON(!__vlan_put_tag(skb,
-					    skb->vlan_proto,
-					    vlan_tx_tag_get(skb))))
+		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+						vlan_tx_tag_get(skb));
+		if (WARN_ON(!skb))
 			return -ENOMEM;
 
 		skb->vlan_tci = 0;
@@ -1644,9 +1644,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 		return err;
 
 	if (vlan_tx_tag_present(skb)) {
-		if (WARN_ON(!__vlan_put_tag(skb,
-					    skb->vlan_proto,
-					    vlan_tx_tag_get(skb))))
+		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+						vlan_tx_tag_get(skb));
+		if (WARN_ON(!skb))
 			return -ENOMEM;
 
 		skb->vlan_tci = 0;
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 75b70a5e4a6d..46e4a15b9b55 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -320,8 +320,9 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
 }
 
 /**
- * __vlan_put_tag - regular VLAN tag inserting
+ * vlan_insert_tag_set_proto - regular VLAN tag inserting
  * @skb: skbuff to tag
+ * @vlan_proto: VLAN encapsulation protocol
  * @vlan_tci: VLAN TCI to insert
  *
  * Inserts the VLAN tag into @skb as part of the payload
@@ -330,8 +331,9 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
  * Following the skb_unshare() example, in case of error, the calling function
  * doesn't have to worry about freeing the original skb.
  */
-static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb,
-					     __be16 vlan_proto, u16 vlan_tci)
+static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
+							__be16 vlan_proto,
+							u16 vlan_tci)
 {
 	skb = vlan_insert_tag(skb, vlan_proto, vlan_tci);
 	if (skb)
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 150048fb99b0..97b8ddf57363 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -199,8 +199,8 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
 		if (skb->vlan_proto != proto) {
 			/* Protocol-mismatch, empty out vlan_tci for new tag */
 			skb_push(skb, ETH_HLEN);
-			skb = __vlan_put_tag(skb, skb->vlan_proto,
-					     vlan_tx_tag_get(skb));
+			skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+							vlan_tx_tag_get(skb));
 			if (unlikely(!skb))
 				return false;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 1ab168e0fdf7..3611e60df407 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2645,8 +2645,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 {
 	if (vlan_tx_tag_present(skb) &&
 	    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
-		skb = __vlan_put_tag(skb, skb->vlan_proto,
-				     vlan_tx_tag_get(skb));
+		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+						vlan_tx_tag_get(skb));
 		if (skb)
 			skb->vlan_tci = 0;
 	}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e6645b4f330a..65d372384a3f 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -79,8 +79,8 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
 
 	if (vlan_tx_tag_present(skb) &&
 	    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
-		skb = __vlan_put_tag(skb, skb->vlan_proto,
-				     vlan_tx_tag_get(skb));
+		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+						vlan_tx_tag_get(skb));
 		if (unlikely(!skb)) {
 			/* This is actually a packet drop, but we
 			 * don't want the code that calls this
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 31802afce34f..fd430a6a1c37 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -132,12 +132,11 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 		return err;
 
 	if (vlan_tx_tag_present(skb)) {
-		if (unlikely(!__vlan_put_tag(skb,
-					     skb->vlan_proto,
-					     vlan_tx_tag_get(skb)))) {
-			err = -ENOMEM;
-			return err;
-		}
+		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+						vlan_tx_tag_get(skb));
+		if (unlikely(!skb)
+			return -ENOMEM;
+
 		skb->vlan_tci = 0;
 	}
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 749a30163071..426b9131ede0 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -287,7 +287,9 @@ static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
 		/* push down current VLAN tag */
 		current_tag = vlan_tx_tag_get(skb);
 
-		if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
+		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+						current_tag);
+		if (!skb)
 			return -ENOMEM;
 		/* Update mac_len for subsequent MPLS actions */
 		skb->mac_len += VLAN_HLEN;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ab141d49bb9d..c63e60e4d947 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -425,7 +425,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 		if (!nskb)
 			return -ENOMEM;
 
-		nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb));
+		nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto,
+						 vlan_tx_tag_get(nskb));
 		if (!nskb)
 			return -ENOMEM;
 
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 8e61a5c6ae7c..777cd8c71d53 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -176,9 +176,9 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 	}
 
 	if (vlan_tx_tag_present(skb)) {
-		if (unlikely(!__vlan_put_tag(skb,
-					     skb->vlan_proto,
-					     vlan_tx_tag_get(skb)))) {
+		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+						vlan_tx_tag_get(skb));
+		if (unlikely(!skb) {
 			err = -ENOMEM;
 			goto err_free_rt;
 		}
-- 
cgit v1.2.3-58-ga151


From 5968250c868ceee680aa77395b24e6ddcae17d36 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 19 Nov 2014 14:04:59 +0100
Subject: vlan: introduce *vlan_hwaccel_push_inside helpers

Use them to push skb->vlan_tci into the payload and avoid code
duplication.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c         | 22 ++++++----------------
 include/linux/if_vlan.h     | 34 ++++++++++++++++++++++++++++++++++
 net/core/dev.c              |  8 ++------
 net/core/netpoll.c          |  4 +---
 net/ipv4/geneve.c           | 11 +++--------
 net/openvswitch/datapath.c  |  4 +---
 net/openvswitch/vport-gre.c | 12 ++++--------
 7 files changed, 51 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index bb8fbab438e8..64d45fa3d997 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1599,14 +1599,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
 	if (unlikely(err))
 		return err;
 
-	if (vlan_tx_tag_present(skb)) {
-		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-						vlan_tx_tag_get(skb));
-		if (WARN_ON(!skb))
-			return -ENOMEM;
-
-		skb->vlan_tci = 0;
-	}
+	skb = vlan_hwaccel_push_inside(skb);
+	if (WARN_ON(!skb))
+		return -ENOMEM;
 
 	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
 	vxh->vx_flags = htonl(VXLAN_FLAGS);
@@ -1643,14 +1638,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 	if (unlikely(err))
 		return err;
 
-	if (vlan_tx_tag_present(skb)) {
-		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-						vlan_tx_tag_get(skb));
-		if (WARN_ON(!skb))
-			return -ENOMEM;
-
-		skb->vlan_tci = 0;
-	}
+	skb = vlan_hwaccel_push_inside(skb);
+	if (WARN_ON(!skb))
+		return -ENOMEM;
 
 	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
 	vxh->vx_flags = htonl(VXLAN_FLAGS);
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 46e4a15b9b55..291e6706876e 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -341,6 +341,40 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
 	return skb;
 }
 
+/*
+ * __vlan_hwaccel_push_inside - pushes vlan tag to the payload
+ * @skb: skbuff to tag
+ *
+ * Pushes the VLAN tag from @skb->vlan_tci inside to the payload.
+ *
+ * Following the skb_unshare() example, in case of error, the calling function
+ * doesn't have to worry about freeing the original skb.
+ */
+static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
+{
+	skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+					vlan_tx_tag_get(skb));
+	if (likely(skb))
+		skb->vlan_tci = 0;
+	return skb;
+}
+/*
+ * vlan_hwaccel_push_inside - pushes vlan tag to the payload
+ * @skb: skbuff to tag
+ *
+ * Checks is tag is present in @skb->vlan_tci and if it is, it pushes the
+ * VLAN tag from @skb->vlan_tci inside to the payload.
+ *
+ * Following the skb_unshare() example, in case of error, the calling function
+ * doesn't have to worry about freeing the original skb.
+ */
+static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb)
+{
+	if (vlan_tx_tag_present(skb))
+		skb = __vlan_hwaccel_push_inside(skb);
+	return skb;
+}
+
 /**
  * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
  * @skb: skbuff to tag
diff --git a/net/core/dev.c b/net/core/dev.c
index 3611e60df407..ac4836241a96 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2644,12 +2644,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 					  netdev_features_t features)
 {
 	if (vlan_tx_tag_present(skb) &&
-	    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
-		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-						vlan_tx_tag_get(skb));
-		if (skb)
-			skb->vlan_tci = 0;
-	}
+	    !vlan_hw_offload_capable(features, skb->vlan_proto))
+		skb = __vlan_hwaccel_push_inside(skb);
 	return skb;
 }
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 65d372384a3f..e0ad5d16c9c5 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -79,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
 
 	if (vlan_tx_tag_present(skb) &&
 	    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
-		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-						vlan_tx_tag_get(skb));
+		skb = __vlan_hwaccel_push_inside(skb);
 		if (unlikely(!skb)) {
 			/* This is actually a packet drop, but we
 			 * don't want the code that calls this
@@ -88,7 +87,6 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			 */
 			goto out;
 		}
-		skb->vlan_tci = 0;
 	}
 
 	status = netdev_start_xmit(skb, dev, txq, false);
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index fd430a6a1c37..a457232f0131 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -131,14 +131,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 	if (unlikely(err))
 		return err;
 
-	if (vlan_tx_tag_present(skb)) {
-		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-						vlan_tx_tag_get(skb));
-		if (unlikely(!skb)
-			return -ENOMEM;
-
-		skb->vlan_tci = 0;
-	}
+	skb = vlan_hwaccel_push_inside(skb);
+	if (unlikely(!skb))
+		return -ENOMEM;
 
 	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
 	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index c63e60e4d947..f37ca3e5824c 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -425,12 +425,10 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 		if (!nskb)
 			return -ENOMEM;
 
-		nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto,
-						 vlan_tx_tag_get(nskb));
+		nskb = __vlan_hwaccel_push_inside(nskb);
 		if (!nskb)
 			return -ENOMEM;
 
-		nskb->vlan_tci = 0;
 		skb = nskb;
 	}
 
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 777cd8c71d53..6b69df545b1d 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -175,14 +175,10 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 			goto err_free_rt;
 	}
 
-	if (vlan_tx_tag_present(skb)) {
-		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-						vlan_tx_tag_get(skb));
-		if (unlikely(!skb) {
-			err = -ENOMEM;
-			goto err_free_rt;
-		}
-		skb->vlan_tci = 0;
+	skb = vlan_hwaccel_push_inside(skb);
+	if (unlikely(!skb)) {
+		err = -ENOMEM;
+		goto err_free_rt;
 	}
 
 	/* Push Tunnel header. */
-- 
cgit v1.2.3-58-ga151


From 15255a43e6c917813800702e100267046e240cc0 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 19 Nov 2014 14:05:00 +0100
Subject: vlan: introduce __vlan_insert_tag helper which does not free skb

There's a need for helper which inserts vlan tag but does not free the
skb in case of an error.

Suggested-by: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 45 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 291e6706876e..515a35e2a48a 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -282,28 +282,24 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features,
 }
 
 /**
- * vlan_insert_tag - regular VLAN tag inserting
+ * __vlan_insert_tag - regular VLAN tag inserting
  * @skb: skbuff to tag
  * @vlan_proto: VLAN encapsulation protocol
  * @vlan_tci: VLAN TCI to insert
  *
  * Inserts the VLAN tag into @skb as part of the payload
- * Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
- *
- * Following the skb_unshare() example, in case of error, the calling function
- * doesn't have to worry about freeing the original skb.
+ * Returns error if skb_cow_head failes.
  *
  * Does not change skb->protocol so this function can be used during receive.
  */
-static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
-					      __be16 vlan_proto, u16 vlan_tci)
+static inline int __vlan_insert_tag(struct sk_buff *skb,
+				    __be16 vlan_proto, u16 vlan_tci)
 {
 	struct vlan_ethhdr *veth;
 
-	if (skb_cow_head(skb, VLAN_HLEN) < 0) {
-		dev_kfree_skb_any(skb);
-		return NULL;
-	}
+	if (skb_cow_head(skb, VLAN_HLEN) < 0)
+		return -ENOMEM;
+
 	veth = (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN);
 
 	/* Move the mac addresses to the beginning of the new header. */
@@ -316,6 +312,33 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
 	/* now, the TCI */
 	veth->h_vlan_TCI = htons(vlan_tci);
 
+	return 0;
+}
+
+/**
+ * vlan_insert_tag - regular VLAN tag inserting
+ * @skb: skbuff to tag
+ * @vlan_proto: VLAN encapsulation protocol
+ * @vlan_tci: VLAN TCI to insert
+ *
+ * Inserts the VLAN tag into @skb as part of the payload
+ * Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
+ *
+ * Following the skb_unshare() example, in case of error, the calling function
+ * doesn't have to worry about freeing the original skb.
+ *
+ * Does not change skb->protocol so this function can be used during receive.
+ */
+static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
+					      __be16 vlan_proto, u16 vlan_tci)
+{
+	int err;
+
+	err = __vlan_insert_tag(skb, vlan_proto, vlan_tci);
+	if (err) {
+		dev_kfree_skb_any(skb);
+		return NULL;
+	}
 	return skb;
 }
 
-- 
cgit v1.2.3-58-ga151


From e21951212f03b8d805795d8f71206853b2ab344d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 19 Nov 2014 14:05:01 +0100
Subject: net: move make_writable helper into common code

note that skb_make_writable already exists in net/netfilter/core.c
but does something slightly different.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  1 +
 net/core/skbuff.c         | 12 ++++++++++++
 net/openvswitch/actions.c | 39 ++++++++++++++-------------------------
 3 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 73c370e615de..e045516891a9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2678,6 +2678,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet);
 unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
+int skb_ensure_writable(struct sk_buff *skb, int write_len);
 
 struct skb_checksum_ops {
 	__wsum (*update)(const void *mem, int len, __wsum wsum);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 700189604f3d..d11bbe0da355 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4151,6 +4151,18 @@ err_free:
 }
 EXPORT_SYMBOL(skb_vlan_untag);
 
+int skb_ensure_writable(struct sk_buff *skb, int write_len)
+{
+	if (!pskb_may_pull(skb, write_len))
+		return -ENOMEM;
+
+	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+		return 0;
+
+	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 426b9131ede0..7ffa37716265 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -119,17 +119,6 @@ static bool is_flow_key_valid(const struct sw_flow_key *key)
 	return !!key->eth.type;
 }
 
-static int make_writable(struct sk_buff *skb, int write_len)
-{
-	if (!pskb_may_pull(skb, write_len))
-		return -ENOMEM;
-
-	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
-		return 0;
-
-	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
-}
-
 static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_mpls *mpls)
 {
@@ -171,7 +160,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 	struct ethhdr *hdr;
 	int err;
 
-	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
 	if (unlikely(err))
 		return err;
 
@@ -201,7 +190,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 	__be32 *stack;
 	int err;
 
-	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
 	if (unlikely(err))
 		return err;
 
@@ -223,7 +212,7 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
 	struct vlan_hdr *vhdr;
 	int err;
 
-	err = make_writable(skb, VLAN_ETH_HLEN);
+	err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
 	if (unlikely(err))
 		return err;
 
@@ -310,7 +299,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *key,
 			const struct ovs_key_ethernet *eth_key)
 {
 	int err;
-	err = make_writable(skb, ETH_HLEN);
+	err = skb_ensure_writable(skb, ETH_HLEN);
 	if (unlikely(err))
 		return err;
 
@@ -412,8 +401,8 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *key,
 	struct iphdr *nh;
 	int err;
 
-	err = make_writable(skb, skb_network_offset(skb) +
-				 sizeof(struct iphdr));
+	err = skb_ensure_writable(skb, skb_network_offset(skb) +
+				  sizeof(struct iphdr));
 	if (unlikely(err))
 		return err;
 
@@ -450,8 +439,8 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *key,
 	__be32 *saddr;
 	__be32 *daddr;
 
-	err = make_writable(skb, skb_network_offset(skb) +
-			    sizeof(struct ipv6hdr));
+	err = skb_ensure_writable(skb, skb_network_offset(skb) +
+				  sizeof(struct ipv6hdr));
 	if (unlikely(err))
 		return err;
 
@@ -493,7 +482,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
-/* Must follow make_writable() since that can move the skb data. */
+/* Must follow skb_ensure_writable() since that can move the skb data. */
 static void set_tp_port(struct sk_buff *skb, __be16 *port,
 			 __be16 new_port, __sum16 *check)
 {
@@ -523,8 +512,8 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *key,
 	struct udphdr *uh;
 	int err;
 
-	err = make_writable(skb, skb_transport_offset(skb) +
-				 sizeof(struct udphdr));
+	err = skb_ensure_writable(skb, skb_transport_offset(skb) +
+				  sizeof(struct udphdr));
 	if (unlikely(err))
 		return err;
 
@@ -548,8 +537,8 @@ static int set_tcp(struct sk_buff *skb, struct sw_flow_key *key,
 	struct tcphdr *th;
 	int err;
 
-	err = make_writable(skb, skb_transport_offset(skb) +
-				 sizeof(struct tcphdr));
+	err = skb_ensure_writable(skb, skb_transport_offset(skb) +
+				  sizeof(struct tcphdr));
 	if (unlikely(err))
 		return err;
 
@@ -574,7 +563,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *key,
 	int err;
 	unsigned int sctphoff = skb_transport_offset(skb);
 
-	err = make_writable(skb, sctphoff + sizeof(struct sctphdr));
+	err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr));
 	if (unlikely(err))
 		return err;
 
-- 
cgit v1.2.3-58-ga151


From 93515d53b133d66f01aec7b231fa3e40e3d2fd9a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 19 Nov 2014 14:05:02 +0100
Subject: net: move vlan pop/push functions into common code

So it can be used from out of openvswitch code.
Did couple of cosmetic changes on the way, namely variable naming and
adding support for 8021AD proto.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  2 +
 net/core/skbuff.c         | 95 +++++++++++++++++++++++++++++++++++++++++++++++
 net/openvswitch/actions.c | 86 +++++-------------------------------------
 3 files changed, 106 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e045516891a9..78c299f40bac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2679,6 +2679,8 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, int write_len);
+int skb_vlan_pop(struct sk_buff *skb);
+int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 
 struct skb_checksum_ops {
 	__wsum (*update)(const void *mem, int len, __wsum wsum);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d11bbe0da355..c906c5f4bf69 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4163,6 +4163,101 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len)
 }
 EXPORT_SYMBOL(skb_ensure_writable);
 
+/* remove VLAN header from packet and update csum accordingly. */
+static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+{
+	struct vlan_hdr *vhdr;
+	unsigned int offset = skb->data - skb_mac_header(skb);
+	int err;
+
+	__skb_push(skb, offset);
+	err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
+	if (unlikely(err))
+		goto pull;
+
+	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+
+	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
+
+	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+	__skb_pull(skb, VLAN_HLEN);
+
+	vlan_set_encap_proto(skb, vhdr);
+	skb->mac_header += VLAN_HLEN;
+
+	if (skb_network_offset(skb) < ETH_HLEN)
+		skb_set_network_header(skb, ETH_HLEN);
+
+	skb_reset_mac_len(skb);
+pull:
+	__skb_pull(skb, offset);
+
+	return err;
+}
+
+int skb_vlan_pop(struct sk_buff *skb)
+{
+	u16 vlan_tci;
+	__be16 vlan_proto;
+	int err;
+
+	if (likely(vlan_tx_tag_present(skb))) {
+		skb->vlan_tci = 0;
+	} else {
+		if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
+			      skb->protocol != htons(ETH_P_8021AD)) ||
+			     skb->len < VLAN_ETH_HLEN))
+			return 0;
+
+		err = __skb_vlan_pop(skb, &vlan_tci);
+		if (err)
+			return err;
+	}
+	/* move next vlan tag to hw accel tag */
+	if (likely((skb->protocol != htons(ETH_P_8021Q) &&
+		    skb->protocol != htons(ETH_P_8021AD)) ||
+		   skb->len < VLAN_ETH_HLEN))
+		return 0;
+
+	vlan_proto = skb->protocol;
+	err = __skb_vlan_pop(skb, &vlan_tci);
+	if (unlikely(err))
+		return err;
+
+	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+	return 0;
+}
+EXPORT_SYMBOL(skb_vlan_pop);
+
+int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+{
+	if (vlan_tx_tag_present(skb)) {
+		unsigned int offset = skb->data - skb_mac_header(skb);
+		int err;
+
+		/* __vlan_insert_tag expect skb->data pointing to mac header.
+		 * So change skb->data before calling it and change back to
+		 * original position later
+		 */
+		__skb_push(skb, offset);
+		err = __vlan_insert_tag(skb, skb->vlan_proto,
+					vlan_tx_tag_get(skb));
+		if (err)
+			return err;
+		skb->protocol = skb->vlan_proto;
+		skb->mac_len += VLAN_HLEN;
+		__skb_pull(skb, offset);
+
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->csum = csum_add(skb->csum, csum_partial(skb->data
+					+ (2 * ETH_ALEN), VLAN_HLEN, 0));
+	}
+	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+	return 0;
+}
+EXPORT_SYMBOL(skb_vlan_push);
+
 /**
  * alloc_skb_with_frags - allocate skb with page frags
  *
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 7ffa37716265..4e05ea1c2d11 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -206,93 +206,27 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
-/* remove VLAN header from packet and update csum accordingly. */
-static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
-{
-	struct vlan_hdr *vhdr;
-	int err;
-
-	err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
-	if (unlikely(err))
-		return err;
-
-	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
-
-	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
-	*current_tci = vhdr->h_vlan_TCI;
-
-	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
-	__skb_pull(skb, VLAN_HLEN);
-
-	vlan_set_encap_proto(skb, vhdr);
-	skb->mac_header += VLAN_HLEN;
-
-	if (skb_network_offset(skb) < ETH_HLEN)
-		skb_set_network_header(skb, ETH_HLEN);
-
-	/* Update mac_len for subsequent MPLS actions */
-	skb_reset_mac_len(skb);
-	return 0;
-}
-
 static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 {
-	__be16 tci;
 	int err;
 
-	if (likely(vlan_tx_tag_present(skb))) {
-		skb->vlan_tci = 0;
-	} else {
-		if (unlikely(skb->protocol != htons(ETH_P_8021Q) ||
-			     skb->len < VLAN_ETH_HLEN))
-			return 0;
-
-		err = __pop_vlan_tci(skb, &tci);
-		if (err)
-			return err;
-	}
-	/* move next vlan tag to hw accel tag */
-	if (likely(skb->protocol != htons(ETH_P_8021Q) ||
-		   skb->len < VLAN_ETH_HLEN)) {
+	err = skb_vlan_pop(skb);
+	if (vlan_tx_tag_present(skb))
+		invalidate_flow_key(key);
+	else
 		key->eth.tci = 0;
-		return 0;
-	}
-
-	invalidate_flow_key(key);
-	err = __pop_vlan_tci(skb, &tci);
-	if (unlikely(err))
-		return err;
-
-	__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci));
-	return 0;
+	return err;
 }
 
 static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
 		     const struct ovs_action_push_vlan *vlan)
 {
-	if (unlikely(vlan_tx_tag_present(skb))) {
-		u16 current_tag;
-
-		/* push down current VLAN tag */
-		current_tag = vlan_tx_tag_get(skb);
-
-		skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
-						current_tag);
-		if (!skb)
-			return -ENOMEM;
-		/* Update mac_len for subsequent MPLS actions */
-		skb->mac_len += VLAN_HLEN;
-
-		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->csum = csum_add(skb->csum, csum_partial(skb->data
-					+ (2 * ETH_ALEN), VLAN_HLEN, 0));
-
+	if (vlan_tx_tag_present(skb))
 		invalidate_flow_key(key);
-	} else {
+	else
 		key->eth.tci = vlan->vlan_tci;
-	}
-	__vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
-	return 0;
+	return skb_vlan_push(skb, vlan->vlan_tpid,
+			     ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
 }
 
 static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *key,
@@ -858,8 +792,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 
 		case OVS_ACTION_ATTR_PUSH_VLAN:
 			err = push_vlan(skb, key, nla_data(a));
-			if (unlikely(err)) /* skb already freed. */
-				return err;
 			break;
 
 		case OVS_ACTION_ATTR_POP_VLAN:
-- 
cgit v1.2.3-58-ga151


From 479163f4608214d18bc3266ab6e4b578897a3052 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Thu, 20 Nov 2014 08:13:57 +0000
Subject: mlx5: don't duplicate kvfree()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/cq.c                     | 8 ++++----
 drivers/infiniband/hw/mlx5/mr.c                     | 4 ++--
 drivers/infiniband/hw/mlx5/qp.c                     | 8 ++++----
 drivers/infiniband/hw/mlx5/srq.c                    | 6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c        | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/port.c      | 4 ++--
 include/linux/mlx5/driver.h                         | 8 --------
 8 files changed, 19 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 10cfce5119a9..c463e7bba5f4 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -805,14 +805,14 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
 		}
 
 
-	mlx5_vfree(cqb);
+	kvfree(cqb);
 	return &cq->ibcq;
 
 err_cmd:
 	mlx5_core_destroy_cq(dev->mdev, &cq->mcq);
 
 err_cqb:
-	mlx5_vfree(cqb);
+	kvfree(cqb);
 	if (context)
 		destroy_cq_user(cq, context);
 	else
@@ -1159,11 +1159,11 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 	}
 	mutex_unlock(&cq->resize_mutex);
 
-	mlx5_vfree(in);
+	kvfree(in);
 	return 0;
 
 ex_alloc:
-	mlx5_vfree(in);
+	kvfree(in);
 
 ex_resize:
 	if (udata)
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8ee7cb46e059..4c89b64aa9cf 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -853,14 +853,14 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
 		goto err_2;
 	}
 	mr->umem = umem;
-	mlx5_vfree(in);
+	kvfree(in);
 
 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
 
 	return mr;
 
 err_2:
-	mlx5_vfree(in);
+	kvfree(in);
 
 err_1:
 	kfree(mr);
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index e261a53f9a02..0e2ef9fe0e29 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -647,7 +647,7 @@ err_unmap:
 	mlx5_ib_db_unmap_user(context, &qp->db);
 
 err_free:
-	mlx5_vfree(*in);
+	kvfree(*in);
 
 err_umem:
 	if (qp->umem)
@@ -761,7 +761,7 @@ err_wrid:
 	kfree(qp->rq.wrid);
 
 err_free:
-	mlx5_vfree(*in);
+	kvfree(*in);
 
 err_buf:
 	mlx5_buf_free(dev->mdev, &qp->buf);
@@ -971,7 +971,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		goto err_create;
 	}
 
-	mlx5_vfree(in);
+	kvfree(in);
 	/* Hardware wants QPN written in big-endian order (after
 	 * shifting) for send doorbell.  Precompute this value to save
 	 * a little bit when posting sends.
@@ -988,7 +988,7 @@ err_create:
 	else if (qp->create_type == MLX5_QP_KERNEL)
 		destroy_qp_kernel(dev, qp);
 
-	mlx5_vfree(in);
+	kvfree(in);
 	return err;
 }
 
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 97cc1baaa8e3..41fec66217dd 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -141,7 +141,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 	return 0;
 
 err_in:
-	mlx5_vfree(*in);
+	kvfree(*in);
 
 err_umem:
 	ib_umem_release(srq->umem);
@@ -209,7 +209,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
 	return 0;
 
 err_in:
-	mlx5_vfree(*in);
+	kvfree(*in);
 
 err_buf:
 	mlx5_buf_free(dev->mdev, &srq->buf);
@@ -306,7 +306,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 	in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn);
 	in->ctx.db_record = cpu_to_be64(srq->db.dma);
 	err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen);
-	mlx5_vfree(in);
+	kvfree(in);
 	if (err) {
 		mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
 		goto err_usr_kern_srq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index ad2c96a02a53..dfd3ad0a39c1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -390,7 +390,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	 */
 	eq_update_ci(eq, 1);
 
-	mlx5_vfree(in);
+	kvfree(in);
 	return 0;
 
 err_irq:
@@ -400,7 +400,7 @@ err_eq:
 	mlx5_cmd_destroy_eq(dev, eq->eqn);
 
 err_in:
-	mlx5_vfree(in);
+	kvfree(in);
 
 err_buf:
 	mlx5_buf_free(dev, &eq->buf);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
index d476918ef269..4fdaae9b54d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -349,7 +349,7 @@ out_4k:
 	for (i--; i >= 0; i--)
 		free_4k(dev, be64_to_cpu(in->pas[i]));
 out_free:
-	mlx5_vfree(in);
+	kvfree(in);
 	return err;
 }
 
@@ -400,7 +400,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages,
 	}
 
 out_free:
-	mlx5_vfree(out);
+	kvfree(out);
 	return err;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 313965853e10..72c2d002c3b8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -68,9 +68,9 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
 		memcpy(data_out, out->data, size_out);
 
 ex2:
-	mlx5_vfree(out);
+	kvfree(out);
 ex1:
-	mlx5_vfree(in);
+	kvfree(in);
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_core_access_reg);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 246310dc8bef..b1bf41556b32 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -633,14 +633,6 @@ static inline void *mlx5_vzalloc(unsigned long size)
 	return rtn;
 }
 
-static inline void mlx5_vfree(const void *addr)
-{
-	if (addr && is_vmalloc_addr(addr))
-		vfree(addr);
-	else
-		kfree(addr);
-}
-
 static inline u32 mlx5_base_mkey(const u32 key)
 {
 	return key & 0xffffff00u;
-- 
cgit v1.2.3-58-ga151


From 227158db160449b6513d2e31894a135104b90e90 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 Apr 2014 18:47:38 -0400
Subject: new helper: skb_copy_and_csum_datagram_msg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/skbuff.h | 5 +++++
 net/ipv4/udp.c         | 5 ++---
 net/ipv6/raw.c         | 2 +-
 net/ipv6/udp.c         | 2 +-
 4 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 78c299f40bac..cbe4b2078b30 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2651,6 +2651,11 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
 }
 int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen,
 				     struct iovec *iov);
+static inline int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
+			    struct msghdr *msg)
+{
+	return skb_copy_and_csum_datagram_iovec(skb, hlen, msg->msg_iov);
+}
 int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 const struct iovec *from, int from_offset,
 				 int len);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4a16b9129079..b2d606833ce4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1284,9 +1284,8 @@ try_again:
 		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
 					    msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_iovec(skb,
-						       sizeof(struct udphdr),
-						       msg->msg_iov);
+		err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr),
+						     msg);
 
 		if (err == -EINVAL)
 			goto csum_copy_err;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 0cbcf98f2cab..8baa53e17a30 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -492,7 +492,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 			goto csum_copy_err;
 		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	} else {
-		err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov);
+		err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
 		if (err == -EINVAL)
 			goto csum_copy_err;
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index dbc0b042bdd6..7cfb5d745a2d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -428,7 +428,7 @@ try_again:
 		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
 					    msg, copied);
 	else {
-		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
+		err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg);
 		if (err == -EINVAL)
 			goto csum_copy_err;
 	}
-- 
cgit v1.2.3-58-ga151


From 6ce8e9ce5989ae13f493062975304700be86d20e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 Apr 2014 21:25:44 -0400
Subject: new helper: memcpy_from_msg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 crypto/algif_skcipher.c     | 10 +++++-----
 drivers/isdn/mISDN/socket.c |  2 +-
 drivers/net/ppp/pppoe.c     |  2 +-
 include/linux/skbuff.h      |  5 +++++
 include/net/sctp/sm.h       |  2 +-
 net/appletalk/ddp.c         |  2 +-
 net/ax25/af_ax25.c          |  2 +-
 net/bluetooth/hci_sock.c    |  2 +-
 net/bluetooth/mgmt.c        |  2 +-
 net/bluetooth/rfcomm/sock.c |  2 +-
 net/bluetooth/sco.c         |  2 +-
 net/caif/caif_socket.c      |  4 ++--
 net/can/bcm.c               | 19 ++++++++-----------
 net/can/raw.c               |  2 +-
 net/dccp/proto.c            |  2 +-
 net/decnet/af_decnet.c      |  2 +-
 net/ieee802154/dgram.c      |  2 +-
 net/ieee802154/raw.c        |  2 +-
 net/ipv4/ping.c             |  2 +-
 net/ipv4/tcp_input.c        |  2 +-
 net/irda/af_irda.c          |  6 +++---
 net/iucv/af_iucv.c          |  2 +-
 net/key/af_key.c            |  2 +-
 net/l2tp/l2tp_ip.c          |  2 +-
 net/l2tp/l2tp_ppp.c         |  3 +--
 net/llc/af_llc.c            |  2 +-
 net/netlink/af_netlink.c    |  2 +-
 net/netrom/af_netrom.c      |  2 +-
 net/nfc/llcp_commands.c     |  4 ++--
 net/nfc/rawsock.c           |  2 +-
 net/packet/af_packet.c      |  5 ++---
 net/phonet/datagram.c       |  2 +-
 net/phonet/pep.c            |  2 +-
 net/rose/af_rose.c          |  2 +-
 net/sctp/sm_make_chunk.c    |  4 ++--
 net/x25/af_x25.c            |  2 +-
 36 files changed, 57 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 83187f497c7c..c3b482bee208 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -298,9 +298,9 @@ static int skcipher_sendmsg(struct kiocb *unused, struct socket *sock,
 			len = min_t(unsigned long, len,
 				    PAGE_SIZE - sg->offset - sg->length);
 
-			err = memcpy_fromiovec(page_address(sg_page(sg)) +
-					       sg->offset + sg->length,
-					       msg->msg_iov, len);
+			err = memcpy_from_msg(page_address(sg_page(sg)) +
+					      sg->offset + sg->length,
+					      msg, len);
 			if (err)
 				goto unlock;
 
@@ -337,8 +337,8 @@ static int skcipher_sendmsg(struct kiocb *unused, struct socket *sock,
 			if (!sg_page(sg + i))
 				goto unlock;
 
-			err = memcpy_fromiovec(page_address(sg_page(sg + i)),
-					       msg->msg_iov, plen);
+			err = memcpy_from_msg(page_address(sg_page(sg + i)),
+					      msg, plen);
 			if (err) {
 				__free_page(sg_page(sg + i));
 				sg_assign_page(sg + i, NULL);
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index dcbd8589f0c4..84b35925ee4d 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -203,7 +203,7 @@ mISDN_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	if (!skb)
 		goto done;
 
-	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
 		err = -EFAULT;
 		goto done;
 	}
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 443cbbf5c55f..d2408a5e43a6 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -869,7 +869,7 @@ static int pppoe_sendmsg(struct kiocb *iocb, struct socket *sock,
 	ph = (struct pppoe_hdr *)skb_put(skb, total_len + sizeof(struct pppoe_hdr));
 	start = (char *)&ph->tag[0];
 
-	error = memcpy_fromiovec(start, m->msg_iov, total_len);
+	error = memcpy_from_msg(start, m, total_len);
 	if (error < 0) {
 		kfree_skb(skb);
 		goto end;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cbe4b2078b30..97dc5f8123b3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2687,6 +2687,11 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 
+static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
+{
+	return memcpy_fromiovec(data, msg->msg_iov, len);
+}
+
 struct skb_checksum_ops {
 	__wsum (*update)(const void *mem, int len, __wsum wsum);
 	__wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 72a31db47ded..487ef34bbd63 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -219,7 +219,7 @@ struct sctp_chunk *sctp_make_abort_no_data(const struct sctp_association *,
 				      const struct sctp_chunk *,
 				      __u32 tsn);
 struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *,
-					const struct msghdr *, size_t msg_len);
+					struct msghdr *, size_t msg_len);
 struct sctp_chunk *sctp_make_abort_violation(const struct sctp_association *,
 				   const struct sctp_chunk *,
 				   const __u8 *,
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 425942db17f6..0d0766ea5ab1 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1659,7 +1659,7 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
 
 	SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
 
-	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 	if (err) {
 		kfree_skb(skb);
 		err = -EFAULT;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index f4f835e19378..ca049a7c9287 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1549,7 +1549,7 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
 	skb_reserve(skb, size - len);
 
 	/* User data follows immediately after the AX.25 data */
-	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
 		err = -EFAULT;
 		kfree_skb(skb);
 		goto out;
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 5e2cd2535978..2c245fdf319a 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -947,7 +947,7 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	if (!skb)
 		goto done;
 
-	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
 		err = -EFAULT;
 		goto drop;
 	}
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index cbeef5f62f3b..f3e4a16fb157 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -5767,7 +5767,7 @@ int mgmt_control(struct sock *sk, struct msghdr *msg, size_t msglen)
 	if (!buf)
 		return -ENOMEM;
 
-	if (memcpy_fromiovec(buf, msg->msg_iov, msglen)) {
+	if (memcpy_from_msg(buf, msg, msglen)) {
 		err = -EFAULT;
 		goto done;
 	}
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 8bbbb5ec468c..2348176401a0 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -588,7 +588,7 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 		}
 		skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
 
-		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+		err = memcpy_from_msg(skb_put(skb, size), msg, size);
 		if (err) {
 			kfree_skb(skb);
 			if (sent == 0)
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 7ee9e4ab00f8..30e5ea3f1ad3 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -285,7 +285,7 @@ static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len)
 	if (!skb)
 		return err;
 
-	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
 		kfree_skb(skb);
 		return -EFAULT;
 	}
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index fbcd156099fb..230f14026c11 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -566,7 +566,7 @@ static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock,
 
 	skb_reserve(skb, cf_sk->headroom);
 
-	ret = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	ret = memcpy_from_msg(skb_put(skb, len), msg, len);
 
 	if (ret)
 		goto err;
@@ -641,7 +641,7 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		 */
 		size = min_t(int, size, skb_tailroom(skb));
 
-		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+		err = memcpy_from_msg(skb_put(skb, size), msg, size);
 		if (err) {
 			kfree_skb(skb);
 			goto out_err;
diff --git a/net/can/bcm.c b/net/can/bcm.c
index dcb75c0e66c1..b9a1f715df18 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -858,8 +858,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 
 		/* update can_frames content */
 		for (i = 0; i < msg_head->nframes; i++) {
-			err = memcpy_fromiovec((u8 *)&op->frames[i],
-					       msg->msg_iov, CFSIZ);
+			err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ);
 
 			if (op->frames[i].can_dlc > 8)
 				err = -EINVAL;
@@ -894,8 +893,7 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 			op->frames = &op->sframe;
 
 		for (i = 0; i < msg_head->nframes; i++) {
-			err = memcpy_fromiovec((u8 *)&op->frames[i],
-					       msg->msg_iov, CFSIZ);
+			err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ);
 
 			if (op->frames[i].can_dlc > 8)
 				err = -EINVAL;
@@ -1024,9 +1022,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 
 		if (msg_head->nframes) {
 			/* update can_frames content */
-			err = memcpy_fromiovec((u8 *)op->frames,
-					       msg->msg_iov,
-					       msg_head->nframes * CFSIZ);
+			err = memcpy_from_msg((u8 *)op->frames, msg,
+					      msg_head->nframes * CFSIZ);
 			if (err < 0)
 				return err;
 
@@ -1072,8 +1069,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 		}
 
 		if (msg_head->nframes) {
-			err = memcpy_fromiovec((u8 *)op->frames, msg->msg_iov,
-					       msg_head->nframes * CFSIZ);
+			err = memcpy_from_msg((u8 *)op->frames, msg,
+					      msg_head->nframes * CFSIZ);
 			if (err < 0) {
 				if (op->frames != &op->sframe)
 					kfree(op->frames);
@@ -1209,7 +1206,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
 
 	can_skb_reserve(skb);
 
-	err = memcpy_fromiovec(skb_put(skb, CFSIZ), msg->msg_iov, CFSIZ);
+	err = memcpy_from_msg(skb_put(skb, CFSIZ), msg, CFSIZ);
 	if (err < 0) {
 		kfree_skb(skb);
 		return err;
@@ -1285,7 +1282,7 @@ static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	/* read message head information */
 
-	ret = memcpy_fromiovec((u8 *)&msg_head, msg->msg_iov, MHSIZ);
+	ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ);
 	if (ret < 0)
 		return ret;
 
diff --git a/net/can/raw.c b/net/can/raw.c
index 081e81fd017f..0e4004fb6876 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -703,7 +703,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
 
-	err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+	err = memcpy_from_msg(skb_put(skb, size), msg, size);
 	if (err < 0)
 		goto free_skb;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8e6ae9422a7b..19f038739087 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -781,7 +781,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		goto out_release;
 
 	skb_reserve(skb, sk->sk_prot->max_header);
-	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	rc = memcpy_from_msg(skb_put(skb, len), msg, len);
 	if (rc != 0)
 		goto out_discard;
 
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 25733d538147..e2e2e3cb9113 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -2032,7 +2032,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 		skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER);
 
-		if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+		if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
 			err = -EFAULT;
 			goto out;
 		}
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index b8555ec71387..2c7a93e7167e 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -276,7 +276,7 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk,
 	if (err < 0)
 		goto out_skb;
 
-	err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+	err = memcpy_from_msg(skb_put(skb, size), msg, size);
 	if (err < 0)
 		goto out_skb;
 
diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c
index 21c38945ab8b..61e9d2972947 100644
--- a/net/ieee802154/raw.c
+++ b/net/ieee802154/raw.c
@@ -150,7 +150,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk,
 	skb_reset_mac_header(skb);
 	skb_reset_network_header(skb);
 
-	err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+	err = memcpy_from_msg(skb_put(skb, size), msg, size);
 	if (err < 0)
 		goto out_skb;
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index ce2920f5bef3..ef8f6ee90473 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -660,7 +660,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
 	 *	Fetch the ICMP header provided by the userland.
 	 *	iovec is modified! The ICMP header is consumed.
 	 */
-	if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len))
+	if (memcpy_from_msg(user_icmph, msg, icmph_len))
 		return -EFAULT;
 
 	if (family == AF_INET) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d22a31f27ab4..69de1a1c05c9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4368,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
 		goto err_free;
 
-	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
+	if (memcpy_from_msg(skb_put(skb, size), msg, size))
 		goto err_free;
 
 	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index e8c409055922..9052462cf42a 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1319,7 +1319,7 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
 	skb_reserve(skb, self->max_header_size + 16);
 	skb_reset_transport_header(skb);
 	skb_put(skb, len);
-	err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	err = memcpy_from_msg(skb_transport_header(skb), msg, len);
 	if (err) {
 		kfree_skb(skb);
 		goto out_err;
@@ -1569,7 +1569,7 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
 
 	pr_debug("%s(), appending user data\n", __func__);
 	skb_put(skb, len);
-	err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	err = memcpy_from_msg(skb_transport_header(skb), msg, len);
 	if (err) {
 		kfree_skb(skb);
 		goto out;
@@ -1678,7 +1678,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
 
 	pr_debug("%s(), appending user data\n", __func__);
 	skb_put(skb, len);
-	err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	err = memcpy_from_msg(skb_transport_header(skb), msg, len);
 	if (err) {
 		kfree_skb(skb);
 		goto out;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 057b5647ef92..1cd3f8107239 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1122,7 +1122,7 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	}
 	if (iucv->transport == AF_IUCV_TRANS_HIPER)
 		skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN);
-	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
 		err = -EFAULT;
 		goto fail;
 	}
diff --git a/net/key/af_key.c b/net/key/af_key.c
index e5883091a8c6..f8ac939d52b4 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3611,7 +3611,7 @@ static int pfkey_sendmsg(struct kiocb *kiocb,
 		goto out;
 
 	err = -EFAULT;
-	if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len))
+	if (memcpy_from_msg(skb_put(skb,len), msg, len))
 		goto out;
 
 	hdr = pfkey_get_base_msg(skb, &err);
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index a6cc1fed2b52..05dfc8aa36af 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -441,7 +441,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 	*((__be32 *) skb_put(skb, 4)) = 0;
 
 	/* Copy user data into skb */
-	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	rc = memcpy_from_msg(skb_put(skb, len), msg, len);
 	if (rc < 0) {
 		kfree_skb(skb);
 		goto error;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index c559bcdf4679..cc7a828fc914 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -346,8 +346,7 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh
 	skb_put(skb, 2);
 
 	/* Copy user data into skb */
-	error = memcpy_fromiovec(skb_put(skb, total_len), m->msg_iov,
-				 total_len);
+	error = memcpy_from_msg(skb_put(skb, total_len), m, total_len);
 	if (error < 0) {
 		kfree_skb(skb);
 		goto error_put_sess_tun;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index af662669f951..2c0b83ce43bd 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -921,7 +921,7 @@ static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock,
 	skb->dev      = llc->dev;
 	skb->protocol = llc_proto_type(addr->sllc_arphrd);
 	skb_reserve(skb, hdrlen);
-	rc = memcpy_fromiovec(skb_put(skb, copied), msg->msg_iov, copied);
+	rc = memcpy_from_msg(skb_put(skb, copied), msg, copied);
 	if (rc)
 		goto out;
 	if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) {
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e1aad6eeac14..63aa5c8acf12 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2325,7 +2325,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	NETLINK_CB(skb).flags	= netlink_skb_flags;
 
 	err = -EFAULT;
-	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
 		kfree_skb(skb);
 		goto out;
 	}
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 7e13f6afcd1f..69f1d5e9959f 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1113,7 +1113,7 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
 	skb_put(skb, len);
 
 	/* User data follows immediately after the NET/ROM transport header */
-	if (memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len)) {
+	if (memcpy_from_msg(skb_transport_header(skb), msg, len)) {
 		kfree_skb(skb);
 		err = -EFAULT;
 		goto out;
diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
index a3ad69a4c648..c4da0c2d8a14 100644
--- a/net/nfc/llcp_commands.c
+++ b/net/nfc/llcp_commands.c
@@ -665,7 +665,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock,
 	if (msg_data == NULL)
 		return -ENOMEM;
 
-	if (memcpy_fromiovec(msg_data, msg->msg_iov, len)) {
+	if (memcpy_from_msg(msg_data, msg, len)) {
 		kfree(msg_data);
 		return -EFAULT;
 	}
@@ -731,7 +731,7 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
 	if (msg_data == NULL)
 		return -ENOMEM;
 
-	if (memcpy_fromiovec(msg_data, msg->msg_iov, len)) {
+	if (memcpy_from_msg(msg_data, msg, len)) {
 		kfree(msg_data);
 		return -EFAULT;
 	}
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 9d7d2b7ba5e4..373e138c0ab6 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -231,7 +231,7 @@ static int rawsock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	if (skb == NULL)
 		return rc;
 
-	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	rc = memcpy_from_msg(skb_put(skb, len), msg, len);
 	if (rc < 0) {
 		kfree_skb(skb);
 		return rc;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 58af58026dd2..07ac95014ecb 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1676,7 +1676,7 @@ retry:
 			if (len < hhlen)
 				skb_reset_network_header(skb);
 		}
-		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+		err = memcpy_from_msg(skb_put(skb, len), msg, len);
 		if (err)
 			goto out_free;
 		goto retry;
@@ -2446,8 +2446,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 		len -= vnet_hdr_len;
 
-		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
-				       vnet_hdr_len);
+		err = memcpy_from_msg((void *)&vnet_hdr, msg, vnet_hdr_len);
 		if (err < 0)
 			goto out_unlock;
 
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 0918bc21eae6..26054b4b467c 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -109,7 +109,7 @@ static int pn_sendmsg(struct kiocb *iocb, struct sock *sk,
 		return err;
 	skb_reserve(skb, MAX_PHONET_HEADER);
 
-	err = memcpy_fromiovec((void *)skb_put(skb, len), msg->msg_iov, len);
+	err = memcpy_from_msg((void *)skb_put(skb, len), msg, len);
 	if (err < 0) {
 		kfree_skb(skb);
 		return err;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 9cd069dfaf65..5d3f2b7507d4 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -1141,7 +1141,7 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
 		return err;
 
 	skb_reserve(skb, MAX_PHONET_HEADER + 3 + pn->aligned);
-	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 	if (err < 0)
 		goto outfree;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 9b600c20a7a3..43bac7c4dd9e 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1121,7 +1121,7 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock,
 	skb_reset_transport_header(skb);
 	skb_put(skb, len);
 
-	err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	err = memcpy_from_msg(skb_transport_header(skb), msg, len);
 	if (err) {
 		kfree_skb(skb);
 		return err;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9f32741abb1c..e49bccebb0cc 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1001,7 +1001,7 @@ no_mem:
 
 /* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error.  */
 struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc,
-					const struct msghdr *msg,
+					struct msghdr *msg,
 					size_t paylen)
 {
 	struct sctp_chunk *retval;
@@ -1018,7 +1018,7 @@ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc,
 		if (!payload)
 			goto err_payload;
 
-		err = memcpy_fromiovec(payload, msg->msg_iov, paylen);
+		err = memcpy_from_msg(payload, msg, paylen);
 		if (err < 0)
 			goto err_copy;
 	}
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 59e785bfde65..d9149b68b9bc 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1170,7 +1170,7 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
 	skb_reset_transport_header(skb);
 	skb_put(skb, len);
 
-	rc = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	rc = memcpy_from_msg(skb_transport_header(skb), msg, len);
 	if (rc)
 		goto out_kfree_skb;
 
-- 
cgit v1.2.3-58-ga151


From 7eab8d9e8a722ca07bc785f73e21c3d3418defa6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 Apr 2014 21:51:23 -0400
Subject: new helper: memcpy_to_msg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 crypto/algif_hash.c    | 2 +-
 include/linux/skbuff.h | 5 +++++
 net/caif/caif_socket.c | 2 +-
 net/can/bcm.c          | 2 +-
 net/can/raw.c          | 2 +-
 net/decnet/af_decnet.c | 2 +-
 net/ipv4/tcp.c         | 2 +-
 net/irda/af_irda.c     | 2 +-
 net/packet/af_packet.c | 3 +--
 9 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 850246206b12..35c93ff11f35 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -174,7 +174,7 @@ static int hash_recvmsg(struct kiocb *unused, struct socket *sock,
 			goto unlock;
 	}
 
-	err = memcpy_toiovec(msg->msg_iov, ctx->result, len);
+	err = memcpy_to_msg(msg, ctx->result, len);
 
 unlock:
 	release_sock(sk);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 97dc5f8123b3..d048347a010a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2692,6 +2692,11 @@ static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
 	return memcpy_fromiovec(data, msg->msg_iov, len);
 }
 
+static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
+{
+	return memcpy_toiovec(msg->msg_iov, data, len);
+}
+
 struct skb_checksum_ops {
 	__wsum (*update)(const void *mem, int len, __wsum wsum);
 	__wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 230f14026c11..ac618b0b8a4f 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -418,7 +418,7 @@ unlock:
 		}
 		release_sock(sk);
 		chunk = min_t(unsigned int, skb->len, size);
-		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+		if (memcpy_to_msg(msg, skb->data, chunk)) {
 			skb_queue_head(&sk->sk_receive_queue, skb);
 			if (copied == 0)
 				copied = -EFAULT;
diff --git a/net/can/bcm.c b/net/can/bcm.c
index b9a1f715df18..01671187e3fe 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1555,7 +1555,7 @@ static int bcm_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (skb->len < size)
 		size = skb->len;
 
-	err = memcpy_toiovec(msg->msg_iov, skb->data, size);
+	err = memcpy_to_msg(msg, skb->data, size);
 	if (err < 0) {
 		skb_free_datagram(sk, skb);
 		return err;
diff --git a/net/can/raw.c b/net/can/raw.c
index 0e4004fb6876..dfdcffbb1070 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -750,7 +750,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock,
 	else
 		size = skb->len;
 
-	err = memcpy_toiovec(msg->msg_iov, skb->data, size);
+	err = memcpy_to_msg(msg, skb->data, size);
 	if (err < 0) {
 		skb_free_datagram(sk, skb);
 		return err;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index e2e2e3cb9113..810228646de3 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1760,7 +1760,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
 		if ((chunk + copied) > size)
 			chunk = size - copied;
 
-		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+		if (memcpy_to_msg(msg, skb->data, chunk)) {
 			rv = -EFAULT;
 			break;
 		}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c239f4740d10..435443bfc3c3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1349,7 +1349,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
 
 		if (len > 0) {
 			if (!(flags & MSG_TRUNC))
-				err = memcpy_toiovec(msg->msg_iov, &c, 1);
+				err = memcpy_to_msg(msg, &c, 1);
 			len = 1;
 		} else
 			msg->msg_flags |= MSG_TRUNC;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 9052462cf42a..568edc72d737 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1466,7 +1466,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
 		}
 
 		chunk = min_t(unsigned int, skb->len, size);
-		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+		if (memcpy_to_msg(msg, skb->data, chunk)) {
 			skb_queue_head(&sk->sk_receive_queue, skb);
 			if (copied == 0)
 				copied = -EFAULT;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 07ac95014ecb..108d7f381b87 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2949,8 +2949,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
-		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
-				     vnet_hdr_len);
+		err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
 		if (err < 0)
 			goto out_free;
 	}
-- 
cgit v1.2.3-58-ga151


From 3a654f975bf99165016fe257a3d2b4e6716e4931 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 19 Jun 2014 14:15:22 -0400
Subject: new helpers: skb_copy_datagram_from_iter() and
 zerocopy_sg_from_iter()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/skbuff.h |   3 ++
 net/core/datagram.c    | 116 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d048347a010a..a01cd9ad0b51 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2659,10 +2659,13 @@ static inline int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
 int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 const struct iovec *from, int from_offset,
 				 int len);
+int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
+				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 			   int offset, size_t count);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 26391a3fe3e5..3ea038431ceb 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -572,6 +572,78 @@ fault:
 }
 EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
 
+int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
+				 struct iov_iter *from,
+				 int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (copy_from_iter(skb->data + offset, copy, from) != copy)
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_frag_size(frag);
+		if ((copy = end - offset) > 0) {
+			size_t copied;
+
+			if (copy > len)
+				copy = len;
+			copied = copy_page_from_iter(skb_frag_page(frag),
+					  frag->page_offset + offset - start,
+					  copy, from);
+			if (copied != copy)
+				goto fault;
+
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_from_iter(frag_iter,
+							offset - start,
+							from, copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+fault:
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iter);
+
 /**
  *	zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
  *	@skb: buffer to copy
@@ -643,6 +715,50 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iovec);
 
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+{
+	int len = iov_iter_count(from);
+	int copy = min_t(int, skb_headlen(skb), len);
+	int frag = 0;
+
+	/* copy up to skb headlen */
+	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
+		return -EFAULT;
+
+	while (iov_iter_count(from)) {
+		struct page *pages[MAX_SKB_FRAGS];
+		size_t start;
+		ssize_t copied;
+		unsigned long truesize;
+		int n = 0;
+
+		if (frag == MAX_SKB_FRAGS)
+			return -EMSGSIZE;
+
+		copied = iov_iter_get_pages(from, pages, ~0U,
+					    MAX_SKB_FRAGS - frag, &start);
+		if (copied < 0)
+			return -EFAULT;
+
+		iov_iter_advance(from, copied);
+
+		truesize = PAGE_ALIGN(copied + start);
+		skb->data_len += copied;
+		skb->len += copied;
+		skb->truesize += truesize;
+		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
+		while (copied) {
+			int size = min_t(int, copied, PAGE_SIZE - start);
+			skb_fill_page_desc(skb, frag++, pages[n], start, size);
+			start = 0;
+			copied -= size;
+			n++;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL(zerocopy_sg_from_iter);
+
 static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 				      u8 __user *to, int len,
 				      __wsum *csump)
-- 
cgit v1.2.3-58-ga151


From 195e952d03a797aa953f62ffe24ec58693e17ed8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 6 Nov 2014 00:56:48 -0500
Subject: kill zerocopy_sg_from_iovec()

no users left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/skbuff.h |  2 --
 net/core/datagram.c    | 65 ++------------------------------------------------
 2 files changed, 2 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a01cd9ad0b51..178cdbde82f0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2661,8 +2661,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 int len);
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
-int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
-			   int offset, size_t count);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 3ea038431ceb..c4d832efebb8 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -645,76 +645,15 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_from_iter);
 
 /**
- *	zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
+ *	zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
  *	@skb: buffer to copy
- *	@from: io vector to copy from
- *	@offset: offset in the io vector to start copying from
- *	@count: amount of vectors to copy to buffer from
+ *	@from: the source to copy from
  *
  *	The function will first copy up to headlen, and then pin the userspace
  *	pages and build frags through them.
  *
  *	Returns 0, -EFAULT or -EMSGSIZE.
- *	Note: the iovec is not modified during the copy
  */
-int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
-				  int offset, size_t count)
-{
-	int len = iov_length(from, count) - offset;
-	int copy = min_t(int, skb_headlen(skb), len);
-	int size;
-	int i = 0;
-
-	/* copy up to skb headlen */
-	if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
-		return -EFAULT;
-
-	if (len == copy)
-		return 0;
-
-	offset += copy;
-	while (count--) {
-		struct page *page[MAX_SKB_FRAGS];
-		int num_pages;
-		unsigned long base;
-		unsigned long truesize;
-
-		/* Skip over from offset and copied */
-		if (offset >= from->iov_len) {
-			offset -= from->iov_len;
-			++from;
-			continue;
-		}
-		len = from->iov_len - offset;
-		base = (unsigned long)from->iov_base + offset;
-		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
-		if (i + size > MAX_SKB_FRAGS)
-			return -EMSGSIZE;
-		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
-		if (num_pages != size) {
-			release_pages(&page[i], num_pages, 0);
-			return -EFAULT;
-		}
-		truesize = size * PAGE_SIZE;
-		skb->data_len += len;
-		skb->len += len;
-		skb->truesize += truesize;
-		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
-		while (len) {
-			int off = base & ~PAGE_MASK;
-			int size = min_t(int, len, PAGE_SIZE - off);
-			skb_fill_page_desc(skb, i, page[i], off, size);
-			base += size;
-			len -= size;
-			i++;
-		}
-		offset = 0;
-		++from;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(zerocopy_sg_from_iovec);
-
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 {
 	int len = iov_iter_count(from);
-- 
cgit v1.2.3-58-ga151


From 8feb2fb2bb986c533e18037d3c45a5f779421992 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 6 Nov 2014 01:10:59 -0500
Subject: switch AF_PACKET and AF_UNIX to skb_copy_datagram_from_iter()

... and kill skb_copy_datagram_iovec()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/skbuff.h |  3 --
 net/core/datagram.c    | 88 ++------------------------------------------------
 net/packet/af_packet.c | 11 +++++--
 net/unix/af_unix.c     | 11 +++++--
 4 files changed, 18 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 178cdbde82f0..7691ad5b4771 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2656,9 +2656,6 @@ static inline int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
 {
 	return skb_copy_and_csum_datagram_iovec(skb, hlen, msg->msg_iov);
 }
-int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
-				 const struct iovec *from, int from_offset,
-				 int len);
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index c4d832efebb8..b6e303b0f01f 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -480,98 +480,14 @@ short_copy:
 EXPORT_SYMBOL(skb_copy_datagram_iter);
 
 /**
- *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
+ *	skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying to
- *	@from: io vector to copy to
- *	@from_offset: offset in the io vector to start copying from
+ *	@from: the copy source
  *	@len: amount of data to copy to buffer from iovec
  *
  *	Returns 0 or -EFAULT.
- *	Note: the iovec is not modified during the copy.
  */
-int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
-				 const struct iovec *from, int from_offset,
-				 int len)
-{
-	int start = skb_headlen(skb);
-	int i, copy = start - offset;
-	struct sk_buff *frag_iter;
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
-					copy))
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		offset += copy;
-		from_offset += copy;
-	}
-
-	/* Copy paged appendix. Hmm... why does this look so complicated? */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			int err;
-			u8  *vaddr;
-			struct page *page = skb_frag_page(frag);
-
-			if (copy > len)
-				copy = len;
-			vaddr = kmap(page);
-			err = memcpy_fromiovecend(vaddr + frag->page_offset +
-						  offset - start,
-						  from, from_offset, copy);
-			kunmap(page);
-			if (err)
-				goto fault;
-
-			if (!(len -= copy))
-				return 0;
-			offset += copy;
-			from_offset += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			if (copy > len)
-				copy = len;
-			if (skb_copy_datagram_from_iovec(frag_iter,
-							 offset - start,
-							 from,
-							 from_offset,
-							 copy))
-				goto fault;
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-			from_offset += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
-
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from,
 				 int len)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 108d7f381b87..dfb148e9fdaa 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2408,6 +2408,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	unsigned short gso_type = 0;
 	int hlen, tlen;
 	int extra_len = 0;
+	struct iov_iter from;
+	ssize_t n;
+
+	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
 
 	/*
 	 *	Get and verify the address.
@@ -2446,8 +2450,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 		len -= vnet_hdr_len;
 
-		err = memcpy_from_msg((void *)&vnet_hdr, msg, vnet_hdr_len);
-		if (err < 0)
+		err = -EFAULT;
+		n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &from);
+		if (n != vnet_hdr_len)
 			goto out_unlock;
 
 		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
@@ -2517,7 +2522,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	}
 
 	/* Returns -EFAULT on error */
-	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
+	err = skb_copy_datagram_from_iter(skb, offset, &from, len);
 	if (err)
 		goto out_free;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5eee625d113f..4450d6226602 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1459,6 +1459,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct scm_cookie tmp_scm;
 	int max_level;
 	int data_len = 0;
+	struct iov_iter from;
+
+	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1516,7 +1519,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	skb_put(skb, len - data_len);
 	skb->data_len = data_len;
 	skb->len = len;
-	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
+	err = skb_copy_datagram_from_iter(skb, 0, &from, len);
 	if (err)
 		goto out_free;
 
@@ -1638,6 +1641,9 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	bool fds_sent = false;
 	int max_level;
 	int data_len;
+	struct iov_iter from;
+
+	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1694,8 +1700,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		skb_put(skb, size - data_len);
 		skb->data_len = data_len;
 		skb->len = size;
-		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
-						   sent, size);
+		err = skb_copy_datagram_from_iter(skb, 0, &from, size);
 		if (err) {
 			kfree_skb(skb);
 			goto out_err;
-- 
cgit v1.2.3-58-ga151


From 2ad7bf3638411cb547f2823df08166c13ab04269 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Sun, 23 Nov 2014 23:07:46 -0800
Subject: ipvlan: Initial check-in of the IPVLAN driver.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This driver is very similar to the macvlan driver except that it
uses L3 on the frame to determine the logical interface while
functioning as packet dispatcher. It inherits L2 of the master
device hence the packets on wire will have the same L2 for all
the packets originating from all virtual devices off of the same
master device.

This driver was developed keeping the namespace use-case in
mind. Hence most of the examples given here take that as the
base setup where main-device belongs to the default-ns and
virtual devices are assigned to the additional namespaces.

The device operates in two different modes and the difference
in these two modes in primarily in the TX side.

(a) L2 mode : In this mode, the device behaves as a L2 device.
TX processing upto L2 happens on the stack of the virtual device
associated with (namespace). Packets are switched after that
into the main device (default-ns) and queued for xmit.

RX processing is simple and all multicast, broadcast (if
applicable), and unicast belonging to the address(es) are
delivered to the virtual devices.

(b) L3 mode : In this mode, the device behaves like a L3 device.
TX processing upto L3 happens on the stack of the virtual device
associated with (namespace). Packets are switched to the
main-device (default-ns) for the L2 processing. Hence the routing
table of the default-ns will be used in this mode.

RX processins is somewhat similar to the L2 mode except that in
this mode only Unicast packets are delivered to the virtual device
while main-dev will handle all other packets.

The devices can be added using the "ip" command from the iproute2
package -

	ip link add link <master> <virtual> type ipvlan mode [ l2 | l3 ]

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Laurent Chavey <chavey@google.com>
Cc: Tim Hockin <thockin@google.com>
Cc: Brandon Philips <brandon.philips@coreos.com>
Cc: Pavel Emelianov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ipvlan.txt | 107 +++++
 drivers/net/Kconfig                 |  18 +
 drivers/net/Makefile                |   1 +
 drivers/net/ipvlan/Makefile         |   7 +
 drivers/net/ipvlan/ipvlan.h         | 130 ++++++
 drivers/net/ipvlan/ipvlan_core.c    | 607 +++++++++++++++++++++++++++
 drivers/net/ipvlan/ipvlan_main.c    | 789 ++++++++++++++++++++++++++++++++++++
 include/linux/netdevice.h           |   4 +
 include/uapi/linux/if_link.h        |  15 +
 9 files changed, 1678 insertions(+)
 create mode 100644 Documentation/networking/ipvlan.txt
 create mode 100644 drivers/net/ipvlan/Makefile
 create mode 100644 drivers/net/ipvlan/ipvlan.h
 create mode 100644 drivers/net/ipvlan/ipvlan_core.c
 create mode 100644 drivers/net/ipvlan/ipvlan_main.c

(limited to 'include/linux')

diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
new file mode 100644
index 000000000000..cf996394e466
--- /dev/null
+++ b/Documentation/networking/ipvlan.txt
@@ -0,0 +1,107 @@
+
+                            IPVLAN Driver HOWTO
+
+Initial Release:
+	Mahesh Bandewar <maheshb AT google.com>
+
+1. Introduction:
+	This is conceptually very similar to the macvlan driver with one major
+exception of using L3 for mux-ing /demux-ing among slaves. This property makes
+the master device share the L2 with it's slave devices. I have developed this
+driver in conjuntion with network namespaces and not sure if there is use case
+outside of it.
+
+
+2. Building and Installation:
+	In order to build the driver, please select the config item CONFIG_IPVLAN.
+The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module
+(CONFIG_IPVLAN=m).
+
+
+3. Configuration:
+	There are no module parameters for this driver and it can be configured
+using IProute2/ip utility.
+
+	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 }
+
+	e.g. ip link add link ipvl0 eth0 type ipvlan mode l2
+
+
+4. Operating modes:
+	IPvlan has two modes of operation - L2 and L3. For a given master device,
+you can select one of these two modes and all slaves on that master will
+operate in the same (selected) mode. The RX mode is almost identical except
+that in L3 mode the slaves wont receive any multicast / broadcast traffic.
+L3 mode is more restrictive since routing is controlled from the other (mostly)
+default namespace.
+
+4.1 L2 mode:
+	In this mode TX processing happens on the stack instance attached to the
+slave device and packets are switched and queued to the master device to send
+out. In this mode the slaves will RX/TX multicast and broadcast (if applicable)
+as well.
+
+4.2 L3 mode:
+	In this mode TX processing upto L3 happens on the stack instance attached
+to the slave device and packets are switched to the stack instance of the
+master device for the L2 processing and routing from that instance will be
+used before packets are queued on the outbound device. In this mode the slaves
+will not receive nor can send multicast / broadcast traffic.
+
+
+5. What to choose (macvlan vs. ipvlan)?
+	These two devices are very similar in many regards and the specific use
+case could very well define which device to choose. if one of the following
+situations defines your use case then you can choose to use ipvlan -
+	(a) The Linux host that is connected to the external switch / router has
+policy configured that allows only one mac per port.
+	(b) No of virtual devices created on a master exceed the mac capacity and
+puts the NIC in promiscous mode and degraded performance is a concern.
+	(c) If the slave device is to be put into the hostile / untrusted network
+namespace where L2 on the slave could be changed / misused.
+
+
+6. Example configuration:
+
+  +=============================================================+
+  |  Host: host1                                                |
+  |                                                             |
+  |   +----------------------+      +----------------------+    |
+  |   |   NS:ns0             |      |  NS:ns1              |    |
+  |   |                      |      |                      |    |
+  |   |                      |      |                      |    |
+  |   |        ipvl0         |      |         ipvl1        |    |
+  |   +----------#-----------+      +-----------#----------+    |
+  |              #                              #               |
+  |              ################################               |
+  |                              # eth0                         |
+  +==============================#==============================+
+
+
+	(a) Create two network namespaces - ns0, ns1
+		ip netns add ns0
+		ip netns add ns1
+
+	(b) Create two ipvlan slaves on eth0 (master device)
+		ip link add link eth0 ipvl0 type ipvlan mode l2
+		ip link add link eth0 ipvl1 type ipvlan mode l2
+
+	(c) Assign slaves to the respective network namespaces
+		ip link set dev ipvl0 netns ns0
+		ip link set dev ipvl1 netns ns1
+
+	(d) Now switch to the namespace (ns0 or ns1) to configure the slave devices
+		- For ns0
+			(1) ip netns exec ns0 bash
+			(2) ip link set dev ipvl0 up
+			(3) ip link set dev lo up
+			(4) ip -4 addr add 127.0.0.1 dev lo
+			(5) ip -4 addr add $IPADDR dev ipvl0
+			(6) ip -4 route add default via $ROUTER dev ipvl0
+		- For ns1
+			(1) ip netns exec ns1 bash
+			(2) ip link set dev ipvl1 up
+			(3) ip link set dev lo up
+			(4) ip -4 addr add 127.0.0.1 dev lo
+			(5) ip -4 addr add $IPADDR dev ipvl1
+			(6) ip -4 route add default via $ROUTER dev ipvl1
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index f9009be3f307..b6d64f546574 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -145,6 +145,24 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+
+config IPVLAN
+    tristate "IP-VLAN support"
+    ---help---
+      This allows one to create virtual devices off of a main interface
+      and packets will be delivered based on the dest L3 (IPv6/IPv4 addr)
+      on packets. All interfaces (including the main interface) share L2
+      making it transparent to the connected L2 switch.
+
+      Ipvlan devices can be added using the "ip" command from the
+      iproute2 package starting with the iproute2-X.Y.ZZ release:
+
+      "ip link add link <main-dev> [ NAME ] type ipvlan"
+
+      To compile this driver as a module, choose M here: the module
+      will be called ipvlan.
+
+
 config VXLAN
        tristate "Virtual eXtensible Local Area Network (VXLAN)"
        depends on INET
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 61aefdd1e173..e25fdd7d905e 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -6,6 +6,7 @@
 # Networking Core Drivers
 #
 obj-$(CONFIG_BONDING) += bonding/
+obj-$(CONFIG_IPVLAN) += ipvlan/
 obj-$(CONFIG_DUMMY) += dummy.o
 obj-$(CONFIG_EQUALIZER) += eql.o
 obj-$(CONFIG_IFB) += ifb.o
diff --git a/drivers/net/ipvlan/Makefile b/drivers/net/ipvlan/Makefile
new file mode 100644
index 000000000000..df79910192d6
--- /dev/null
+++ b/drivers/net/ipvlan/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Ethernet Ipvlan driver
+#
+
+obj-$(CONFIG_IPVLAN) += ipvlan.o
+
+ipvlan-objs := ipvlan_core.o ipvlan_main.o
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
new file mode 100644
index 000000000000..ab3e7614ed71
--- /dev/null
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ */
+#ifndef __IPVLAN_H
+#define __IPVLAN_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/if_link.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/inetdevice.h>
+#include <net/rtnetlink.h>
+#include <net/gre.h>
+#include <net/route.h>
+#include <net/addrconf.h>
+
+#define IPVLAN_DRV	"ipvlan"
+#define IPV_DRV_VER	"0.1"
+
+#define IPVLAN_HASH_SIZE	(1 << BITS_PER_BYTE)
+#define IPVLAN_HASH_MASK	(IPVLAN_HASH_SIZE - 1)
+
+#define IPVLAN_MAC_FILTER_BITS	8
+#define IPVLAN_MAC_FILTER_SIZE	(1 << IPVLAN_MAC_FILTER_BITS)
+#define IPVLAN_MAC_FILTER_MASK	(IPVLAN_MAC_FILTER_SIZE - 1)
+
+typedef enum {
+	IPVL_IPV6 = 0,
+	IPVL_ICMPV6,
+	IPVL_IPV4,
+	IPVL_ARP,
+} ipvl_hdr_type;
+
+struct ipvl_pcpu_stats {
+	u64			rx_pkts;
+	u64			rx_bytes;
+	u64			rx_mcast;
+	u64			tx_pkts;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+	u32			rx_errs;
+	u32			tx_drps;
+};
+
+struct ipvl_port;
+
+struct ipvl_dev {
+	struct net_device	*dev;
+	struct list_head	pnode;
+	struct ipvl_port	*port;
+	struct net_device	*phy_dev;
+	struct list_head	addrs;
+	int			ipv4cnt;
+	int			ipv6cnt;
+	struct ipvl_pcpu_stats	*pcpu_stats;
+	DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
+	netdev_features_t	sfeatures;
+	u32			msg_enable;
+	u16			mtu_adj;
+};
+
+struct ipvl_addr {
+	struct ipvl_dev		*master; /* Back pointer to master */
+	union {
+		struct in6_addr	ip6;	 /* IPv6 address on logical interface */
+		struct in_addr	ip4;	 /* IPv4 address on logical interface */
+	} ipu;
+#define ip6addr	ipu.ip6
+#define ip4addr ipu.ip4
+	struct hlist_node	hlnode;  /* Hash-table linkage */
+	struct list_head	anode;   /* logical-interface linkage */
+	struct rcu_head		rcu;
+	ipvl_hdr_type		atype;
+};
+
+struct ipvl_port {
+	struct net_device	*dev;
+	struct hlist_head	hlhead[IPVLAN_HASH_SIZE];
+	struct list_head	ipvlans;
+	struct rcu_head		rcu;
+	int			count;
+	u16			mode;
+};
+
+static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d)
+{
+	return rcu_dereference(d->rx_handler_data);
+}
+
+static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
+{
+	return rtnl_dereference(d->rx_handler_data);
+}
+
+static inline bool ipvlan_dev_master(struct net_device *d)
+{
+	return d->priv_flags & IFF_IPVLAN_MASTER;
+}
+
+static inline bool ipvlan_dev_slave(struct net_device *d)
+{
+	return d->priv_flags & IFF_IPVLAN_SLAVE;
+}
+
+void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev);
+void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval);
+void ipvlan_init_secret(void);
+unsigned int ipvlan_mac_hash(const unsigned char *addr);
+rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
+int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev);
+void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
+bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6);
+struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
+					const void *iaddr, bool is_v6);
+void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync);
+#endif /* __IPVLAN_H */
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
new file mode 100644
index 000000000000..a14d87783245
--- /dev/null
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -0,0 +1,607 @@
+/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ */
+
+#include "ipvlan.h"
+
+static u32 ipvlan_jhash_secret;
+
+void ipvlan_init_secret(void)
+{
+	net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret));
+}
+
+static void ipvlan_count_rx(const struct ipvl_dev *ipvlan,
+			    unsigned int len, bool success, bool mcast)
+{
+	if (!ipvlan)
+		return;
+
+	if (likely(success)) {
+		struct ipvl_pcpu_stats *pcptr;
+
+		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);
+		u64_stats_update_begin(&pcptr->syncp);
+		pcptr->rx_pkts++;
+		pcptr->rx_bytes += len;
+		if (mcast)
+			pcptr->rx_mcast++;
+		u64_stats_update_end(&pcptr->syncp);
+	} else {
+		this_cpu_inc(ipvlan->pcpu_stats->rx_errs);
+	}
+}
+
+static u8 ipvlan_get_v6_hash(const void *iaddr)
+{
+	const struct in6_addr *ip6_addr = iaddr;
+
+	return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) &
+	       IPVLAN_HASH_MASK;
+}
+
+static u8 ipvlan_get_v4_hash(const void *iaddr)
+{
+	const struct in_addr *ip4_addr = iaddr;
+
+	return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) &
+	       IPVLAN_HASH_MASK;
+}
+
+struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
+					const void *iaddr, bool is_v6)
+{
+	struct ipvl_addr *addr;
+	u8 hash;
+
+	hash = is_v6 ? ipvlan_get_v6_hash(iaddr) :
+	       ipvlan_get_v4_hash(iaddr);
+	hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) {
+		if (is_v6 && addr->atype == IPVL_IPV6 &&
+		    ipv6_addr_equal(&addr->ip6addr, iaddr))
+			return addr;
+		else if (!is_v6 && addr->atype == IPVL_IPV4 &&
+			 addr->ip4addr.s_addr ==
+				((struct in_addr *)iaddr)->s_addr)
+			return addr;
+	}
+	return NULL;
+}
+
+void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr)
+{
+	struct ipvl_port *port = ipvlan->port;
+	u8 hash;
+
+	hash = (addr->atype == IPVL_IPV6) ?
+	       ipvlan_get_v6_hash(&addr->ip6addr) :
+	       ipvlan_get_v4_hash(&addr->ip4addr);
+	hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]);
+}
+
+void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync)
+{
+	hlist_del_rcu(&addr->hlnode);
+	if (sync)
+		synchronize_rcu();
+}
+
+bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
+{
+	struct ipvl_port *port = ipvlan->port;
+	struct ipvl_addr *addr;
+
+	list_for_each_entry(addr, &ipvlan->addrs, anode) {
+		if ((is_v6 && addr->atype == IPVL_IPV6 &&
+		    ipv6_addr_equal(&addr->ip6addr, iaddr)) ||
+		    (!is_v6 && addr->atype == IPVL_IPV4 &&
+		    addr->ip4addr.s_addr == ((struct in_addr *)iaddr)->s_addr))
+			return true;
+	}
+
+	if (ipvlan_ht_addr_lookup(port, iaddr, is_v6))
+		return true;
+
+	return false;
+}
+
+static void *ipvlan_get_L3_hdr(struct sk_buff *skb, int *type)
+{
+	void *lyr3h = NULL;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_ARP): {
+		struct arphdr *arph;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(*arph))))
+			return NULL;
+
+		arph = arp_hdr(skb);
+		*type = IPVL_ARP;
+		lyr3h = arph;
+		break;
+	}
+	case htons(ETH_P_IP): {
+		u32 pktlen;
+		struct iphdr *ip4h;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h))))
+			return NULL;
+
+		ip4h = ip_hdr(skb);
+		pktlen = ntohs(ip4h->tot_len);
+		if (ip4h->ihl < 5 || ip4h->version != 4)
+			return NULL;
+		if (skb->len < pktlen || pktlen < (ip4h->ihl * 4))
+			return NULL;
+
+		*type = IPVL_IPV4;
+		lyr3h = ip4h;
+		break;
+	}
+	case htons(ETH_P_IPV6): {
+		struct ipv6hdr *ip6h;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h))))
+			return NULL;
+
+		ip6h = ipv6_hdr(skb);
+		if (ip6h->version != 6)
+			return NULL;
+
+		*type = IPVL_IPV6;
+		lyr3h = ip6h;
+		/* Only Neighbour Solicitation pkts need different treatment */
+		if (ipv6_addr_any(&ip6h->saddr) &&
+		    ip6h->nexthdr == NEXTHDR_ICMP) {
+			*type = IPVL_ICMPV6;
+			lyr3h = ip6h + 1;
+		}
+		break;
+	}
+	default:
+		return NULL;
+	}
+
+	return lyr3h;
+}
+
+unsigned int ipvlan_mac_hash(const unsigned char *addr)
+{
+	u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2),
+			       ipvlan_jhash_secret);
+
+	return hash & IPVLAN_MAC_FILTER_MASK;
+}
+
+static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb,
+				   const struct ipvl_dev *in_dev, bool local)
+{
+	struct ethhdr *eth = eth_hdr(skb);
+	struct ipvl_dev *ipvlan;
+	struct sk_buff *nskb;
+	unsigned int len;
+	unsigned int mac_hash;
+	int ret;
+
+	if (skb->protocol == htons(ETH_P_PAUSE))
+		return;
+
+	list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+		if (local && (ipvlan == in_dev))
+			continue;
+
+		mac_hash = ipvlan_mac_hash(eth->h_dest);
+		if (!test_bit(mac_hash, ipvlan->mac_filters))
+			continue;
+
+		ret = NET_RX_DROP;
+		len = skb->len + ETH_HLEN;
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (!nskb)
+			goto mcast_acct;
+
+		if (ether_addr_equal(eth->h_dest, ipvlan->phy_dev->broadcast))
+			nskb->pkt_type = PACKET_BROADCAST;
+		else
+			nskb->pkt_type = PACKET_MULTICAST;
+
+		nskb->dev = ipvlan->dev;
+		if (local)
+			ret = dev_forward_skb(ipvlan->dev, nskb);
+		else
+			ret = netif_rx(nskb);
+mcast_acct:
+		ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
+	}
+
+	/* Locally generated? ...Forward a copy to the main-device as
+	 * well. On the RX side we'll ignore it (wont give it to any
+	 * of the virtual devices.
+	 */
+	if (local) {
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (nskb) {
+			if (ether_addr_equal(eth->h_dest, port->dev->broadcast))
+				nskb->pkt_type = PACKET_BROADCAST;
+			else
+				nskb->pkt_type = PACKET_MULTICAST;
+
+			dev_forward_skb(port->dev, nskb);
+		}
+	}
+}
+
+static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff *skb,
+			    bool local)
+{
+	struct ipvl_dev *ipvlan = addr->master;
+	struct net_device *dev = ipvlan->dev;
+	unsigned int len;
+	rx_handler_result_t ret = RX_HANDLER_CONSUMED;
+	bool success = false;
+
+	len = skb->len + ETH_HLEN;
+	if (unlikely(!(dev->flags & IFF_UP))) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		goto out;
+
+	skb->dev = dev;
+	skb->pkt_type = PACKET_HOST;
+
+	if (local) {
+		if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
+			success = true;
+	} else {
+		ret = RX_HANDLER_ANOTHER;
+		success = true;
+	}
+
+out:
+	ipvlan_count_rx(ipvlan, len, success, false);
+	return ret;
+}
+
+static struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port,
+					    void *lyr3h, int addr_type,
+					    bool use_dest)
+{
+	struct ipvl_addr *addr = NULL;
+
+	if (addr_type == IPVL_IPV6) {
+		struct ipv6hdr *ip6h;
+		struct in6_addr *i6addr;
+
+		ip6h = (struct ipv6hdr *)lyr3h;
+		i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr;
+		addr = ipvlan_ht_addr_lookup(port, i6addr, true);
+	} else if (addr_type == IPVL_ICMPV6) {
+		struct nd_msg *ndmh;
+		struct in6_addr *i6addr;
+
+		/* Make sure that the NeighborSolicitation ICMPv6 packets
+		 * are handled to avoid DAD issue.
+		 */
+		ndmh = (struct nd_msg *)lyr3h;
+		if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) {
+			i6addr = &ndmh->target;
+			addr = ipvlan_ht_addr_lookup(port, i6addr, true);
+		}
+	} else if (addr_type == IPVL_IPV4) {
+		struct iphdr *ip4h;
+		__be32 *i4addr;
+
+		ip4h = (struct iphdr *)lyr3h;
+		i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr;
+		addr = ipvlan_ht_addr_lookup(port, i4addr, false);
+	} else if (addr_type == IPVL_ARP) {
+		struct arphdr *arph;
+		unsigned char *arp_ptr;
+		__be32 dip;
+
+		arph = (struct arphdr *)lyr3h;
+		arp_ptr = (unsigned char *)(arph + 1);
+		if (use_dest)
+			arp_ptr += (2 * port->dev->addr_len) + 4;
+		else
+			arp_ptr += port->dev->addr_len;
+
+		memcpy(&dip, arp_ptr, 4);
+		addr = ipvlan_ht_addr_lookup(port, &dip, false);
+	}
+
+	return addr;
+}
+
+static int ipvlan_process_v4_outbound(struct sk_buff *skb)
+{
+	const struct iphdr *ip4h = ip_hdr(skb);
+	struct net_device *dev = skb->dev;
+	struct rtable *rt;
+	int err, ret = NET_XMIT_DROP;
+	struct flowi4 fl4 = {
+		.flowi4_oif = dev->iflink,
+		.flowi4_tos = RT_TOS(ip4h->tos),
+		.flowi4_flags = FLOWI_FLAG_ANYSRC,
+		.daddr = ip4h->daddr,
+		.saddr = ip4h->saddr,
+	};
+
+	rt = ip_route_output_flow(dev_net(dev), &fl4, NULL);
+	if (IS_ERR(rt))
+		goto err;
+
+	if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+		ip_rt_put(rt);
+		goto err;
+	}
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	err = ip_local_out(skb);
+	if (unlikely(net_xmit_eval(err)))
+		dev->stats.tx_errors++;
+	else
+		ret = NET_XMIT_SUCCESS;
+	goto out;
+err:
+	dev->stats.tx_errors++;
+	kfree_skb(skb);
+out:
+	return ret;
+}
+
+static int ipvlan_process_v6_outbound(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct net_device *dev = skb->dev;
+	struct dst_entry *dst;
+	int err, ret = NET_XMIT_DROP;
+	struct flowi6 fl6 = {
+		.flowi6_iif = skb->dev->ifindex,
+		.daddr = ip6h->daddr,
+		.saddr = ip6h->saddr,
+		.flowi6_flags = FLOWI_FLAG_ANYSRC,
+		.flowlabel = ip6_flowinfo(ip6h),
+		.flowi6_mark = skb->mark,
+		.flowi6_proto = ip6h->nexthdr,
+	};
+
+	dst = ip6_route_output(dev_net(dev), NULL, &fl6);
+	if (IS_ERR(dst))
+		goto err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+	err = ip6_local_out(skb);
+	if (unlikely(net_xmit_eval(err)))
+		dev->stats.tx_errors++;
+	else
+		ret = NET_XMIT_SUCCESS;
+	goto out;
+err:
+	dev->stats.tx_errors++;
+	kfree_skb(skb);
+out:
+	return ret;
+}
+
+static int ipvlan_process_outbound(struct sk_buff *skb,
+				   const struct ipvl_dev *ipvlan)
+{
+	struct ethhdr *ethh = eth_hdr(skb);
+	int ret = NET_XMIT_DROP;
+
+	/* In this mode we dont care about multicast and broadcast traffic */
+	if (is_multicast_ether_addr(ethh->h_dest)) {
+		pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n",
+				    ntohs(skb->protocol));
+		kfree_skb(skb);
+		goto out;
+	}
+
+	/* The ipvlan is a pseudo-L2 device, so the packets that we receive
+	 * will have L2; which need to discarded and processed further
+	 * in the net-ns of the main-device.
+	 */
+	if (skb_mac_header_was_set(skb)) {
+		skb_pull(skb, sizeof(*ethh));
+		skb->mac_header = (typeof(skb->mac_header))~0U;
+		skb_reset_network_header(skb);
+	}
+
+	if (skb->protocol == htons(ETH_P_IPV6))
+		ret = ipvlan_process_v6_outbound(skb);
+	else if (skb->protocol == htons(ETH_P_IP))
+		ret = ipvlan_process_v4_outbound(skb);
+	else {
+		pr_warn_ratelimited("Dropped outbound packet type=%x\n",
+				    ntohs(skb->protocol));
+		kfree_skb(skb);
+	}
+out:
+	return ret;
+}
+
+static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	void *lyr3h;
+	struct ipvl_addr *addr;
+	int addr_type;
+
+	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+	if (!lyr3h)
+		goto out;
+
+	addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
+	if (addr)
+		return ipvlan_rcv_frame(addr, skb, true);
+
+out:
+	skb->dev = ipvlan->phy_dev;
+	return ipvlan_process_outbound(skb, ipvlan);
+}
+
+static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ethhdr *eth = eth_hdr(skb);
+	struct ipvl_addr *addr;
+	void *lyr3h;
+	int addr_type;
+
+	if (ether_addr_equal(eth->h_dest, eth->h_source)) {
+		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+		if (lyr3h) {
+			addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
+			if (addr)
+				return ipvlan_rcv_frame(addr, skb, true);
+		}
+		skb = skb_share_check(skb, GFP_ATOMIC);
+		if (!skb)
+			return NET_XMIT_DROP;
+
+		/* Packet definitely does not belong to any of the
+		 * virtual devices, but the dest is local. So forward
+		 * the skb for the main-dev. At the RX side we just return
+		 * RX_PASS for it to be processed further on the stack.
+		 */
+		return dev_forward_skb(ipvlan->phy_dev, skb);
+
+	} else if (is_multicast_ether_addr(eth->h_dest)) {
+		u8 ip_summed = skb->ip_summed;
+
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true);
+		skb->ip_summed = ip_summed;
+	}
+
+	skb->dev = ipvlan->phy_dev;
+	return dev_queue_xmit(skb);
+}
+
+int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port = ipvlan_port_get_rcu(ipvlan->phy_dev);
+
+	if (!port)
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+		goto out;
+
+	switch(port->mode) {
+	case IPVLAN_MODE_L2:
+		return ipvlan_xmit_mode_l2(skb, dev);
+	case IPVLAN_MODE_L3:
+		return ipvlan_xmit_mode_l3(skb, dev);
+	}
+
+	/* Should not reach here */
+	WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n",
+			  port->mode);
+out:
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+}
+
+static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port)
+{
+	struct ethhdr *eth = eth_hdr(skb);
+	struct ipvl_addr *addr;
+	void *lyr3h;
+	int addr_type;
+
+	if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) {
+		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+		if (!lyr3h)
+			return true;
+
+		addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false);
+		if (addr)
+			return false;
+	}
+
+	return true;
+}
+
+static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb,
+						 struct ipvl_port *port)
+{
+	void *lyr3h;
+	int addr_type;
+	struct ipvl_addr *addr;
+	struct sk_buff *skb = *pskb;
+	rx_handler_result_t ret = RX_HANDLER_PASS;
+
+	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+	if (!lyr3h)
+		goto out;
+
+	addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
+	if (addr)
+		ret = ipvlan_rcv_frame(addr, skb, false);
+
+out:
+	return ret;
+}
+
+static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
+						 struct ipvl_port *port)
+{
+	struct sk_buff *skb = *pskb;
+	struct ethhdr *eth = eth_hdr(skb);
+	rx_handler_result_t ret = RX_HANDLER_PASS;
+	void *lyr3h;
+	int addr_type;
+
+	if (is_multicast_ether_addr(eth->h_dest)) {
+		if (ipvlan_external_frame(skb, port))
+			ipvlan_multicast_frame(port, skb, NULL, false);
+	} else {
+		struct ipvl_addr *addr;
+
+		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+		if (!lyr3h)
+			return ret;
+
+		addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
+		if (addr)
+			ret = ipvlan_rcv_frame(addr, skb, false);
+	}
+
+	return ret;
+}
+
+rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev);
+
+	if (!port)
+		return RX_HANDLER_PASS;
+
+	switch (port->mode) {
+	case IPVLAN_MODE_L2:
+		return ipvlan_handle_mode_l2(pskb, port);
+	case IPVLAN_MODE_L3:
+		return ipvlan_handle_mode_l3(pskb, port);
+	}
+
+	/* Should not reach here */
+	WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
+			  port->mode);
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
new file mode 100644
index 000000000000..c3df84bd2857
--- /dev/null
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -0,0 +1,789 @@
+/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ */
+
+#include "ipvlan.h"
+
+void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
+{
+	ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj;
+}
+
+void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval)
+{
+	struct ipvl_dev *ipvlan;
+
+	if (port->mode != nval) {
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+			if (nval == IPVLAN_MODE_L3)
+				ipvlan->dev->flags |= IFF_NOARP;
+			else
+				ipvlan->dev->flags &= ~IFF_NOARP;
+		}
+		port->mode = nval;
+	}
+}
+
+static int ipvlan_port_create(struct net_device *dev)
+{
+	struct ipvl_port *port;
+	int err, idx;
+
+	if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) {
+		netdev_err(dev, "Master is either lo or non-ether device\n");
+		return -EINVAL;
+	}
+	port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL);
+	if (!port)
+		return -ENOMEM;
+
+	port->dev = dev;
+	port->mode = IPVLAN_MODE_L3;
+	INIT_LIST_HEAD(&port->ipvlans);
+	for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++)
+		INIT_HLIST_HEAD(&port->hlhead[idx]);
+
+	err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
+	if (err)
+		goto err;
+
+	dev->priv_flags |= IFF_IPVLAN_MASTER;
+	return 0;
+
+err:
+	kfree_rcu(port, rcu);
+	return err;
+}
+
+static void ipvlan_port_destroy(struct net_device *dev)
+{
+	struct ipvl_port *port = ipvlan_port_get_rtnl(dev);
+
+	dev->priv_flags &= ~IFF_IPVLAN_MASTER;
+	netdev_rx_handler_unregister(dev);
+	kfree_rcu(port, rcu);
+}
+
+/* ipvlan network devices have devices nesting below it and are a special
+ * "super class" of normal network devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key ipvlan_netdev_xmit_lock_key;
+static struct lock_class_key ipvlan_netdev_addr_lock_key;
+
+#define IPVLAN_FEATURES \
+	(NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
+	 NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \
+	 NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \
+	 NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)
+
+#define IPVLAN_STATE_MASK \
+	((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))
+
+static void ipvlan_set_lockdep_class_one(struct net_device *dev,
+					 struct netdev_queue *txq,
+					 void *_unused)
+{
+	lockdep_set_class(&txq->_xmit_lock, &ipvlan_netdev_xmit_lock_key);
+}
+
+static void ipvlan_set_lockdep_class(struct net_device *dev)
+{
+	lockdep_set_class(&dev->addr_list_lock, &ipvlan_netdev_addr_lock_key);
+	netdev_for_each_tx_queue(dev, ipvlan_set_lockdep_class_one, NULL);
+}
+
+static int ipvlan_init(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	const struct net_device *phy_dev = ipvlan->phy_dev;
+
+	dev->state = (dev->state & ~IPVLAN_STATE_MASK) |
+		     (phy_dev->state & IPVLAN_STATE_MASK);
+	dev->features = phy_dev->features & IPVLAN_FEATURES;
+	dev->features |= NETIF_F_LLTX;
+	dev->gso_max_size = phy_dev->gso_max_size;
+	dev->iflink = phy_dev->ifindex;
+	dev->hard_header_len = phy_dev->hard_header_len;
+
+	ipvlan_set_lockdep_class(dev);
+
+	ipvlan->pcpu_stats = alloc_percpu(struct ipvl_pcpu_stats);
+	if (!ipvlan->pcpu_stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void ipvlan_uninit(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port = ipvlan->port;
+
+	if (ipvlan->pcpu_stats)
+		free_percpu(ipvlan->pcpu_stats);
+
+	port->count -= 1;
+	if (!port->count)
+		ipvlan_port_destroy(port->dev);
+}
+
+static int ipvlan_open(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+	struct ipvl_addr *addr;
+
+	if (ipvlan->port->mode == IPVLAN_MODE_L3)
+		dev->flags |= IFF_NOARP;
+	else
+		dev->flags &= ~IFF_NOARP;
+
+	if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) {
+		list_for_each_entry(addr, &ipvlan->addrs, anode)
+			ipvlan_ht_addr_add(ipvlan, addr);
+	}
+	return dev_uc_add(phy_dev, phy_dev->dev_addr);
+}
+
+static int ipvlan_stop(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+	struct ipvl_addr *addr;
+
+	dev_uc_unsync(phy_dev, dev);
+	dev_mc_unsync(phy_dev, dev);
+
+	dev_uc_del(phy_dev, phy_dev->dev_addr);
+
+	if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) {
+		list_for_each_entry(addr, &ipvlan->addrs, anode)
+			ipvlan_ht_addr_del(addr, !dev->dismantle);
+	}
+	return 0;
+}
+
+netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	int skblen = skb->len;
+	int ret;
+
+	ret = ipvlan_queue_xmit(skb, dev);
+	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
+		struct ipvl_pcpu_stats *pcptr;
+
+		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);
+
+		u64_stats_update_begin(&pcptr->syncp);
+		pcptr->tx_pkts++;
+		pcptr->tx_bytes += skblen;
+		u64_stats_update_end(&pcptr->syncp);
+	} else {
+		this_cpu_inc(ipvlan->pcpu_stats->tx_drps);
+	}
+	return ret;
+}
+
+static netdev_features_t ipvlan_fix_features(struct net_device *dev,
+					     netdev_features_t features)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	return features & (ipvlan->sfeatures | ~IPVLAN_FEATURES);
+}
+
+static void ipvlan_change_rx_flags(struct net_device *dev, int change)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1);
+}
+
+static void ipvlan_set_broadcast_mac_filter(struct ipvl_dev *ipvlan, bool set)
+{
+	struct net_device *dev = ipvlan->dev;
+	unsigned int hashbit = ipvlan_mac_hash(dev->broadcast);
+
+	if (set && !test_bit(hashbit, ipvlan->mac_filters))
+		__set_bit(hashbit, ipvlan->mac_filters);
+	else if (!set && test_bit(hashbit, ipvlan->mac_filters))
+		__clear_bit(hashbit, ipvlan->mac_filters);
+}
+
+static void ipvlan_set_multicast_mac_filter(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
+		bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE);
+	} else {
+		struct netdev_hw_addr *ha;
+		DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE);
+
+		bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE);
+		netdev_for_each_mc_addr(ha, dev)
+			__set_bit(ipvlan_mac_hash(ha->addr), mc_filters);
+
+		bitmap_copy(ipvlan->mac_filters, mc_filters,
+			    IPVLAN_MAC_FILTER_SIZE);
+	}
+	dev_uc_sync(ipvlan->phy_dev, dev);
+	dev_mc_sync(ipvlan->phy_dev, dev);
+}
+
+static struct rtnl_link_stats64 *ipvlan_get_stats64(struct net_device *dev,
+						    struct rtnl_link_stats64 *s)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	if (ipvlan->pcpu_stats) {
+		struct ipvl_pcpu_stats *pcptr;
+		u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes;
+		u32 rx_errs = 0, tx_drps = 0;
+		u32 strt;
+		int idx;
+
+		for_each_possible_cpu(idx) {
+			pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx);
+			do {
+				strt= u64_stats_fetch_begin_irq(&pcptr->syncp);
+				rx_pkts = pcptr->rx_pkts;
+				rx_bytes = pcptr->rx_bytes;
+				rx_mcast = pcptr->rx_mcast;
+				tx_pkts = pcptr->tx_pkts;
+				tx_bytes = pcptr->tx_bytes;
+			} while (u64_stats_fetch_retry_irq(&pcptr->syncp,
+							   strt));
+
+			s->rx_packets += rx_pkts;
+			s->rx_bytes += rx_bytes;
+			s->multicast += rx_mcast;
+			s->tx_packets += tx_pkts;
+			s->tx_bytes += tx_bytes;
+
+			/* u32 values are updated without syncp protection. */
+			rx_errs += pcptr->rx_errs;
+			tx_drps += pcptr->tx_drps;
+		}
+		s->rx_errors = rx_errs;
+		s->rx_dropped = rx_errs;
+		s->tx_dropped = tx_drps;
+	}
+	return s;
+}
+
+static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+
+	return vlan_vid_add(phy_dev, proto, vid);
+}
+
+static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+				   u16 vid)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+
+	vlan_vid_del(phy_dev, proto, vid);
+	return 0;
+}
+
+static const struct net_device_ops ipvlan_netdev_ops = {
+	.ndo_init		= ipvlan_init,
+	.ndo_uninit		= ipvlan_uninit,
+	.ndo_open		= ipvlan_open,
+	.ndo_stop		= ipvlan_stop,
+	.ndo_start_xmit		= ipvlan_start_xmit,
+	.ndo_fix_features	= ipvlan_fix_features,
+	.ndo_change_rx_flags	= ipvlan_change_rx_flags,
+	.ndo_set_rx_mode	= ipvlan_set_multicast_mac_filter,
+	.ndo_get_stats64	= ipvlan_get_stats64,
+	.ndo_vlan_rx_add_vid	= ipvlan_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= ipvlan_vlan_rx_kill_vid,
+};
+
+static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
+			      unsigned short type, const void *daddr,
+			      const void *saddr, unsigned len)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+
+	/* TODO Probably use a different field than dev_addr so that the
+	 * mac-address on the virtual device is portable and can be carried
+	 * while the packets use the mac-addr on the physical device.
+	 */
+	return dev_hard_header(skb, phy_dev, type, daddr,
+			       saddr ? : dev->dev_addr, len);
+}
+
+static const struct header_ops ipvlan_header_ops = {
+	.create  	= ipvlan_hard_header,
+	.rebuild	= eth_rebuild_header,
+	.parse		= eth_header_parse,
+	.cache		= eth_header_cache,
+	.cache_update	= eth_header_cache_update,
+};
+
+static int ipvlan_ethtool_get_settings(struct net_device *dev,
+				       struct ethtool_cmd *cmd)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	return __ethtool_get_settings(ipvlan->phy_dev, cmd);
+}
+
+static void ipvlan_ethtool_get_drvinfo(struct net_device *dev,
+				       struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version));
+}
+
+static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	return ipvlan->msg_enable;
+}
+
+static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	ipvlan->msg_enable = value;
+}
+
+static const struct ethtool_ops ipvlan_ethtool_ops = {
+	.get_link	= ethtool_op_get_link,
+	.get_settings	= ipvlan_ethtool_get_settings,
+	.get_drvinfo	= ipvlan_ethtool_get_drvinfo,
+	.get_msglevel	= ipvlan_ethtool_get_msglevel,
+	.set_msglevel	= ipvlan_ethtool_set_msglevel,
+};
+
+static int ipvlan_nl_changelink(struct net_device *dev,
+				struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
+
+	if (data && data[IFLA_IPVLAN_MODE]) {
+		u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+
+		ipvlan_set_port_mode(port, nmode);
+	}
+	return 0;
+}
+
+static size_t ipvlan_nl_getsize(const struct net_device *dev)
+{
+	return (0
+		+ nla_total_size(2) /* IFLA_IPVLAN_MODE */
+		);
+}
+
+static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (data && data[IFLA_IPVLAN_MODE]) {
+		u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+
+		if (mode < IPVLAN_MODE_L2 || mode >= IPVLAN_MODE_MAX)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static int ipvlan_nl_fillinfo(struct sk_buff *skb,
+			      const struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
+	int ret = -EINVAL;
+
+	if (!port)
+		goto err;
+
+	ret = -EMSGSIZE;
+	if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode))
+		goto err;
+
+	return 0;
+
+err:
+	return ret;
+}
+
+static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
+			   struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port;
+	struct net_device *phy_dev;
+	int err;
+
+	if (!tb[IFLA_LINK])
+		return -EINVAL;
+
+	phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+	if (!phy_dev)
+		return -ENODEV;
+
+	if (ipvlan_dev_slave(phy_dev)) {
+		struct ipvl_dev *tmp = netdev_priv(phy_dev);
+
+		phy_dev = tmp->phy_dev;
+	} else if (!ipvlan_dev_master(phy_dev)) {
+		err = ipvlan_port_create(phy_dev);
+		if (err < 0)
+			return err;
+	}
+
+	port = ipvlan_port_get_rtnl(phy_dev);
+	if (data && data[IFLA_IPVLAN_MODE])
+		port->mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+
+	ipvlan->phy_dev = phy_dev;
+	ipvlan->dev = dev;
+	ipvlan->port = port;
+	ipvlan->sfeatures = IPVLAN_FEATURES;
+	INIT_LIST_HEAD(&ipvlan->addrs);
+	ipvlan->ipv4cnt = 0;
+	ipvlan->ipv6cnt = 0;
+
+	/* TODO Probably put random address here to be presented to the
+	 * world but keep using the physical-dev address for the outgoing
+	 * packets.
+	 */
+	memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN);
+
+	dev->priv_flags |= IFF_IPVLAN_SLAVE;
+
+	port->count += 1;
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto ipvlan_destroy_port;
+
+	err = netdev_upper_dev_link(phy_dev, dev);
+	if (err)
+		goto ipvlan_destroy_port;
+
+	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
+	netif_stacked_transfer_operstate(phy_dev, dev);
+	return 0;
+
+ipvlan_destroy_port:
+	port->count -= 1;
+	if (!port->count)
+		ipvlan_port_destroy(phy_dev);
+
+	return err;
+}
+
+static void ipvlan_link_delete(struct net_device *dev, struct list_head *head)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_addr *addr, *next;
+
+	if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) {
+		list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) {
+			ipvlan_ht_addr_del(addr, !dev->dismantle);
+			list_del_rcu(&addr->anode);
+		}
+	}
+	list_del_rcu(&ipvlan->pnode);
+	unregister_netdevice_queue(dev, head);
+	netdev_upper_dev_unlink(ipvlan->phy_dev, dev);
+}
+
+static void ipvlan_link_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+	dev->priv_flags |= IFF_UNICAST_FLT;
+	dev->netdev_ops = &ipvlan_netdev_ops;
+	dev->destructor = free_netdev;
+	dev->header_ops = &ipvlan_header_ops;
+	dev->ethtool_ops = &ipvlan_ethtool_ops;
+	dev->tx_queue_len = 0;
+}
+
+static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] =
+{
+	[IFLA_IPVLAN_MODE] = { .type = NLA_U16 },
+};
+
+static struct rtnl_link_ops ipvlan_link_ops = {
+	.kind		= "ipvlan",
+	.priv_size	= sizeof(struct ipvl_dev),
+
+	.get_size	= ipvlan_nl_getsize,
+	.policy		= ipvlan_nl_policy,
+	.validate	= ipvlan_nl_validate,
+	.fill_info	= ipvlan_nl_fillinfo,
+	.changelink	= ipvlan_nl_changelink,
+	.maxtype	= IFLA_IPVLAN_MAX,
+
+	.setup		= ipvlan_link_setup,
+	.newlink	= ipvlan_link_new,
+	.dellink	= ipvlan_link_delete,
+};
+
+int ipvlan_link_register(struct rtnl_link_ops *ops)
+{
+	return rtnl_link_register(ops);
+}
+
+static int ipvlan_device_event(struct notifier_block *unused,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct ipvl_dev *ipvlan, *next;
+	struct ipvl_port *port;
+	LIST_HEAD(lst_kill);
+
+	if (!ipvlan_dev_master(dev))
+		return NOTIFY_DONE;
+
+	port = ipvlan_port_get_rtnl(dev);
+
+	switch (event) {
+	case NETDEV_CHANGE:
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode)
+			netif_stacked_transfer_operstate(ipvlan->phy_dev,
+							 ipvlan->dev);
+		break;
+
+	case NETDEV_UNREGISTER:
+		if (dev->reg_state != NETREG_UNREGISTERING)
+			break;
+
+		list_for_each_entry_safe(ipvlan, next, &port->ipvlans,
+					 pnode)
+			ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev,
+							    &lst_kill);
+		unregister_netdevice_many(&lst_kill);
+		break;
+
+	case NETDEV_FEAT_CHANGE:
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+			ipvlan->dev->features = dev->features & IPVLAN_FEATURES;
+			ipvlan->dev->gso_max_size = dev->gso_max_size;
+			netdev_features_change(ipvlan->dev);
+		}
+		break;
+
+	case NETDEV_CHANGEMTU:
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode)
+			ipvlan_adjust_mtu(ipvlan, dev);
+		break;
+
+	case NETDEV_PRE_TYPE_CHANGE:
+		/* Forbid underlying device to change its type. */
+		return NOTIFY_BAD;
+	}
+	return NOTIFY_DONE;
+}
+
+static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
+{
+	struct ipvl_addr *addr;
+
+	if (ipvlan_addr_busy(ipvlan, ip6_addr, true)) {
+		netif_err(ipvlan, ifup, ipvlan->dev,
+			  "Failed to add IPv6=%pI6c addr for %s intf\n",
+			  ip6_addr, ipvlan->dev->name);
+		return -EINVAL;
+	}
+	addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC);
+	if (!addr)
+		return -ENOMEM;
+
+	addr->master = ipvlan;
+	memcpy(&addr->ip6addr, ip6_addr, sizeof(struct in6_addr));
+	addr->atype = IPVL_IPV6;
+	list_add_tail_rcu(&addr->anode, &ipvlan->addrs);
+	ipvlan->ipv6cnt++;
+	ipvlan_ht_addr_add(ipvlan, addr);
+
+	return 0;
+}
+
+static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
+{
+	struct ipvl_addr *addr;
+
+	addr = ipvlan_ht_addr_lookup(ipvlan->port, ip6_addr, true);
+	if (!addr)
+		return;
+
+	ipvlan_ht_addr_del(addr, true);
+	list_del_rcu(&addr->anode);
+	ipvlan->ipv6cnt--;
+	WARN_ON(ipvlan->ipv6cnt < 0);
+	kfree_rcu(addr, rcu);
+
+	return;
+}
+
+static int ipvlan_addr6_event(struct notifier_block *unused,
+			      unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr;
+	struct net_device *dev = (struct net_device *)if6->idev->dev;
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	if (!ipvlan_dev_slave(dev))
+		return NOTIFY_DONE;
+
+	if (!ipvlan || !ipvlan->port)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		if (ipvlan_add_addr6(ipvlan, &if6->addr))
+			return NOTIFY_BAD;
+		break;
+
+	case NETDEV_DOWN:
+		ipvlan_del_addr6(ipvlan, &if6->addr);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
+{
+	struct ipvl_addr *addr;
+
+	if (ipvlan_addr_busy(ipvlan, ip4_addr, false)) {
+		netif_err(ipvlan, ifup, ipvlan->dev,
+			  "Failed to add IPv4=%pI4 on %s intf.\n",
+			  ip4_addr, ipvlan->dev->name);
+		return -EINVAL;
+	}
+	addr = kzalloc(sizeof(struct ipvl_addr), GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	addr->master = ipvlan;
+	memcpy(&addr->ip4addr, ip4_addr, sizeof(struct in_addr));
+	addr->atype = IPVL_IPV4;
+	list_add_tail_rcu(&addr->anode, &ipvlan->addrs);
+	ipvlan->ipv4cnt++;
+	ipvlan_ht_addr_add(ipvlan, addr);
+	ipvlan_set_broadcast_mac_filter(ipvlan, true);
+
+	return 0;
+}
+
+static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
+{
+	struct ipvl_addr *addr;
+
+	addr = ipvlan_ht_addr_lookup(ipvlan->port, ip4_addr, false);
+	if (!addr)
+		return;
+
+	ipvlan_ht_addr_del(addr, true);
+	list_del_rcu(&addr->anode);
+	ipvlan->ipv4cnt--;
+	WARN_ON(ipvlan->ipv4cnt < 0);
+	if (!ipvlan->ipv4cnt)
+	    ipvlan_set_broadcast_mac_filter(ipvlan, false);
+	kfree_rcu(addr, rcu);
+
+	return;
+}
+
+static int ipvlan_addr4_event(struct notifier_block *unused,
+			      unsigned long event, void *ptr)
+{
+	struct in_ifaddr *if4 = (struct in_ifaddr *)ptr;
+	struct net_device *dev = (struct net_device *)if4->ifa_dev->dev;
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct in_addr ip4_addr;
+
+	if (!ipvlan_dev_slave(dev))
+		return NOTIFY_DONE;
+
+	if (!ipvlan || !ipvlan->port)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		ip4_addr.s_addr = if4->ifa_address;
+		if (ipvlan_add_addr4(ipvlan, &ip4_addr))
+			return NOTIFY_BAD;
+		break;
+
+	case NETDEV_DOWN:
+		ip4_addr.s_addr = if4->ifa_address;
+		ipvlan_del_addr4(ipvlan, &ip4_addr);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = {
+	.notifier_call = ipvlan_addr4_event,
+};
+
+static struct notifier_block ipvlan_notifier_block __read_mostly = {
+	.notifier_call = ipvlan_device_event,
+};
+
+static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = {
+	.notifier_call = ipvlan_addr6_event,
+};
+
+static int __init ipvlan_init_module(void)
+{
+	int err;
+
+	ipvlan_init_secret();
+	register_netdevice_notifier(&ipvlan_notifier_block);
+	register_inet6addr_notifier(&ipvlan_addr6_notifier_block);
+	register_inetaddr_notifier(&ipvlan_addr4_notifier_block);
+
+	err = ipvlan_link_register(&ipvlan_link_ops);
+	if (err < 0)
+		goto error;
+
+	return 0;
+error:
+	unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
+	unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
+	unregister_netdevice_notifier(&ipvlan_notifier_block);
+	return err;
+}
+
+static void __exit ipvlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&ipvlan_link_ops);
+	unregister_netdevice_notifier(&ipvlan_notifier_block);
+	unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
+	unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
+}
+
+module_init(ipvlan_init_module);
+module_exit(ipvlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>");
+MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs");
+MODULE_ALIAS_RTNL_LINK("ipvlan");
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5cd508787572..2cb772495f7a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1230,6 +1230,8 @@ enum netdev_priv_flags {
 	IFF_LIVE_ADDR_CHANGE		= 1<<20,
 	IFF_MACVLAN			= 1<<21,
 	IFF_XMIT_DST_RELEASE_PERM	= 1<<22,
+	IFF_IPVLAN_MASTER		= 1<<23,
+	IFF_IPVLAN_SLAVE		= 1<<24,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1255,6 +1257,8 @@ enum netdev_priv_flags {
 #define IFF_LIVE_ADDR_CHANGE		IFF_LIVE_ADDR_CHANGE
 #define IFF_MACVLAN			IFF_MACVLAN
 #define IFF_XMIT_DST_RELEASE_PERM	IFF_XMIT_DST_RELEASE_PERM
+#define IFF_IPVLAN_MASTER		IFF_IPVLAN_MASTER
+#define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
 
 /**
  *	struct net_device - The DEVICE structure.
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 7072d8325016..36bddc233633 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -330,6 +330,21 @@ enum macvlan_macaddr_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* IPVLAN section */
+enum {
+	IFLA_IPVLAN_UNSPEC,
+	IFLA_IPVLAN_MODE,
+	__IFLA_IPVLAN_MAX
+};
+
+#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1)
+
+enum ipvlan_mode {
+	IPVLAN_MODE_L2 = 0,
+	IPVLAN_MODE_L3,
+	IPVLAN_MODE_MAX
+};
+
 /* VXLAN section */
 enum {
 	IFLA_VXLAN_UNSPEC,
-- 
cgit v1.2.3-58-ga151


From 40af86a40cdcabd48ab9636fe13d6763a7d74bc9 Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Thu, 13 Nov 2014 00:30:23 +0100
Subject: NFC: st21nfca: Remove gpio_irq field in static and dts configuration

- phy->gpio_irq is never done out of the request resources.
- irq_of_parse_and_map is already done in the i2c core so client->irq is
already set when entering in st21nfca_hci_i2c_of_request_resources
- In case of static platform configuration client->irq can be set directly
- It simplifies the code a bit.

Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/nfc/st21nfca/i2c.c             | 30 +-----------------------------
 include/linux/platform_data/st21nfca.h |  1 -
 2 files changed, 1 insertion(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nfc/st21nfca/i2c.c b/drivers/nfc/st21nfca/i2c.c
index 22bc8c2311c2..44e1c769e335 100644
--- a/drivers/nfc/st21nfca/i2c.c
+++ b/drivers/nfc/st21nfca/i2c.c
@@ -72,7 +72,6 @@ struct st21nfca_i2c_phy {
 	struct nfc_hci_dev *hdev;
 
 	unsigned int gpio_ena;
-	unsigned int gpio_irq;
 	unsigned int irq_polarity;
 
 	struct sk_buff *pending_skb;
@@ -536,15 +535,7 @@ static int st21nfca_hci_i2c_of_request_resources(struct i2c_client *client)
 
 	phy->gpio_ena = gpio;
 
-	/* IRQ */
-	r = irq_of_parse_and_map(pp, 0);
-	if (r < 0) {
-		nfc_err(&client->dev, "Unable to get irq, error: %d\n", r);
-		return r;
-	}
-
-	phy->irq_polarity = irq_get_trigger_type(r);
-	client->irq = r;
+	phy->irq_polarity = irq_get_trigger_type(client->irq);
 
 	return 0;
 }
@@ -560,7 +551,6 @@ static int st21nfca_hci_i2c_request_resources(struct i2c_client *client)
 	struct st21nfca_nfc_platform_data *pdata;
 	struct st21nfca_i2c_phy *phy = i2c_get_clientdata(client);
 	int r;
-	int irq;
 
 	pdata = client->dev.platform_data;
 	if (pdata == NULL) {
@@ -569,17 +559,9 @@ static int st21nfca_hci_i2c_request_resources(struct i2c_client *client)
 	}
 
 	/* store for later use */
-	phy->gpio_irq = pdata->gpio_irq;
 	phy->gpio_ena = pdata->gpio_ena;
 	phy->irq_polarity = pdata->irq_polarity;
 
-	r = devm_gpio_request_one(&client->dev, phy->gpio_irq, GPIOF_IN,
-				  "wake_up");
-	if (r) {
-		pr_err("%s : gpio_request failed\n", __FILE__);
-		return -ENODEV;
-	}
-
 	if (phy->gpio_ena > 0) {
 		r = devm_gpio_request_one(&client->dev, phy->gpio_ena,
 					  GPIOF_OUT_INIT_HIGH, "clf_enable");
@@ -589,16 +571,6 @@ static int st21nfca_hci_i2c_request_resources(struct i2c_client *client)
 		}
 	}
 
-	/* IRQ */
-	irq = gpio_to_irq(phy->gpio_irq);
-	if (irq < 0) {
-		nfc_err(&client->dev,
-				"Unable to get irq number for GPIO %d error %d\n",
-				phy->gpio_irq, r);
-		return -ENODEV;
-	}
-	client->irq = irq;
-
 	return 0;
 }
 
diff --git a/include/linux/platform_data/st21nfca.h b/include/linux/platform_data/st21nfca.h
index 1730312398ff..5087fff96d86 100644
--- a/include/linux/platform_data/st21nfca.h
+++ b/include/linux/platform_data/st21nfca.h
@@ -24,7 +24,6 @@
 #define ST21NFCA_HCI_DRIVER_NAME "st21nfca_hci"
 
 struct st21nfca_nfc_platform_data {
-	unsigned int gpio_irq;
 	unsigned int gpio_ena;
 	unsigned int irq_polarity;
 };
-- 
cgit v1.2.3-58-ga151


From a80d0cb6f6addc5a1f3852466fe8d37ca4fe1350 Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Thu, 13 Nov 2014 00:30:26 +0100
Subject: NFC: st21nfcb: Remove gpio_irq field in static and dts configuration

- phy->gpio_irq is never done out of the request resources.
- irq_of_parse_and_map is already done in the i2c core so client->irq is
already set when entering in st21nfcb_hci_i2c_of_request_resources
- In case of static platform configuration client->irq can be set directly.
- It simplifies the code a bit.

Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/nfc/st21nfcb/i2c.c             | 30 +-----------------------------
 include/linux/platform_data/st21nfcb.h |  1 -
 2 files changed, 1 insertion(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nfc/st21nfcb/i2c.c b/drivers/nfc/st21nfcb/i2c.c
index 73f7ddef57c4..e0c4901936f0 100644
--- a/drivers/nfc/st21nfcb/i2c.c
+++ b/drivers/nfc/st21nfcb/i2c.c
@@ -50,7 +50,6 @@ struct st21nfcb_i2c_phy {
 	struct i2c_client *i2c_dev;
 	struct llt_ndlc *ndlc;
 
-	unsigned int gpio_irq;
 	unsigned int gpio_reset;
 	unsigned int irq_polarity;
 
@@ -262,15 +261,7 @@ static int st21nfcb_nci_i2c_of_request_resources(struct i2c_client *client)
 	}
 	phy->gpio_reset = gpio;
 
-	/* IRQ */
-	r = irq_of_parse_and_map(pp, 0);
-	if (r < 0) {
-		nfc_err(&client->dev, "Unable to get irq, error: %d\n", r);
-		return r;
-	}
-
-	phy->irq_polarity = irq_get_trigger_type(r);
-	client->irq = r;
+	phy->irq_polarity = irq_get_trigger_type(client->irq);
 
 	return 0;
 }
@@ -286,7 +277,6 @@ static int st21nfcb_nci_i2c_request_resources(struct i2c_client *client)
 	struct st21nfcb_nfc_platform_data *pdata;
 	struct st21nfcb_i2c_phy *phy = i2c_get_clientdata(client);
 	int r;
-	int irq;
 
 	pdata = client->dev.platform_data;
 	if (pdata == NULL) {
@@ -295,17 +285,9 @@ static int st21nfcb_nci_i2c_request_resources(struct i2c_client *client)
 	}
 
 	/* store for later use */
-	phy->gpio_irq = pdata->gpio_irq;
 	phy->gpio_reset = pdata->gpio_reset;
 	phy->irq_polarity = pdata->irq_polarity;
 
-	r = devm_gpio_request_one(&client->dev, phy->gpio_irq,
-				GPIOF_IN, "clf_irq");
-	if (r) {
-		pr_err("%s : gpio_request failed\n", __FILE__);
-		return -ENODEV;
-	}
-
 	r = devm_gpio_request_one(&client->dev,
 			phy->gpio_reset, GPIOF_OUT_INIT_HIGH, "clf_reset");
 	if (r) {
@@ -313,16 +295,6 @@ static int st21nfcb_nci_i2c_request_resources(struct i2c_client *client)
 		return -ENODEV;
 	}
 
-	/* IRQ */
-	irq = gpio_to_irq(phy->gpio_irq);
-	if (irq < 0) {
-		nfc_err(&client->dev,
-			"Unable to get irq number for GPIO %d error %d\n",
-			phy->gpio_irq, r);
-		return -ENODEV;
-	}
-	client->irq = irq;
-
 	return 0;
 }
 
diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h
index 2d11f1f5efab..c3b432f5b63e 100644
--- a/include/linux/platform_data/st21nfcb.h
+++ b/include/linux/platform_data/st21nfcb.h
@@ -24,7 +24,6 @@
 #define ST21NFCB_NCI_DRIVER_NAME "st21nfcb_nci"
 
 struct st21nfcb_nfc_platform_data {
-	unsigned int gpio_irq;
 	unsigned int gpio_reset;
 	unsigned int irq_polarity;
 };
-- 
cgit v1.2.3-58-ga151


From f6f6424ba773da6221ecaaa70973eb4dacfa03b2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Fri, 28 Nov 2014 14:34:15 +0100
Subject: net: make vid as a parameter for ndo_fdb_add/ndo_fdb_del

Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
u16 vid to drivers from there.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c      |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  4 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  9 +++--
 drivers/net/macvlan.c                            |  4 +-
 drivers/net/vxlan.c                              |  4 +-
 include/linux/netdevice.h                        |  8 ++--
 include/linux/rtnetlink.h                        |  6 ++-
 net/bridge/br_fdb.c                              | 39 ++----------------
 net/bridge/br_private.h                          |  4 +-
 net/core/rtnetlink.c                             | 50 ++++++++++++++++++++----
 10 files changed, 70 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 7262077ad547..5ed5e4036dd9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7536,7 +7536,7 @@ static int i40e_get_phys_port_id(struct net_device *netdev,
  */
 static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			    struct net_device *dev,
-			    const unsigned char *addr,
+			    const unsigned char *addr, u16 vid,
 			    u16 flags)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 52776f9e1f71..74aadeb56ada 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7708,7 +7708,7 @@ static int ixgbe_set_features(struct net_device *netdev,
 
 static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			     struct net_device *dev,
-			     const unsigned char *addr,
+			     const unsigned char *addr, u16 vid,
 			     u16 flags)
 {
 	/* guarantee we can provide a unique filter for the unicast address */
@@ -7717,7 +7717,7 @@ static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			return -ENOMEM;
 	}
 
-	return ndo_dflt_fdb_add(ndm, tb, dev, addr, flags);
+	return ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, flags);
 }
 
 static int ixgbe_ndo_bridge_setlink(struct net_device *dev,
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index a913b3ad2f89..3227c8063edd 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -376,13 +376,14 @@ static int qlcnic_set_mac(struct net_device *netdev, void *p)
 }
 
 static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-			struct net_device *netdev, const unsigned char *addr)
+			struct net_device *netdev,
+			const unsigned char *addr, u16 vid)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	int err = -EOPNOTSUPP;
 
 	if (!adapter->fdb_mac_learn)
-		return ndo_dflt_fdb_del(ndm, tb, netdev, addr);
+		return ndo_dflt_fdb_del(ndm, tb, netdev, addr, vid);
 
 	if ((adapter->flags & QLCNIC_ESWITCH_ENABLED) ||
 	    qlcnic_sriov_check(adapter)) {
@@ -401,13 +402,13 @@ static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 
 static int qlcnic_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			struct net_device *netdev,
-			const unsigned char *addr, u16 flags)
+			const unsigned char *addr, u16 vid, u16 flags)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	int err = 0;
 
 	if (!adapter->fdb_mac_learn)
-		return ndo_dflt_fdb_add(ndm, tb, netdev, addr, flags);
+		return ndo_dflt_fdb_add(ndm, tb, netdev, addr, vid, flags);
 
 	if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED) &&
 	    !qlcnic_sriov_check(adapter)) {
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 11d4b3506d6e..9538674587aa 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -873,7 +873,7 @@ static int macvlan_vlan_rx_kill_vid(struct net_device *dev,
 
 static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			   struct net_device *dev,
-			   const unsigned char *addr,
+			   const unsigned char *addr, u16 vid,
 			   u16 flags)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
@@ -898,7 +898,7 @@ static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 
 static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 			   struct net_device *dev,
-			   const unsigned char *addr)
+			   const unsigned char *addr, u16 vid)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	int err = -EINVAL;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index d802a1ae4560..31ecb03368c6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -849,7 +849,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
 /* Add static entry (via netlink) */
 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			 struct net_device *dev,
-			 const unsigned char *addr, u16 flags)
+			 const unsigned char *addr, u16 vid, u16 flags)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	/* struct net *net = dev_net(vxlan->dev); */
@@ -885,7 +885,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 /* Delete entry (via netlink) */
 static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 			    struct net_device *dev,
-			    const unsigned char *addr)
+			    const unsigned char *addr, u16 vid)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2cb772495f7a..589929cf4700 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -951,11 +951,11 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *
  * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
  *		      struct net_device *dev,
- *		      const unsigned char *addr, u16 flags)
+ *		      const unsigned char *addr, u16 vid, u16 flags)
  *	Adds an FDB entry to dev for addr.
  * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
  *		      struct net_device *dev,
- *		      const unsigned char *addr)
+ *		      const unsigned char *addr, u16 vid)
  *	Deletes the FDB entry from dev coresponding to addr.
  * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
  *		       struct net_device *dev, struct net_device *filter_dev,
@@ -1128,11 +1128,13 @@ struct net_device_ops {
 					       struct nlattr *tb[],
 					       struct net_device *dev,
 					       const unsigned char *addr,
+					       u16 vid,
 					       u16 flags);
 	int			(*ndo_fdb_del)(struct ndmsg *ndm,
 					       struct nlattr *tb[],
 					       struct net_device *dev,
-					       const unsigned char *addr);
+					       const unsigned char *addr,
+					       u16 vid);
 	int			(*ndo_fdb_dump)(struct sk_buff *skb,
 						struct netlink_callback *cb,
 						struct net_device *dev,
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6cacbce1a06c..063f0f581fe0 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -94,11 +94,13 @@ extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
 			    struct nlattr *tb[],
 			    struct net_device *dev,
 			    const unsigned char *addr,
-			     u16 flags);
+			    u16 vid,
+			    u16 flags);
 extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
 			    struct nlattr *tb[],
 			    struct net_device *dev,
-			    const unsigned char *addr);
+			    const unsigned char *addr,
+			    u16 vid);
 
 extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 				   struct net_device *dev, u16 mode);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 08ef4e7a2439..b1be971eb06c 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -805,33 +805,17 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p,
 /* Add new permanent fdb entry with RTM_NEWNEIGH */
 int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	       struct net_device *dev,
-	       const unsigned char *addr, u16 nlh_flags)
+	       const unsigned char *addr, u16 vid, u16 nlh_flags)
 {
 	struct net_bridge_port *p;
 	int err = 0;
 	struct net_port_vlans *pv;
-	unsigned short vid = VLAN_N_VID;
 
 	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) {
 		pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
 		return -EINVAL;
 	}
 
-	if (tb[NDA_VLAN]) {
-		if (nla_len(tb[NDA_VLAN]) != sizeof(unsigned short)) {
-			pr_info("bridge: RTM_NEWNEIGH with invalid vlan\n");
-			return -EINVAL;
-		}
-
-		vid = nla_get_u16(tb[NDA_VLAN]);
-
-		if (!vid || vid >= VLAN_VID_MASK) {
-			pr_info("bridge: RTM_NEWNEIGH with invalid vlan id %d\n",
-				vid);
-			return -EINVAL;
-		}
-	}
-
 	if (is_zero_ether_addr(addr)) {
 		pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n");
 		return -EINVAL;
@@ -845,7 +829,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	}
 
 	pv = nbp_get_vlan_info(p);
-	if (vid != VLAN_N_VID) {
+	if (vid) {
 		if (!pv || !test_bit(vid, pv->vlan_bitmap)) {
 			pr_info("bridge: RTM_NEWNEIGH with unconfigured "
 				"vlan %d on port %s\n", vid, dev->name);
@@ -903,27 +887,12 @@ static int __br_fdb_delete(struct net_bridge_port *p,
 /* Remove neighbor entry with RTM_DELNEIGH */
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 		  struct net_device *dev,
-		  const unsigned char *addr)
+		  const unsigned char *addr, u16 vid)
 {
 	struct net_bridge_port *p;
 	int err;
 	struct net_port_vlans *pv;
-	unsigned short vid = VLAN_N_VID;
-
-	if (tb[NDA_VLAN]) {
-		if (nla_len(tb[NDA_VLAN]) != sizeof(unsigned short)) {
-			pr_info("bridge: RTM_NEWNEIGH with invalid vlan\n");
-			return -EINVAL;
-		}
 
-		vid = nla_get_u16(tb[NDA_VLAN]);
-
-		if (!vid || vid >= VLAN_VID_MASK) {
-			pr_info("bridge: RTM_NEWNEIGH with invalid vlan id %d\n",
-				vid);
-			return -EINVAL;
-		}
-	}
 	p = br_port_get_rtnl(dev);
 	if (p == NULL) {
 		pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n",
@@ -932,7 +901,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	}
 
 	pv = nbp_get_vlan_info(p);
-	if (vid != VLAN_N_VID) {
+	if (vid) {
 		if (!pv || !test_bit(vid, pv->vlan_bitmap)) {
 			pr_info("bridge: RTM_DELNEIGH with unconfigured "
 				"vlan %d on port %s\n", vid, dev->name);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 5b370773ad9c..1b529da8234d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -404,9 +404,9 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		   const unsigned char *addr, u16 vid, bool added_by_user);
 
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
-		  struct net_device *dev, const unsigned char *addr);
+		  struct net_device *dev, const unsigned char *addr, u16 vid);
 int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
-	       const unsigned char *addr, u16 nlh_flags);
+	       const unsigned char *addr, u16 vid, u16 nlh_flags);
 int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		struct net_device *dev, struct net_device *fdev, int idx);
 int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b9b7dfaf202b..1a233c1c8ab4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -36,6 +36,7 @@
 #include <linux/mutex.h>
 #include <linux/if_addr.h>
 #include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
 #include <linux/pci.h>
 #include <linux/etherdevice.h>
 
@@ -2312,7 +2313,7 @@ errout:
 int ndo_dflt_fdb_add(struct ndmsg *ndm,
 		     struct nlattr *tb[],
 		     struct net_device *dev,
-		     const unsigned char *addr,
+		     const unsigned char *addr, u16 vid,
 		     u16 flags)
 {
 	int err = -EINVAL;
@@ -2338,6 +2339,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
 }
 EXPORT_SYMBOL(ndo_dflt_fdb_add);
 
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
+{
+	u16 vid = 0;
+
+	if (vlan_attr) {
+		if (nla_len(vlan_attr) != sizeof(u16)) {
+			pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
+			return -EINVAL;
+		}
+
+		vid = nla_get_u16(vlan_attr);
+
+		if (!vid || vid >= VLAN_VID_MASK) {
+			pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
+				vid);
+			return -EINVAL;
+		}
+	}
+	*p_vid = vid;
+	return 0;
+}
+
 static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
@@ -2345,6 +2368,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct nlattr *tb[NDA_MAX+1];
 	struct net_device *dev;
 	u8 *addr;
+	u16 vid;
 	int err;
 
 	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
@@ -2370,6 +2394,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	addr = nla_data(tb[NDA_LLADDR]);
 
+	err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+	if (err)
+		return err;
+
 	err = -EOPNOTSUPP;
 
 	/* Support fdb on master device the net/bridge default case */
@@ -2378,7 +2406,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 		struct net_device *br_dev = netdev_master_upper_dev_get(dev);
 		const struct net_device_ops *ops = br_dev->netdev_ops;
 
-		err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags);
+		err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
+				       nlh->nlmsg_flags);
 		if (err)
 			goto out;
 		else
@@ -2389,9 +2418,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if ((ndm->ndm_flags & NTF_SELF)) {
 		if (dev->netdev_ops->ndo_fdb_add)
 			err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+							   vid,
 							   nlh->nlmsg_flags);
 		else
-			err = ndo_dflt_fdb_add(ndm, tb, dev, addr,
+			err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
 					       nlh->nlmsg_flags);
 
 		if (!err) {
@@ -2409,7 +2439,7 @@ out:
 int ndo_dflt_fdb_del(struct ndmsg *ndm,
 		     struct nlattr *tb[],
 		     struct net_device *dev,
-		     const unsigned char *addr)
+		     const unsigned char *addr, u16 vid)
 {
 	int err = -EINVAL;
 
@@ -2438,6 +2468,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct net_device *dev;
 	int err = -EINVAL;
 	__u8 *addr;
+	u16 vid;
 
 	if (!netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
@@ -2465,6 +2496,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	addr = nla_data(tb[NDA_LLADDR]);
 
+	err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+	if (err)
+		return err;
+
 	err = -EOPNOTSUPP;
 
 	/* Support fdb on master device the net/bridge default case */
@@ -2474,7 +2509,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 		const struct net_device_ops *ops = br_dev->netdev_ops;
 
 		if (ops->ndo_fdb_del)
-			err = ops->ndo_fdb_del(ndm, tb, dev, addr);
+			err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
 
 		if (err)
 			goto out;
@@ -2485,9 +2520,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 	/* Embedded bridge, macvlan, and any other device support */
 	if (ndm->ndm_flags & NTF_SELF) {
 		if (dev->netdev_ops->ndo_fdb_del)
-			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
+			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
+							   vid);
 		else
-			err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
+			err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
 
 		if (!err) {
 			rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
-- 
cgit v1.2.3-58-ga151


From 02637fce3e0103ba086b9c33b6d529e69460e4b6 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Fri, 28 Nov 2014 14:34:16 +0100
Subject: net: rename netdev_phys_port_id to more generic name

So this can be reused for identification of other "items" as well.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Reviewed-by: Thomas Graf <tgraf@suug.ch>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c      |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c   |  2 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  2 +-
 include/linux/netdevice.h                        | 16 ++++++++--------
 net/core/dev.c                                   |  2 +-
 net/core/net-sysfs.c                             |  2 +-
 net/core/rtnetlink.c                             |  6 +++---
 8 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index c4bd025c74c9..336ef3cf5773 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -12537,7 +12537,7 @@ static int bnx2x_validate_addr(struct net_device *dev)
 }
 
 static int bnx2x_get_phys_port_id(struct net_device *netdev,
-				  struct netdev_phys_port_id *ppid)
+				  struct netdev_phys_item_id *ppid)
 {
 	struct bnx2x *bp = netdev_priv(netdev);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5ed5e4036dd9..9ae4270db0b3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7511,7 +7511,7 @@ static void i40e_del_vxlan_port(struct net_device *netdev,
 
 #endif
 static int i40e_get_phys_port_id(struct net_device *netdev,
-				 struct netdev_phys_port_id *ppid)
+				 struct netdev_phys_item_id *ppid)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_pf *pf = np->vsi->back;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index b7c99780aef3..1597fb07576c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2259,7 +2259,7 @@ static int mlx4_en_set_vf_link_state(struct net_device *dev, int vf, int link_st
 
 #define PORT_ID_BYTE_LEN 8
 static int mlx4_en_get_phys_port_id(struct net_device *dev,
-				    struct netdev_phys_port_id *ppid)
+				    struct netdev_phys_item_id *ppid)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_dev *mdev = priv->mdev->dev;
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index 3227c8063edd..1aa25b13ace1 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -461,7 +461,7 @@ static void qlcnic_82xx_cancel_idc_work(struct qlcnic_adapter *adapter)
 }
 
 static int qlcnic_get_phys_port_id(struct net_device *netdev,
-				   struct netdev_phys_port_id *ppid)
+				   struct netdev_phys_item_id *ppid)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	struct qlcnic_hardware_context *ahw = adapter->ahw;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 589929cf4700..4bd41d72559d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -754,13 +754,13 @@ struct netdev_fcoe_hbainfo {
 };
 #endif
 
-#define MAX_PHYS_PORT_ID_LEN 32
+#define MAX_PHYS_ITEM_ID_LEN 32
 
-/* This structure holds a unique identifier to identify the
- * physical port used by a netdevice.
+/* This structure holds a unique identifier to identify some
+ * physical item (port for example) used by a netdevice.
  */
-struct netdev_phys_port_id {
-	unsigned char id[MAX_PHYS_PORT_ID_LEN];
+struct netdev_phys_item_id {
+	unsigned char id[MAX_PHYS_ITEM_ID_LEN];
 	unsigned char id_len;
 };
 
@@ -976,7 +976,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *	USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
  *
  * int (*ndo_get_phys_port_id)(struct net_device *dev,
- *			       struct netdev_phys_port_id *ppid);
+ *			       struct netdev_phys_item_id *ppid);
  *	Called to get ID of physical port of this device. If driver does
  *	not implement this, it is assumed that the hw is not able to have
  *	multiple net devices on single physical port.
@@ -1152,7 +1152,7 @@ struct net_device_ops {
 	int			(*ndo_change_carrier)(struct net_device *dev,
 						      bool new_carrier);
 	int			(*ndo_get_phys_port_id)(struct net_device *dev,
-							struct netdev_phys_port_id *ppid);
+							struct netdev_phys_item_id *ppid);
 	void			(*ndo_add_vxlan_port)(struct  net_device *dev,
 						      sa_family_t sa_family,
 						      __be16 port);
@@ -2870,7 +2870,7 @@ void dev_set_group(struct net_device *, int);
 int dev_set_mac_address(struct net_device *, struct sockaddr *);
 int dev_change_carrier(struct net_device *, bool new_carrier);
 int dev_get_phys_port_id(struct net_device *dev,
-			 struct netdev_phys_port_id *ppid);
+			 struct netdev_phys_item_id *ppid);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/net/core/dev.c b/net/core/dev.c
index ac4836241a96..0814a560e5f3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5846,7 +5846,7 @@ EXPORT_SYMBOL(dev_change_carrier);
  *	Get device physical port ID
  */
 int dev_get_phys_port_id(struct net_device *dev,
-			 struct netdev_phys_port_id *ppid)
+			 struct netdev_phys_item_id *ppid)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1a24602cd54e..26c46f4726c5 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -404,7 +404,7 @@ static ssize_t phys_port_id_show(struct device *dev,
 		return restart_syscall();
 
 	if (dev_isalive(netdev)) {
-		struct netdev_phys_port_id ppid;
+		struct netdev_phys_item_id ppid;
 
 		ret = dev_get_phys_port_id(netdev, &ppid);
 		if (!ret)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1a233c1c8ab4..0640f1fbe9dd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -869,7 +869,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
 	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
 	       + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
-	       + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
+	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_PORT_ID */
 }
 
 static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -953,7 +953,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
 static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
 {
 	int err;
-	struct netdev_phys_port_id ppid;
+	struct netdev_phys_item_id ppid;
 
 	err = dev_get_phys_port_id(dev, &ppid);
 	if (err) {
@@ -1197,7 +1197,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PROMISCUITY]	= { .type = NLA_U32 },
 	[IFLA_NUM_TX_QUEUES]	= { .type = NLA_U32 },
 	[IFLA_NUM_RX_QUEUES]	= { .type = NLA_U32 },
-	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
+	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
 	[IFLA_CARRIER_CHANGES]	= { .type = NLA_U32 },  /* ignored */
 };
 
-- 
cgit v1.2.3-58-ga151


From 007f790c8276271de26416f90d55561bcc96588a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Fri, 28 Nov 2014 14:34:17 +0100
Subject: net: introduce generic switch devices support

The goal of this is to provide a possibility to support various switch
chips. Drivers should implement relevant ndos to do so. Now there is
only one ndo defined:
- for getting physical switch id is in place.

Note that user can use random port netdevice to access the switch.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Reviewed-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/switchdev.txt | 59 ++++++++++++++++++++++++++++++++++
 MAINTAINERS                            |  7 ++++
 include/linux/netdevice.h              | 10 ++++++
 include/net/switchdev.h                | 30 +++++++++++++++++
 net/Kconfig                            |  1 +
 net/Makefile                           |  3 ++
 net/switchdev/Kconfig                  | 13 ++++++++
 net/switchdev/Makefile                 |  5 +++
 net/switchdev/switchdev.c              | 33 +++++++++++++++++++
 9 files changed, 161 insertions(+)
 create mode 100644 Documentation/networking/switchdev.txt
 create mode 100644 include/net/switchdev.h
 create mode 100644 net/switchdev/Kconfig
 create mode 100644 net/switchdev/Makefile
 create mode 100644 net/switchdev/switchdev.c

(limited to 'include/linux')

diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt
new file mode 100644
index 000000000000..f981a9295a39
--- /dev/null
+++ b/Documentation/networking/switchdev.txt
@@ -0,0 +1,59 @@
+Switch (and switch-ish) device drivers HOWTO
+===========================
+
+Please note that the word "switch" is here used in very generic meaning.
+This include devices supporting L2/L3 but also various flow offloading chips,
+including switches embedded into SR-IOV NICs.
+
+Lets describe a topology a bit. Imagine the following example:
+
+       +----------------------------+    +---------------+
+       |     SOME switch chip       |    |      CPU      |
+       +----------------------------+    +---------------+
+       port1 port2 port3 port4 MNGMNT    |     PCI-E     |
+         |     |     |     |     |       +---------------+
+        PHY   PHY    |     |     |         |  NIC0 NIC1
+                     |     |     |         |   |    |
+                     |     |     +- PCI-E -+   |    |
+                     |     +------- MII -------+    |
+                     +------------- MII ------------+
+
+In this example, there are two independent lines between the switch silicon
+and CPU. NIC0 and NIC1 drivers are not aware of a switch presence. They are
+separate from the switch driver. SOME switch chip is by managed by a driver
+via PCI-E device MNGMNT. Note that MNGMNT device, NIC0 and NIC1 may be
+connected to some other type of bus.
+
+Now, for the previous example show the representation in kernel:
+
+       +----------------------------+    +---------------+
+       |     SOME switch chip       |    |      CPU      |
+       +----------------------------+    +---------------+
+       sw0p0 sw0p1 sw0p2 sw0p3 MNGMNT    |     PCI-E     |
+         |     |     |     |     |       +---------------+
+        PHY   PHY    |     |     |         |  eth0 eth1
+                     |     |     |         |   |    |
+                     |     |     +- PCI-E -+   |    |
+                     |     +------- MII -------+    |
+                     +------------- MII ------------+
+
+Lets call the example switch driver for SOME switch chip "SOMEswitch". This
+driver takes care of PCI-E device MNGMNT. There is a netdevice instance sw0pX
+created for each port of a switch. These netdevices are instances
+of "SOMEswitch" driver. sw0pX netdevices serve as a "representation"
+of the switch chip. eth0 and eth1 are instances of some other existing driver.
+
+The only difference of the switch-port netdevice from the ordinary netdevice
+is that is implements couple more NDOs:
+
+  ndo_switch_parent_id_get - This returns the same ID for two port netdevices
+			     of the same physical switch chip. This is
+			     mandatory to be implemented by all switch drivers
+			     and serves the caller for recognition of a port
+			     netdevice.
+  ndo_switch_parent_* - Functions that serve for a manipulation of the switch
+			chip itself (it can be though of as a "parent" of the
+			port, therefore the name). They are not port-specific.
+			Caller might use arbitrary port netdevice of the same
+			switch and it will make no difference.
+  ndo_switch_port_* - Functions that serve for a port-specific manipulation.
diff --git a/MAINTAINERS b/MAINTAINERS
index 6b880deae3d2..3aba0ac2d5a3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9059,6 +9059,13 @@ F:	lib/swiotlb.c
 F:	arch/*/kernel/pci-swiotlb.c
 F:	include/linux/swiotlb.h
 
+SWITCHDEV
+M:	Jiri Pirko <jiri@resnulli.us>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	net/switchdev/
+F:	include/net/switchdev.h
+
 SYNOPSYS ARC ARCHITECTURE
 M:	Vineet Gupta <vgupta@synopsys.com>
 S:	Supported
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4bd41d72559d..3603f31e78f3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1018,6 +1018,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *	performing GSO on a packet. The device returns true if it is
  *	able to GSO the packet, false otherwise. If the return value is
  *	false the stack will do software GSO.
+ *
+ * int (*ndo_switch_parent_id_get)(struct net_device *dev,
+ *				   struct netdev_phys_item_id *psid);
+ *	Called to get an ID of the switch chip this port is part of.
+ *	If driver implements this, it indicates that it represents a port
+ *	of a switch chip.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1171,6 +1177,10 @@ struct net_device_ops {
 	int			(*ndo_get_lock_subclass)(struct net_device *dev);
 	bool			(*ndo_gso_check) (struct sk_buff *skb,
 						  struct net_device *dev);
+#ifdef CONFIG_NET_SWITCHDEV
+	int			(*ndo_switch_parent_id_get)(struct net_device *dev,
+							    struct netdev_phys_item_id *psid);
+#endif
 };
 
 /**
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
new file mode 100644
index 000000000000..7a52360a1446
--- /dev/null
+++ b/include/net/switchdev.h
@@ -0,0 +1,30 @@
+/*
+ * include/net/switchdev.h - Switch device API
+ * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef _LINUX_SWITCHDEV_H_
+#define _LINUX_SWITCHDEV_H_
+
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_NET_SWITCHDEV
+
+int netdev_switch_parent_id_get(struct net_device *dev,
+				struct netdev_phys_item_id *psid);
+
+#else
+
+static inline int netdev_switch_parent_id_get(struct net_device *dev,
+					      struct netdev_phys_item_id *psid)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif
+
+#endif /* _LINUX_SWITCHDEV_H_ */
diff --git a/net/Kconfig b/net/Kconfig
index 99815b5454bf..ff9ffc17fa0e 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -228,6 +228,7 @@ source "net/vmw_vsock/Kconfig"
 source "net/netlink/Kconfig"
 source "net/mpls/Kconfig"
 source "net/hsr/Kconfig"
+source "net/switchdev/Kconfig"
 
 config RPS
 	boolean
diff --git a/net/Makefile b/net/Makefile
index 7ed1970074b0..95fc694e4ddc 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -73,3 +73,6 @@ obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
 obj-$(CONFIG_VSOCKETS)	+= vmw_vsock/
 obj-$(CONFIG_NET_MPLS_GSO)	+= mpls/
 obj-$(CONFIG_HSR)		+= hsr/
+ifneq ($(CONFIG_NET_SWITCHDEV),)
+obj-y				+= switchdev/
+endif
diff --git a/net/switchdev/Kconfig b/net/switchdev/Kconfig
new file mode 100644
index 000000000000..155754588fd6
--- /dev/null
+++ b/net/switchdev/Kconfig
@@ -0,0 +1,13 @@
+#
+# Configuration for Switch device support
+#
+
+config NET_SWITCHDEV
+	boolean "Switch (and switch-ish) device support (EXPERIMENTAL)"
+	depends on INET
+	---help---
+	  This module provides glue between core networking code and device
+	  drivers in order to support hardware switch chips in very generic
+	  meaning of the word "switch". This include devices supporting L2/L3 but
+	  also various flow offloading chips, including switches embedded into
+	  SR-IOV NICs.
diff --git a/net/switchdev/Makefile b/net/switchdev/Makefile
new file mode 100644
index 000000000000..5ed63ed324d0
--- /dev/null
+++ b/net/switchdev/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Switch device API
+#
+
+obj-$(CONFIG_NET_SWITCHDEV) += switchdev.o
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
new file mode 100644
index 000000000000..66973deaae56
--- /dev/null
+++ b/net/switchdev/switchdev.c
@@ -0,0 +1,33 @@
+/*
+ * net/switchdev/switchdev.c - Switch device API
+ * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <net/switchdev.h>
+
+/**
+ *	netdev_switch_parent_id_get - Get ID of a switch
+ *	@dev: port device
+ *	@psid: switch ID
+ *
+ *	Get ID of a switch this port is part of.
+ */
+int netdev_switch_parent_id_get(struct net_device *dev,
+				struct netdev_phys_item_id *psid)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_switch_parent_id_get)
+		return -EOPNOTSUPP;
+	return ops->ndo_switch_parent_id_get(dev, psid);
+}
+EXPORT_SYMBOL(netdev_switch_parent_id_get);
-- 
cgit v1.2.3-58-ga151


From 38dcf357aed299186ecb090cc2f5290cc17d637d Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Fri, 28 Nov 2014 14:34:20 +0100
Subject: bridge: call netdev_sw_port_stp_update when bridge port STP status
 changes

To notify switch driver of change in STP state of bridge port, add new
.ndo op and provide switchdev wrapper func to call ndo op. Use it in bridge
code then.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  5 +++++
 include/net/switchdev.h   |  7 +++++++
 net/bridge/br_stp.c       |  7 +++++++
 net/switchdev/switchdev.c | 19 +++++++++++++++++++
 4 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3603f31e78f3..29c92ee9ed56 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1024,6 +1024,9 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *	Called to get an ID of the switch chip this port is part of.
  *	If driver implements this, it indicates that it represents a port
  *	of a switch chip.
+ * int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state);
+ *	Called to notify switch device port of bridge port STP
+ *	state change.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1180,6 +1183,8 @@ struct net_device_ops {
 #ifdef CONFIG_NET_SWITCHDEV
 	int			(*ndo_switch_parent_id_get)(struct net_device *dev,
 							    struct netdev_phys_item_id *psid);
+	int			(*ndo_switch_port_stp_update)(struct net_device *dev,
+							      u8 state);
 #endif
 };
 
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 7a52360a1446..8a6d1641fd9b 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -16,6 +16,7 @@
 
 int netdev_switch_parent_id_get(struct net_device *dev,
 				struct netdev_phys_item_id *psid);
+int netdev_switch_port_stp_update(struct net_device *dev, u8 state);
 
 #else
 
@@ -25,6 +26,12 @@ static inline int netdev_switch_parent_id_get(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int netdev_switch_port_stp_update(struct net_device *dev,
+						u8 state)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _LINUX_SWITCHDEV_H_ */
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 2b047bcf42a4..fb3ebe615513 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -12,6 +12,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/rculist.h>
+#include <net/switchdev.h>
 
 #include "br_private.h"
 #include "br_private_stp.h"
@@ -38,7 +39,13 @@ void br_log_state(const struct net_bridge_port *p)
 
 void br_set_state(struct net_bridge_port *p, unsigned int state)
 {
+	int err;
+
 	p->state = state;
+	err = netdev_switch_port_stp_update(p->dev, state);
+	if (err && err != -EOPNOTSUPP)
+		br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
+				(unsigned int) p->port_no, p->dev->name);
 }
 
 /* called under bridge lock */
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 66973deaae56..d162b21b14bd 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -31,3 +31,22 @@ int netdev_switch_parent_id_get(struct net_device *dev,
 	return ops->ndo_switch_parent_id_get(dev, psid);
 }
 EXPORT_SYMBOL(netdev_switch_parent_id_get);
+
+/**
+ *	netdev_switch_port_stp_update - Notify switch device port of STP
+ *					state change
+ *	@dev: port device
+ *	@state: port STP state
+ *
+ *	Notify switch device port of bridge port STP state change.
+ */
+int netdev_switch_port_stp_update(struct net_device *dev, u8 state)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_switch_port_stp_update)
+		return -EOPNOTSUPP;
+	WARN_ON(!ops->ndo_switch_parent_id_get);
+	return ops->ndo_switch_port_stp_update(dev, state);
+}
+EXPORT_SYMBOL(netdev_switch_port_stp_update);
-- 
cgit v1.2.3-58-ga151


From cf6b8e1eedffd9ef9a22c0c9453d752b07daf89a Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Fri, 28 Nov 2014 14:34:21 +0100
Subject: bridge: add API to notify bridge driver of learned FBD on offloaded
 device

When the swdev device learns a new mac/vlan on a port, it sends some async
notification to the driver and the driver installs an FDB in the device.
To give a holistic system view, the learned mac/vlan should be reflected
in the bridge's FBD table, so the user, using normal iproute2 cmds, can view
what is currently learned by the device.  This API on the bridge driver gives
a way for the swdev driver to install an FBD entry in the bridge FBD table.
(And remove one).

This is equivalent to the device running these cmds:

  bridge fdb [add|del] <mac> dev <dev> vid <vlan id> master

This patch needs some extra eyeballs for review, in paricular around the
locking and contexts.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h      | 18 +++++++++
 include/uapi/linux/neighbour.h |  1 +
 net/bridge/br_fdb.c            | 91 +++++++++++++++++++++++++++++++++++++++++-
 net/bridge/br_private.h        |  3 +-
 4 files changed, 111 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 808dcb8cc04f..fa2eca625129 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -37,6 +37,24 @@ extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __use
 typedef int br_should_route_hook_t(struct sk_buff *skb);
 extern br_should_route_hook_t __rcu *br_should_route_hook;
 
+#if IS_ENABLED(CONFIG_BRIDGE)
+int br_fdb_external_learn_add(struct net_device *dev,
+			      const unsigned char *addr, u16 vid);
+int br_fdb_external_learn_del(struct net_device *dev,
+			      const unsigned char *addr, u16 vid);
+#else
+static inline int br_fdb_external_learn_add(struct net_device *dev,
+					    const unsigned char *addr, u16 vid)
+{
+	return 0;
+}
+static inline int br_fdb_external_learn_del(struct net_device *dev,
+					    const unsigned char *addr, u16 vid)
+{
+	return 0;
+}
+#endif
+
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)
 int br_multicast_list_adjacent(struct net_device *dev,
 			       struct list_head *br_ip_list);
diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index 2f043af7fcd6..f3d77f9f1e0b 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -38,6 +38,7 @@ enum {
 #define NTF_SELF	0x02
 #define NTF_MASTER	0x04
 #define NTF_PROXY	0x08	/* == ATF_PUBL */
+#define NTF_EXT_LEARNED	0x10
 #define NTF_ROUTER	0x80
 
 /*
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b1be971eb06c..cc36e59db7d7 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -481,6 +481,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
 		fdb->is_local = 0;
 		fdb->is_static = 0;
 		fdb->added_by_user = 0;
+		fdb->added_by_external_learn = 0;
 		fdb->updated = fdb->used = jiffies;
 		hlist_add_head_rcu(&fdb->hlist, head);
 	}
@@ -613,7 +614,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
 	ndm->ndm_family	 = AF_BRIDGE;
 	ndm->ndm_pad1    = 0;
 	ndm->ndm_pad2    = 0;
-	ndm->ndm_flags	 = 0;
+	ndm->ndm_flags	 = fdb->added_by_external_learn ? NTF_EXT_LEARNED : 0;
 	ndm->ndm_type	 = 0;
 	ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex;
 	ndm->ndm_state   = fdb_to_nud(fdb);
@@ -983,3 +984,91 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
 		}
 	}
 }
+
+int br_fdb_external_learn_add(struct net_device *dev,
+			      const unsigned char *addr, u16 vid)
+{
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	struct hlist_head *head;
+	struct net_bridge_fdb_entry *fdb;
+	int err = 0;
+
+	rtnl_lock();
+
+	p = br_port_get_rtnl(dev);
+	if (!p) {
+		pr_info("bridge: %s not a bridge port\n", dev->name);
+		err = -EINVAL;
+		goto err_rtnl_unlock;
+	}
+
+	br = p->br;
+
+	spin_lock_bh(&br->hash_lock);
+
+	head = &br->hash[br_mac_hash(addr, vid)];
+	fdb = fdb_find(head, addr, vid);
+	if (!fdb) {
+		fdb = fdb_create(head, p, addr, vid);
+		if (!fdb) {
+			err = -ENOMEM;
+			goto err_unlock;
+		}
+		fdb->added_by_external_learn = 1;
+		fdb_notify(br, fdb, RTM_NEWNEIGH);
+	} else if (fdb->added_by_external_learn) {
+		/* Refresh entry */
+		fdb->updated = fdb->used = jiffies;
+	} else if (!fdb->added_by_user) {
+		/* Take over SW learned entry */
+		fdb->added_by_external_learn = 1;
+		fdb->updated = jiffies;
+		fdb_notify(br, fdb, RTM_NEWNEIGH);
+	}
+
+err_unlock:
+	spin_unlock_bh(&br->hash_lock);
+err_rtnl_unlock:
+	rtnl_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(br_fdb_external_learn_add);
+
+int br_fdb_external_learn_del(struct net_device *dev,
+			      const unsigned char *addr, u16 vid)
+{
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	struct hlist_head *head;
+	struct net_bridge_fdb_entry *fdb;
+	int err = 0;
+
+	rtnl_lock();
+
+	p = br_port_get_rtnl(dev);
+	if (!p) {
+		pr_info("bridge: %s not a bridge port\n", dev->name);
+		err = -EINVAL;
+		goto err_rtnl_unlock;
+	}
+
+	br = p->br;
+
+	spin_lock_bh(&br->hash_lock);
+
+	head = &br->hash[br_mac_hash(addr, vid)];
+	fdb = fdb_find(head, addr, vid);
+	if (fdb && fdb->added_by_external_learn)
+		fdb_delete(br, fdb);
+	else
+		err = -ENOENT;
+
+	spin_unlock_bh(&br->hash_lock);
+err_rtnl_unlock:
+	rtnl_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(br_fdb_external_learn_del);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1b529da8234d..cc36fb3efbdd 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -100,7 +100,8 @@ struct net_bridge_fdb_entry
 	mac_addr			addr;
 	unsigned char			is_local:1,
 					is_static:1,
-					added_by_user:1;
+					added_by_user:1,
+					added_by_external_learn:1;
 	__u16				vlan_id;
 };
 
-- 
cgit v1.2.3-58-ga151


From 065c212a9e25172069f368b36228379521dadb65 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Fri, 28 Nov 2014 14:34:22 +0100
Subject: bridge: move private brport flags to if_bridge.h so port drivers can
 use flags

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h | 12 ++++++++++++
 net/bridge/br_private.h   | 10 ----------
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index fa2eca625129..2c81a8efd24d 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -15,6 +15,7 @@
 
 #include <linux/netdevice.h>
 #include <uapi/linux/if_bridge.h>
+#include <linux/bitops.h>
 
 struct br_ip {
 	union {
@@ -32,6 +33,17 @@ struct br_ip_list {
 	struct br_ip addr;
 };
 
+#define BR_HAIRPIN_MODE		BIT(0)
+#define BR_BPDU_GUARD		BIT(1)
+#define BR_ROOT_BLOCK		BIT(2)
+#define BR_MULTICAST_FAST_LEAVE	BIT(3)
+#define BR_ADMIN_COST		BIT(4)
+#define BR_LEARNING		BIT(5)
+#define BR_FLOOD		BIT(6)
+#define BR_AUTO_MASK		(BR_FLOOD | BR_LEARNING)
+#define BR_PROMISC		BIT(7)
+#define BR_PROXYARP		BIT(8)
+
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
 
 typedef int br_should_route_hook_t(struct sk_buff *skb);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index cc36fb3efbdd..aea3d1339b3f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -164,16 +164,6 @@ struct net_bridge_port
 	struct rcu_head			rcu;
 
 	unsigned long 			flags;
-#define BR_HAIRPIN_MODE		0x00000001
-#define BR_BPDU_GUARD           0x00000002
-#define BR_ROOT_BLOCK		0x00000004
-#define BR_MULTICAST_FAST_LEAVE	0x00000008
-#define BR_ADMIN_COST		0x00000010
-#define BR_LEARNING		0x00000020
-#define BR_FLOOD		0x00000040
-#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING)
-#define BR_PROMISC		0x00000080
-#define BR_PROXYARP		0x00000100
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	struct bridge_mcast_own_query	ip4_own_query;
-- 
cgit v1.2.3-58-ga151


From efacacdaf7cb5a0592ed772e3731636b2742e34a Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Fri, 28 Nov 2014 14:34:23 +0100
Subject: bridge: add new brport flag LEARNING_SYNC

This policy flag controls syncing of learned FDB entries to bridge's FDB.  If
on, FDB entries learned on bridge port device will be synced.  If off, device
may still learn new FDB entries but they will not be synced with bridge's FDB.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h    | 1 +
 include/uapi/linux/if_link.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 2c81a8efd24d..0a8ce762a47f 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -43,6 +43,7 @@ struct br_ip_list {
 #define BR_AUTO_MASK		(BR_FLOOD | BR_LEARNING)
 #define BR_PROMISC		BIT(7)
 #define BR_PROXYARP		BIT(8)
+#define BR_LEARNING_SYNC	BIT(9)
 
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 623f1a7c7627..f7d0d2d7173a 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -245,6 +245,7 @@ enum {
 	IFLA_BRPORT_LEARNING,	/* mac learning */
 	IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */
 	IFLA_BRPORT_PROXYARP,	/* proxy ARP */
+	IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
-- 
cgit v1.2.3-58-ga151


From 2c3c031c8f8930861815fa1685d7c5e8ccec047c Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Fri, 28 Nov 2014 14:34:25 +0100
Subject: bridge: add brport flags to dflt bridge_getlink

To allow brport device to return current brport flags set on port.  Add
returned flags to nested IFLA_PROTINFO netlink msg built in dflt getlink.
With this change, netlink msg returned for bridge_getlink contains the port's
offloaded flag settings (the port's SELF settings).

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/emulex/benet/be_main.c   |  3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  2 +-
 include/linux/rtnetlink.h                     |  3 ++-
 net/core/rtnetlink.c                          | 39 ++++++++++++++++++++++++++-
 4 files changed, 43 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index dc77ec2bdafd..e0ab7673afe7 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -4367,7 +4367,8 @@ static int be_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 
 	return ndo_dflt_bridge_getlink(skb, pid, seq, dev,
 				       hsw_mode == PORT_FWD_TYPE_VEPA ?
-				       BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB);
+				       BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB,
+				       0, 0);
 }
 
 #ifdef CONFIG_BE2NET_VXLAN
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 74aadeb56ada..9afa167d52a6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7778,7 +7778,7 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	else
 		mode = BRIDGE_MODE_VEPA;
 
-	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode);
+	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, 0, 0);
 }
 
 static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 063f0f581fe0..3b0419072f88 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -103,5 +103,6 @@ extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
 			    u16 vid);
 
 extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
-				   struct net_device *dev, u16 mode);
+				   struct net_device *dev, u16 mode,
+				   u32 flags, u32 mask);
 #endif	/* __LINUX_RTNETLINK_H */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index aa5cd2c53a56..61cb7e7cc3c7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2687,12 +2687,22 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
+static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
+			       unsigned int attrnum, unsigned int flag)
+{
+	if (mask & flag)
+		return nla_put_u8(skb, attrnum, !!(flags & flag));
+	return 0;
+}
+
 int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
-			    struct net_device *dev, u16 mode)
+			    struct net_device *dev, u16 mode,
+			    u32 flags, u32 mask)
 {
 	struct nlmsghdr *nlh;
 	struct ifinfomsg *ifm;
 	struct nlattr *br_afspec;
+	struct nlattr *protinfo;
 	u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
 	struct net_device *br_dev = netdev_master_upper_dev_get(dev);
 
@@ -2731,6 +2741,33 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	}
 	nla_nest_end(skb, br_afspec);
 
+	protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
+	if (!protinfo)
+		goto nla_put_failure;
+
+	if (brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
+	    brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
+	    brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_FAST_LEAVE,
+				BR_MULTICAST_FAST_LEAVE) ||
+	    brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
+	    brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_LEARNING, BR_LEARNING) ||
+	    brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
+	    brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
+	    brport_nla_put_flag(skb, flags, mask,
+				IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+		nla_nest_cancel(skb, protinfo);
+		goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, protinfo);
+
 	return nlmsg_end(skb, nlh);
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
-- 
cgit v1.2.3-58-ga151


From 89aa075832b0da4402acebd698d0411dcc82d03e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Mon, 1 Dec 2014 15:06:35 -0800
Subject: net: sock: allow eBPF programs to be attached to sockets

introduce new setsockopt() command:

setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))

where prog_fd was received from syscall bpf(BPF_PROG_LOAD, attr, ...)
and attr->prog_type == BPF_PROG_TYPE_SOCKET_FILTER

setsockopt() calls bpf_prog_get() which increments refcnt of the program,
so it doesn't get unloaded while socket is using the program.

The same eBPF program can be attached to multiple sockets.

User task exit automatically closes socket which calls sk_filter_uncharge()
which decrements refcnt of eBPF program

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/uapi/asm/socket.h   |  3 ++
 arch/avr32/include/uapi/asm/socket.h   |  3 ++
 arch/cris/include/uapi/asm/socket.h    |  3 ++
 arch/frv/include/uapi/asm/socket.h     |  3 ++
 arch/ia64/include/uapi/asm/socket.h    |  3 ++
 arch/m32r/include/uapi/asm/socket.h    |  3 ++
 arch/mips/include/uapi/asm/socket.h    |  3 ++
 arch/mn10300/include/uapi/asm/socket.h |  3 ++
 arch/parisc/include/uapi/asm/socket.h  |  3 ++
 arch/powerpc/include/uapi/asm/socket.h |  3 ++
 arch/s390/include/uapi/asm/socket.h    |  3 ++
 arch/sparc/include/uapi/asm/socket.h   |  3 ++
 arch/xtensa/include/uapi/asm/socket.h  |  3 ++
 include/linux/bpf.h                    |  4 ++
 include/linux/filter.h                 |  1 +
 include/uapi/asm-generic/socket.h      |  3 ++
 net/core/filter.c                      | 97 +++++++++++++++++++++++++++++++++-
 net/core/sock.c                        | 13 +++++
 18 files changed, 155 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index e2fe0700b3b4..9a20821b111c 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -89,4 +89,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 92121b0f5b98..2b65ed6b277c 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -82,4 +82,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
index 60f60f5b9b35..e2503d9f1869 100644
--- a/arch/cris/include/uapi/asm/socket.h
+++ b/arch/cris/include/uapi/asm/socket.h
@@ -84,6 +84,9 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _ASM_SOCKET_H */
 
 
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index 2c6890209ea6..4823ad125578 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -82,5 +82,8 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 09a93fb566f6..59be3d87f86d 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -91,4 +91,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index e8589819c274..7bc4cb273856 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -82,4 +82,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 2e9ee8c55a10..dec3c850f36b 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -100,4 +100,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index f3492e8c9f70..cab7d6d50051 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -82,4 +82,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 7984a1cab3da..a5cd40cd8ee1 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -81,4 +81,7 @@
 
 #define SO_INCOMING_CPU		0x402A
 
+#define SO_ATTACH_BPF		0x402B
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 3474e4ef166d..c046666038f8 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -89,4 +89,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 8457636c33e1..296942d56e6a 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -88,4 +88,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 4a8003a94163..e6a16c40be5f 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -78,6 +78,9 @@
 
 #define SO_INCOMING_CPU		0x0033
 
+#define SO_ATTACH_BPF		0x0034
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index c46f6a696849..4120af086160 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -93,4 +93,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 75e94eaa228b..bbfceb756452 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -128,7 +128,11 @@ struct bpf_prog_aux {
 	struct work_struct work;
 };
 
+#ifdef CONFIG_BPF_SYSCALL
 void bpf_prog_put(struct bpf_prog *prog);
+#else
+static inline void bpf_prog_put(struct bpf_prog *prog) {}
+#endif
 struct bpf_prog *bpf_prog_get(u32 ufd);
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ca95abd2bed1..caac2087a4d5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -381,6 +381,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
 void bpf_prog_destroy(struct bpf_prog *fp);
 
 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
+int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_detach_filter(struct sock *sk);
 
 int bpf_check_classic(const struct sock_filter *filter, unsigned int flen);
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index f541ccefd4ac..5c15c2a5c123 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -84,4 +84,7 @@
 
 #define SO_INCOMING_CPU		49
 
+#define SO_ATTACH_BPF		50
+#define SO_DETACH_BPF		SO_DETACH_FILTER
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/filter.c b/net/core/filter.c
index 647b12265e18..8cc3c03078b3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -44,6 +44,7 @@
 #include <linux/ratelimit.h>
 #include <linux/seccomp.h>
 #include <linux/if_vlan.h>
+#include <linux/bpf.h>
 
 /**
  *	sk_filter - run a packet through a socket filter
@@ -813,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
 
 static void __bpf_prog_release(struct bpf_prog *prog)
 {
-	bpf_release_orig_filter(prog);
-	bpf_prog_free(prog);
+	if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
+		bpf_prog_put(prog);
+	} else {
+		bpf_release_orig_filter(prog);
+		bpf_prog_free(prog);
+	}
 }
 
 static void __sk_filter_release(struct sk_filter *fp)
@@ -1088,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_attach_filter);
 
+#ifdef CONFIG_BPF_SYSCALL
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+	struct sk_filter *fp, *old_fp;
+	struct bpf_prog *prog;
+
+	if (sock_flag(sk, SOCK_FILTER_LOCKED))
+		return -EPERM;
+
+	prog = bpf_prog_get(ufd);
+	if (!prog)
+		return -EINVAL;
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+		/* valid fd, but invalid program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	if (!fp) {
+		bpf_prog_put(prog);
+		return -ENOMEM;
+	}
+	fp->prog = prog;
+
+	atomic_set(&fp->refcnt, 0);
+
+	if (!sk_filter_charge(sk, fp)) {
+		__sk_filter_release(fp);
+		return -ENOMEM;
+	}
+
+	old_fp = rcu_dereference_protected(sk->sk_filter,
+					   sock_owned_by_user(sk));
+	rcu_assign_pointer(sk->sk_filter, fp);
+
+	if (old_fp)
+		sk_filter_uncharge(sk, old_fp);
+
+	return 0;
+}
+
+/* allow socket filters to call
+ * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
+ */
+static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		return NULL;
+	}
+}
+
+static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* skb fields cannot be accessed yet */
+	return false;
+}
+
+static struct bpf_verifier_ops sock_filter_ops = {
+	.get_func_proto = sock_filter_func_proto,
+	.is_valid_access = sock_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+	.ops = &sock_filter_ops,
+	.type = BPF_PROG_TYPE_SOCKET_FILTER,
+};
+
+static int __init register_sock_filter_ops(void)
+{
+	bpf_register_prog_type(&tl);
+	return 0;
+}
+late_initcall(register_sock_filter_ops);
+#else
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+	return -EOPNOTSUPP;
+}
+#endif
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
diff --git a/net/core/sock.c b/net/core/sock.c
index 0725cf0cb685..9a56b2000c3f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -888,6 +888,19 @@ set_rcvbuf:
 		}
 		break;
 
+	case SO_ATTACH_BPF:
+		ret = -EINVAL;
+		if (optlen == sizeof(u32)) {
+			u32 ufd;
+
+			ret = -EFAULT;
+			if (copy_from_user(&ufd, optval, sizeof(ufd)))
+				break;
+
+			ret = sk_attach_bpf(ufd, sk);
+		}
+		break;
+
 	case SO_DETACH_FILTER:
 		ret = sk_detach_filter(sk);
 		break;
-- 
cgit v1.2.3-58-ga151


From bac78aabcfece0c493b2ad824c68fbdc20448cbc Mon Sep 17 00:00:00 2001
From: Andri Yngvason <andri.yngvason@marel.com>
Date: Wed, 3 Dec 2014 17:54:13 +0000
Subject: can: dev: Consolidate and unify state change handling

The handling of can error states is different between platforms.
This is an attempt to correct that problem.

I've moved this handling into a generic function for changing the
error state. This ensures that error state changes are handled
the same way everywhere (where this function is used).

This new mechanism also adds reverse state transitioning in error
frames, i.e. the user will be notified through the socket interface
when the state goes down.

Signed-off-by: Andri Yngvason <andri.yngvason@marel.com>
Acked-by: Wolfgang Grandegger <wg@grandegger.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c          | 78 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/can/dev.h        |  3 ++
 include/uapi/linux/can/error.h |  1 +
 3 files changed, 82 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 2cfe5012e4e5..3ec8f6f25e5f 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -273,6 +273,84 @@ static int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt,
 	return err;
 }
 
+static void can_update_state_error_stats(struct net_device *dev,
+					 enum can_state new_state)
+{
+	struct can_priv *priv = netdev_priv(dev);
+
+	if (new_state <= priv->state)
+		return;
+
+	switch (new_state) {
+	case CAN_STATE_ERROR_WARNING:
+		priv->can_stats.error_warning++;
+		break;
+	case CAN_STATE_ERROR_PASSIVE:
+		priv->can_stats.error_passive++;
+		break;
+	case CAN_STATE_BUS_OFF:
+	default:
+		break;
+	};
+}
+
+static int can_tx_state_to_frame(struct net_device *dev, enum can_state state)
+{
+	switch (state) {
+	case CAN_STATE_ERROR_ACTIVE:
+		return CAN_ERR_CRTL_ACTIVE;
+	case CAN_STATE_ERROR_WARNING:
+		return CAN_ERR_CRTL_TX_WARNING;
+	case CAN_STATE_ERROR_PASSIVE:
+		return CAN_ERR_CRTL_TX_PASSIVE;
+	default:
+		return 0;
+	}
+}
+
+static int can_rx_state_to_frame(struct net_device *dev, enum can_state state)
+{
+	switch (state) {
+	case CAN_STATE_ERROR_ACTIVE:
+		return CAN_ERR_CRTL_ACTIVE;
+	case CAN_STATE_ERROR_WARNING:
+		return CAN_ERR_CRTL_RX_WARNING;
+	case CAN_STATE_ERROR_PASSIVE:
+		return CAN_ERR_CRTL_RX_PASSIVE;
+	default:
+		return 0;
+	}
+}
+
+void can_change_state(struct net_device *dev, struct can_frame *cf,
+		      enum can_state tx_state, enum can_state rx_state)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	enum can_state new_state = max(tx_state, rx_state);
+
+	if (unlikely(new_state == priv->state)) {
+		netdev_warn(dev, "%s: oops, state did not change", __func__);
+		return;
+	}
+
+	netdev_dbg(dev, "New error state: %d\n", new_state);
+
+	can_update_state_error_stats(dev, new_state);
+	priv->state = new_state;
+
+	if (unlikely(new_state == CAN_STATE_BUS_OFF)) {
+		cf->can_id |= CAN_ERR_BUSOFF;
+		return;
+	}
+
+	cf->can_id |= CAN_ERR_CRTL;
+	cf->data[1] |= tx_state >= rx_state ?
+		       can_tx_state_to_frame(dev, tx_state) : 0;
+	cf->data[1] |= tx_state <= rx_state ?
+		       can_rx_state_to_frame(dev, rx_state) : 0;
+}
+EXPORT_SYMBOL_GPL(can_change_state);
+
 /*
  * Local echo of CAN messages
  *
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index b37ea95bc348..c05ff0f9f9a5 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -127,6 +127,9 @@ void unregister_candev(struct net_device *dev);
 int can_restart_now(struct net_device *dev);
 void can_bus_off(struct net_device *dev);
 
+void can_change_state(struct net_device *dev, struct can_frame *cf,
+		      enum can_state tx_state, enum can_state rx_state);
+
 void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 		      unsigned int idx);
 unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h
index c247446ab25a..1c508be9687f 100644
--- a/include/uapi/linux/can/error.h
+++ b/include/uapi/linux/can/error.h
@@ -71,6 +71,7 @@
 #define CAN_ERR_CRTL_TX_PASSIVE  0x20 /* reached error passive status TX */
 				      /* (at least one error counter exceeds */
 				      /* the protocol-defined level of 127)  */
+#define CAN_ERR_CRTL_ACTIVE      0x40 /* recovered to error active state */
 
 /* error in CAN protocol (type) / data[2] */
 #define CAN_ERR_PROT_UNSPEC      0x00 /* unspecified */
-- 
cgit v1.2.3-58-ga151


From c3582a2c4d0baf1fa3955c8b3d3d61308df474c7 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Mon, 1 Dec 2014 13:28:39 -0800
Subject: hyperv: Add support for vNIC hot removal

This patch adds proper handling of the vNIC hot removal event, which includes
a rescind-channel-offer message from the host side that triggers vNIC close and
removal. In this case, the notices to the host during close and removal is not
necessary because the channel is rescinded. This patch blocks these unnecessary
messages, and lets vNIC removal process complete normally.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/hv/channel_mgmt.c         | 2 ++
 drivers/net/hyperv/netvsc.c       | 3 +++
 drivers/net/hyperv/rndis_filter.c | 3 +++
 include/linux/hyperv.h            | 2 ++
 4 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index a2d1a9612c86..191a6a3ae6ca 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -517,6 +517,8 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 		/* Just return here, no channel found */
 		return;
 
+	channel->rescind = true;
+
 	/* work is initialized for vmbus_process_rescind_offer() from
 	 * vmbus_process_offer() where the channel got created */
 	queue_work(channel->controlwq, &channel->work);
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 6fc834e4306d..dd867e6cabd6 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -764,6 +764,9 @@ int netvsc_send(struct hv_device *device,
 		out_channel = device->channel;
 	packet->channel = out_channel;
 
+	if (out_channel->rescind)
+		return -ENODEV;
+
 	if (packet->page_buf_cnt) {
 		ret = vmbus_sendpacket_pagebuffer(out_channel,
 						  packet->page_buf,
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 7b2c5d1e9bad..ec0c40a8f653 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -958,6 +958,9 @@ static int rndis_filter_close_device(struct rndis_device *dev)
 		return 0;
 
 	ret = rndis_filter_set_packet_filter(dev, 0);
+	if (ret == -ENODEV)
+		ret = 0;
+
 	if (ret == 0)
 		dev->state = RNDIS_DEV_INITIALIZED;
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 08cfaff8a072..476c685ca6f9 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -650,6 +650,8 @@ struct vmbus_channel {
 	u8 monitor_grp;
 	u8 monitor_bit;
 
+	bool rescind; /* got rescind msg */
+
 	u32 ringbuffer_gpadlhandle;
 
 	/* Allocated memory for ring buffer */
-- 
cgit v1.2.3-58-ga151


From b0ba512e25d729a43858ad1f6cb8b94dbb95dbeb Mon Sep 17 00:00:00 2001
From: Petri Gynther <pgynther@google.com>
Date: Mon, 1 Dec 2014 16:18:08 -0800
Subject: net: bcmgenet: enable driver to work without a device tree

Modify bcmgenet driver so that it can be used on Broadcom 7xxx
MIPS-based STB platforms without a device tree.

Signed-off-by: Petri Gynther <pgynther@google.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/Kconfig          |   1 -
 drivers/net/ethernet/broadcom/genet/bcmgenet.c |  31 ++++--
 drivers/net/ethernet/broadcom/genet/bcmmii.c   | 126 ++++++++++++++++++++-----
 include/linux/platform_data/bcmgenet.h         |  18 ++++
 4 files changed, 143 insertions(+), 33 deletions(-)
 create mode 100644 include/linux/platform_data/bcmgenet.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index c3e260c21734..888247ad9068 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -62,7 +62,6 @@ config BCM63XX_ENET
 
 config BCMGENET
 	tristate "Broadcom GENET internal MAC support"
-	depends on OF
 	select MII
 	select PHYLIB
 	select FIXED_PHY if BCMGENET=y
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index f2fadb053d52..adfef5ca0d55 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -42,6 +42,7 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/phy.h>
+#include <linux/platform_data/bcmgenet.h>
 
 #include <asm/unaligned.h>
 
@@ -2586,8 +2587,9 @@ static const struct of_device_id bcmgenet_match[] = {
 
 static int bcmgenet_probe(struct platform_device *pdev)
 {
+	struct bcmgenet_platform_data *pd = pdev->dev.platform_data;
 	struct device_node *dn = pdev->dev.of_node;
-	const struct of_device_id *of_id;
+	const struct of_device_id *of_id = NULL;
 	struct bcmgenet_priv *priv;
 	struct net_device *dev;
 	const void *macaddr;
@@ -2601,9 +2603,11 @@ static int bcmgenet_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
-	of_id = of_match_node(bcmgenet_match, dn);
-	if (!of_id)
-		return -EINVAL;
+	if (dn) {
+		of_id = of_match_node(bcmgenet_match, dn);
+		if (!of_id)
+			return -EINVAL;
+	}
 
 	priv = netdev_priv(dev);
 	priv->irq0 = platform_get_irq(pdev, 0);
@@ -2615,11 +2619,15 @@ static int bcmgenet_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	macaddr = of_get_mac_address(dn);
-	if (!macaddr) {
-		dev_err(&pdev->dev, "can't find MAC address\n");
-		err = -EINVAL;
-		goto err;
+	if (dn) {
+		macaddr = of_get_mac_address(dn);
+		if (!macaddr) {
+			dev_err(&pdev->dev, "can't find MAC address\n");
+			err = -EINVAL;
+			goto err;
+		}
+	} else {
+		macaddr = pd->mac_address;
 	}
 
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -2659,7 +2667,10 @@ static int bcmgenet_probe(struct platform_device *pdev)
 
 	priv->dev = dev;
 	priv->pdev = pdev;
-	priv->version = (enum bcmgenet_version)of_id->data;
+	if (of_id)
+		priv->version = (enum bcmgenet_version)of_id->data;
+	else
+		priv->version = pd->genet_version;
 
 	priv->clk = devm_clk_get(&priv->pdev->dev, "enet");
 	if (IS_ERR(priv->clk))
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 933cd7e7cd33..446889cc3c6a 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -23,6 +23,7 @@
 #include <linux/of.h>
 #include <linux/of_net.h>
 #include <linux/of_mdio.h>
+#include <linux/platform_data/bcmgenet.h>
 
 #include "bcmgenet.h"
 
@@ -312,22 +313,6 @@ static int bcmgenet_mii_probe(struct net_device *dev)
 	u32 phy_flags;
 	int ret;
 
-	if (priv->phydev) {
-		pr_info("PHY already attached\n");
-		return 0;
-	}
-
-	/* In the case of a fixed PHY, the DT node associated
-	 * to the PHY is the Ethernet MAC DT node.
-	 */
-	if (!priv->phy_dn && of_phy_is_fixed_link(dn)) {
-		ret = of_phy_register_fixed_link(dn);
-		if (ret)
-			return ret;
-
-		priv->phy_dn = of_node_get(dn);
-	}
-
 	/* Communicate the integrated PHY revision */
 	phy_flags = priv->gphy_rev;
 
@@ -337,11 +322,39 @@ static int bcmgenet_mii_probe(struct net_device *dev)
 	priv->old_duplex = -1;
 	priv->old_pause = -1;
 
-	phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup,
-				phy_flags, priv->phy_interface);
-	if (!phydev) {
-		pr_err("could not attach to PHY\n");
-		return -ENODEV;
+	if (dn) {
+		if (priv->phydev) {
+			pr_info("PHY already attached\n");
+			return 0;
+		}
+
+		/* In the case of a fixed PHY, the DT node associated
+		 * to the PHY is the Ethernet MAC DT node.
+		 */
+		if (!priv->phy_dn && of_phy_is_fixed_link(dn)) {
+			ret = of_phy_register_fixed_link(dn);
+			if (ret)
+				return ret;
+
+			priv->phy_dn = of_node_get(dn);
+		}
+
+		phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup,
+					phy_flags, priv->phy_interface);
+		if (!phydev) {
+			pr_err("could not attach to PHY\n");
+			return -ENODEV;
+		}
+	} else {
+		phydev = priv->phydev;
+		phydev->dev_flags = phy_flags;
+
+		ret = phy_connect_direct(dev, phydev, bcmgenet_mii_setup,
+					 priv->phy_interface);
+		if (ret) {
+			pr_err("could not attach to PHY\n");
+			return -ENODEV;
+		}
 	}
 
 	priv->phydev = phydev;
@@ -438,6 +451,75 @@ static int bcmgenet_mii_of_init(struct bcmgenet_priv *priv)
 	return 0;
 }
 
+static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv)
+{
+	struct device *kdev = &priv->pdev->dev;
+	struct bcmgenet_platform_data *pd = kdev->platform_data;
+	struct mii_bus *mdio = priv->mii_bus;
+	struct phy_device *phydev;
+	int ret;
+
+	if (pd->phy_interface != PHY_INTERFACE_MODE_MOCA && pd->mdio_enabled) {
+		/*
+		 * Internal or external PHY with MDIO access
+		 */
+		if (pd->phy_address >= 0 && pd->phy_address < PHY_MAX_ADDR)
+			mdio->phy_mask = ~(1 << pd->phy_address);
+		else
+			mdio->phy_mask = 0;
+
+		ret = mdiobus_register(mdio);
+		if (ret) {
+			dev_err(kdev, "failed to register MDIO bus\n");
+			return ret;
+		}
+
+		if (pd->phy_address >= 0 && pd->phy_address < PHY_MAX_ADDR)
+			phydev = mdio->phy_map[pd->phy_address];
+		else
+			phydev = phy_find_first(mdio);
+
+		if (!phydev) {
+			dev_err(kdev, "failed to register PHY device\n");
+			mdiobus_unregister(mdio);
+			return -ENODEV;
+		}
+	} else {
+		/*
+		 * MoCA port or no MDIO access.
+		 * Use fixed PHY to represent the link layer.
+		 */
+		struct fixed_phy_status fphy_status = {
+			.link = 1,
+			.speed = pd->phy_speed,
+			.duplex = pd->phy_duplex,
+			.pause = 0,
+			.asym_pause = 0,
+		};
+
+		phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL);
+		if (!phydev || IS_ERR(phydev)) {
+			dev_err(kdev, "failed to register fixed PHY device\n");
+			return -ENODEV;
+		}
+	}
+
+	priv->phydev = phydev;
+	priv->phy_interface = pd->phy_interface;
+
+	return 0;
+}
+
+static int bcmgenet_mii_bus_init(struct bcmgenet_priv *priv)
+{
+	struct device_node *dn = priv->pdev->dev.of_node;
+
+	if (dn)
+		return bcmgenet_mii_of_init(priv);
+	else
+		return bcmgenet_mii_pd_init(priv);
+}
+
 int bcmgenet_mii_init(struct net_device *dev)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
@@ -447,7 +529,7 @@ int bcmgenet_mii_init(struct net_device *dev)
 	if (ret)
 		return ret;
 
-	ret = bcmgenet_mii_of_init(priv);
+	ret = bcmgenet_mii_bus_init(priv);
 	if (ret)
 		goto out_free;
 
diff --git a/include/linux/platform_data/bcmgenet.h b/include/linux/platform_data/bcmgenet.h
new file mode 100644
index 000000000000..26af54321958
--- /dev/null
+++ b/include/linux/platform_data/bcmgenet.h
@@ -0,0 +1,18 @@
+#ifndef __LINUX_PLATFORM_DATA_BCMGENET_H__
+#define __LINUX_PLATFORM_DATA_BCMGENET_H__
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+#include <linux/phy.h>
+
+struct bcmgenet_platform_data {
+	bool		mdio_enabled;
+	phy_interface_t	phy_interface;
+	int		phy_address;
+	int		phy_speed;
+	int		phy_duplex;
+	u8		mac_address[ETH_ALEN];
+	int		genet_version;
+};
+
+#endif
-- 
cgit v1.2.3-58-ga151


From 97ede29e80eead50d8bd533cf163401b88c027be Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Tue, 2 Dec 2014 15:00:30 +0800
Subject: tipc: convert name table read-write lock to RCU

Convert tipc name table read-write lock to RCU. After this change,
a new spin lock is used to protect name table on write side while
RCU is applied on read side.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Tested-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rculist.h |  9 +++++
 net/tipc/name_distr.c   | 28 ++++++++--------
 net/tipc/name_table.c   | 87 ++++++++++++++++++++++++-------------------------
 net/tipc/name_table.h   |  4 ++-
 4 files changed, 69 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 372ad5e0dcb8..aa79b3c24f66 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -542,6 +542,15 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
 	     pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\
 			typeof(*(pos)), member))
 
+/**
+ * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
+ * @pos:	the type * to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from_rcu(pos, member)			\
+	for (; pos;							\
+	     pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\
+			typeof(*(pos)), member))
 
 #endif	/* __KERNEL__ */
 #endif
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index ed00929f16c8..ba6083dca95b 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -113,8 +113,8 @@ struct sk_buff *tipc_named_publish(struct publication *publ)
 	struct sk_buff *buf;
 	struct distr_item *item;
 
-	list_add_tail(&publ->local_list,
-		      &tipc_nametbl->publ_list[publ->scope]);
+	list_add_tail_rcu(&publ->local_list,
+			  &tipc_nametbl->publ_list[publ->scope]);
 
 	if (publ->scope == TIPC_NODE_SCOPE)
 		return NULL;
@@ -208,12 +208,12 @@ void tipc_named_node_up(u32 dnode)
 
 	__skb_queue_head_init(&head);
 
-	read_lock_bh(&tipc_nametbl_lock);
+	rcu_read_lock();
 	named_distribute(&head, dnode,
 			 &tipc_nametbl->publ_list[TIPC_CLUSTER_SCOPE]);
 	named_distribute(&head, dnode,
 			 &tipc_nametbl->publ_list[TIPC_ZONE_SCOPE]);
-	read_unlock_bh(&tipc_nametbl_lock);
+	rcu_read_unlock();
 
 	tipc_link_xmit(&head, dnode, dnode);
 }
@@ -260,12 +260,12 @@ static void tipc_publ_purge(struct publication *publ, u32 addr)
 {
 	struct publication *p;
 
-	write_lock_bh(&tipc_nametbl_lock);
+	spin_lock_bh(&tipc_nametbl_lock);
 	p = tipc_nametbl_remove_publ(publ->type, publ->lower,
 				     publ->node, publ->ref, publ->key);
 	if (p)
 		tipc_publ_unsubscribe(p, addr);
-	write_unlock_bh(&tipc_nametbl_lock);
+	spin_unlock_bh(&tipc_nametbl_lock);
 
 	if (p != publ) {
 		pr_err("Unable to remove publication from failed node\n"
@@ -274,7 +274,7 @@ static void tipc_publ_purge(struct publication *publ, u32 addr)
 		       publ->key);
 	}
 
-	kfree(p);
+	kfree_rcu(p, rcu);
 }
 
 void tipc_publ_notify(struct list_head *nsub_list, u32 addr)
@@ -311,7 +311,7 @@ static bool tipc_update_nametbl(struct distr_item *i, u32 node, u32 dtype)
 						ntohl(i->key));
 		if (publ) {
 			tipc_publ_unsubscribe(publ, node);
-			kfree(publ);
+			kfree_rcu(publ, rcu);
 			return true;
 		}
 	} else {
@@ -376,14 +376,14 @@ void tipc_named_rcv(struct sk_buff *buf)
 	u32 count = msg_data_sz(msg) / ITEM_SIZE;
 	u32 node = msg_orignode(msg);
 
-	write_lock_bh(&tipc_nametbl_lock);
+	spin_lock_bh(&tipc_nametbl_lock);
 	while (count--) {
 		if (!tipc_update_nametbl(item, node, msg_type(msg)))
 			tipc_named_add_backlog(item, msg_type(msg), node);
 		item++;
 	}
 	tipc_named_process_backlog();
-	write_unlock_bh(&tipc_nametbl_lock);
+	spin_unlock_bh(&tipc_nametbl_lock);
 	kfree_skb(buf);
 }
 
@@ -399,12 +399,12 @@ void tipc_named_reinit(void)
 	struct publication *publ;
 	int scope;
 
-	write_lock_bh(&tipc_nametbl_lock);
+	spin_lock_bh(&tipc_nametbl_lock);
 
 	for (scope = TIPC_ZONE_SCOPE; scope <= TIPC_NODE_SCOPE; scope++)
-		list_for_each_entry(publ, &tipc_nametbl->publ_list[scope],
-				    local_list)
+		list_for_each_entry_rcu(publ, &tipc_nametbl->publ_list[scope],
+					local_list)
 			publ->node = tipc_own_addr;
 
-	write_unlock_bh(&tipc_nametbl_lock);
+	spin_unlock_bh(&tipc_nametbl_lock);
 }
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 3c2e0c300fe2..aafa684c4db9 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -92,6 +92,7 @@ struct sub_seq {
  * @ns_list: links to adjacent name sequences in hash chain
  * @subscriptions: list of subscriptions for this 'type'
  * @lock: spinlock controlling access to publication lists of all sub-sequences
+ * @rcu: RCU callback head used for deferred freeing
  */
 struct name_seq {
 	u32 type;
@@ -101,10 +102,11 @@ struct name_seq {
 	struct hlist_node ns_list;
 	struct list_head subscriptions;
 	spinlock_t lock;
+	struct rcu_head rcu;
 };
 
 struct name_table *tipc_nametbl;
-DEFINE_RWLOCK(tipc_nametbl_lock);
+DEFINE_SPINLOCK(tipc_nametbl_lock);
 
 static int hash(int x)
 {
@@ -166,7 +168,7 @@ static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_hea
 	nseq->alloc = 1;
 	INIT_HLIST_NODE(&nseq->ns_list);
 	INIT_LIST_HEAD(&nseq->subscriptions);
-	hlist_add_head(&nseq->ns_list, seq_head);
+	hlist_add_head_rcu(&nseq->ns_list, seq_head);
 	return nseq;
 }
 
@@ -451,7 +453,7 @@ static struct name_seq *nametbl_find_seq(u32 type)
 	struct name_seq *ns;
 
 	seq_head = &tipc_nametbl->seq_hlist[hash(type)];
-	hlist_for_each_entry(ns, seq_head, ns_list) {
+	hlist_for_each_entry_rcu(ns, seq_head, ns_list) {
 		if (ns->type == type)
 			return ns;
 	}
@@ -498,10 +500,10 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower,
 	spin_lock_bh(&seq->lock);
 	publ = tipc_nameseq_remove_publ(seq, lower, node, ref, key);
 	if (!seq->first_free && list_empty(&seq->subscriptions)) {
-		hlist_del_init(&seq->ns_list);
-		spin_unlock_bh(&seq->lock);
+		hlist_del_init_rcu(&seq->ns_list);
 		kfree(seq->sseqs);
-		kfree(seq);
+		spin_unlock_bh(&seq->lock);
+		kfree_rcu(seq, rcu);
 		return publ;
 	}
 	spin_unlock_bh(&seq->lock);
@@ -533,7 +535,7 @@ u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *destnode)
 	if (!tipc_in_scope(*destnode, tipc_own_addr))
 		return 0;
 
-	read_lock_bh(&tipc_nametbl_lock);
+	rcu_read_lock();
 	seq = nametbl_find_seq(type);
 	if (unlikely(!seq))
 		goto not_found;
@@ -590,7 +592,7 @@ u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *destnode)
 no_match:
 	spin_unlock_bh(&seq->lock);
 not_found:
-	read_unlock_bh(&tipc_nametbl_lock);
+	rcu_read_unlock();
 	*destnode = node;
 	return ref;
 }
@@ -616,7 +618,7 @@ int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit,
 	struct name_info *info;
 	int res = 0;
 
-	read_lock_bh(&tipc_nametbl_lock);
+	rcu_read_lock();
 	seq = nametbl_find_seq(type);
 	if (!seq)
 		goto exit;
@@ -641,7 +643,7 @@ int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit,
 	}
 	spin_unlock_bh(&seq->lock);
 exit:
-	read_unlock_bh(&tipc_nametbl_lock);
+	rcu_read_unlock();
 	return res;
 }
 
@@ -654,11 +656,11 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper,
 	struct publication *publ;
 	struct sk_buff *buf = NULL;
 
-	write_lock_bh(&tipc_nametbl_lock);
+	spin_lock_bh(&tipc_nametbl_lock);
 	if (tipc_nametbl->local_publ_count >= TIPC_MAX_PUBLICATIONS) {
 		pr_warn("Publication failed, local publication limit reached (%u)\n",
 			TIPC_MAX_PUBLICATIONS);
-		write_unlock_bh(&tipc_nametbl_lock);
+		spin_unlock_bh(&tipc_nametbl_lock);
 		return NULL;
 	}
 
@@ -670,7 +672,7 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper,
 		/* Any pending external events? */
 		tipc_named_process_backlog();
 	}
-	write_unlock_bh(&tipc_nametbl_lock);
+	spin_unlock_bh(&tipc_nametbl_lock);
 
 	if (buf)
 		named_cluster_distribute(buf);
@@ -685,7 +687,7 @@ int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key)
 	struct publication *publ;
 	struct sk_buff *skb = NULL;
 
-	write_lock_bh(&tipc_nametbl_lock);
+	spin_lock_bh(&tipc_nametbl_lock);
 	publ = tipc_nametbl_remove_publ(type, lower, tipc_own_addr, ref, key);
 	if (likely(publ)) {
 		tipc_nametbl->local_publ_count--;
@@ -693,13 +695,13 @@ int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key)
 		/* Any pending external events? */
 		tipc_named_process_backlog();
 		list_del_init(&publ->pport_list);
-		kfree(publ);
+		kfree_rcu(publ, rcu);
 	} else {
 		pr_err("Unable to remove local publication\n"
 		       "(type=%u, lower=%u, ref=%u, key=%u)\n",
 		       type, lower, ref, key);
 	}
-	write_unlock_bh(&tipc_nametbl_lock);
+	spin_unlock_bh(&tipc_nametbl_lock);
 
 	if (skb) {
 		named_cluster_distribute(skb);
@@ -717,7 +719,7 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s)
 	int index = hash(type);
 	struct name_seq *seq;
 
-	write_lock_bh(&tipc_nametbl_lock);
+	spin_lock_bh(&tipc_nametbl_lock);
 	seq = nametbl_find_seq(type);
 	if (!seq)
 		seq = tipc_nameseq_create(type,
@@ -730,7 +732,7 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s)
 		pr_warn("Failed to create subscription for {%u,%u,%u}\n",
 			s->seq.type, s->seq.lower, s->seq.upper);
 	}
-	write_unlock_bh(&tipc_nametbl_lock);
+	spin_unlock_bh(&tipc_nametbl_lock);
 }
 
 /**
@@ -740,24 +742,23 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s)
 {
 	struct name_seq *seq;
 
-	write_lock_bh(&tipc_nametbl_lock);
+	spin_lock_bh(&tipc_nametbl_lock);
 	seq = nametbl_find_seq(s->seq.type);
 	if (seq != NULL) {
 		spin_lock_bh(&seq->lock);
 		list_del_init(&s->nameseq_list);
 		if (!seq->first_free && list_empty(&seq->subscriptions)) {
-			hlist_del_init(&seq->ns_list);
-			spin_unlock_bh(&seq->lock);
+			hlist_del_init_rcu(&seq->ns_list);
 			kfree(seq->sseqs);
-			kfree(seq);
+			spin_unlock_bh(&seq->lock);
+			kfree_rcu(seq, rcu);
 		} else {
 			spin_unlock_bh(&seq->lock);
 		}
 	}
-	write_unlock_bh(&tipc_nametbl_lock);
+	spin_unlock_bh(&tipc_nametbl_lock);
 }
 
-
 /**
  * subseq_list - print specified sub-sequence contents into the given buffer
  */
@@ -880,7 +881,7 @@ static int nametbl_list(char *buf, int len, u32 depth_info,
 		upbound = ~0;
 		for (i = 0; i < TIPC_NAMETBL_SIZE; i++) {
 			seq_head = &tipc_nametbl->seq_hlist[i];
-			hlist_for_each_entry(seq, seq_head, ns_list) {
+			hlist_for_each_entry_rcu(seq, seq_head, ns_list) {
 				ret += nameseq_list(seq, buf + ret, len - ret,
 						   depth, seq->type,
 						   lowbound, upbound, i);
@@ -896,7 +897,7 @@ static int nametbl_list(char *buf, int len, u32 depth_info,
 		ret += nametbl_header(buf + ret, len - ret, depth);
 		i = hash(type);
 		seq_head = &tipc_nametbl->seq_hlist[i];
-		hlist_for_each_entry(seq, seq_head, ns_list) {
+		hlist_for_each_entry_rcu(seq, seq_head, ns_list) {
 			if (seq->type == type) {
 				ret += nameseq_list(seq, buf + ret, len - ret,
 						   depth, type,
@@ -928,11 +929,11 @@ struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space)
 	pb = TLV_DATA(rep_tlv);
 	pb_len = ULTRA_STRING_MAX_LEN;
 	argv = (struct tipc_name_table_query *)TLV_DATA(req_tlv_area);
-	read_lock_bh(&tipc_nametbl_lock);
+	rcu_read_lock();
 	str_len = nametbl_list(pb, pb_len, ntohl(argv->depth),
 			       ntohl(argv->type),
 			       ntohl(argv->lowbound), ntohl(argv->upbound));
-	read_unlock_bh(&tipc_nametbl_lock);
+	rcu_read_unlock();
 	str_len += 1;	/* for "\0" */
 	skb_put(buf, TLV_SPACE(str_len));
 	TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
@@ -974,13 +975,13 @@ static void tipc_purge_publications(struct name_seq *seq)
 	list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) {
 		tipc_nametbl_remove_publ(publ->type, publ->lower, publ->node,
 					 publ->ref, publ->key);
-		kfree(publ);
+		kfree_rcu(publ, rcu);
 	}
-	hlist_del_init(&seq->ns_list);
+	hlist_del_init_rcu(&seq->ns_list);
+	kfree(seq->sseqs);
 	spin_lock_bh(&seq->lock);
 
-	kfree(seq->sseqs);
-	kfree(seq);
+	kfree_rcu(seq, rcu);
 }
 
 void tipc_nametbl_stop(void)
@@ -988,22 +989,22 @@ void tipc_nametbl_stop(void)
 	u32 i;
 	struct name_seq *seq;
 	struct hlist_head *seq_head;
-	struct hlist_node *safe;
 
 	/* Verify name table is empty and purge any lingering
 	 * publications, then release the name table
 	 */
-	write_lock_bh(&tipc_nametbl_lock);
+	spin_lock_bh(&tipc_nametbl_lock);
 	for (i = 0; i < TIPC_NAMETBL_SIZE; i++) {
 		if (hlist_empty(&tipc_nametbl->seq_hlist[i]))
 			continue;
 		seq_head = &tipc_nametbl->seq_hlist[i];
-		hlist_for_each_entry_safe(seq, safe, seq_head, ns_list) {
+		hlist_for_each_entry_rcu(seq, seq_head, ns_list) {
 			tipc_purge_publications(seq);
 		}
 	}
-	write_unlock_bh(&tipc_nametbl_lock);
+	spin_unlock_bh(&tipc_nametbl_lock);
 
+	synchronize_net();
 	kfree(tipc_nametbl);
 
 }
@@ -1109,7 +1110,7 @@ static int __tipc_nl_seq_list(struct tipc_nl_msg *msg, u32 *last_type,
 			      u32 *last_lower, u32 *last_publ)
 {
 	struct hlist_head *seq_head;
-	struct name_seq *seq;
+	struct name_seq *seq = NULL;
 	int err;
 	int i;
 
@@ -1126,13 +1127,13 @@ static int __tipc_nl_seq_list(struct tipc_nl_msg *msg, u32 *last_type,
 			if (!seq)
 				return -EPIPE;
 		} else {
-			seq = hlist_entry_safe((seq_head)->first,
-					       struct name_seq, ns_list);
+			hlist_for_each_entry_rcu(seq, seq_head, ns_list)
+				break;
 			if (!seq)
 				continue;
 		}
 
-		hlist_for_each_entry_from(seq, ns_list) {
+		hlist_for_each_entry_from_rcu(seq, ns_list) {
 			spin_lock_bh(&seq->lock);
 			err = __tipc_nl_subseq_list(msg, seq, last_lower,
 						    last_publ);
@@ -1165,8 +1166,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	msg.portid = NETLINK_CB(cb->skb).portid;
 	msg.seq = cb->nlh->nlmsg_seq;
 
-	read_lock_bh(&tipc_nametbl_lock);
-
+	rcu_read_lock();
 	err = __tipc_nl_seq_list(&msg, &last_type, &last_lower, &last_publ);
 	if (!err) {
 		done = 1;
@@ -1179,8 +1179,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		 */
 		cb->prev_seq = 1;
 	}
-
-	read_unlock_bh(&tipc_nametbl_lock);
+	rcu_read_unlock();
 
 	cb->args[0] = last_type;
 	cb->args[1] = last_lower;
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index c1fd734eb0d5..5f0dee92010d 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -62,6 +62,7 @@ struct tipc_port_list;
  * @node_list: adjacent matching name seq publications with >= node scope
  * @cluster_list: adjacent matching name seq publications with >= cluster scope
  * @zone_list: adjacent matching name seq publications with >= zone scope
+ * @rcu: RCU callback head used for deferred freeing
  *
  * Note that the node list, cluster list, and zone list are circular lists.
  */
@@ -79,6 +80,7 @@ struct publication {
 	struct list_head node_list;
 	struct list_head cluster_list;
 	struct list_head zone_list;
+	struct rcu_head rcu;
 };
 
 /**
@@ -93,7 +95,7 @@ struct name_table {
 	u32 local_publ_count;
 };
 
-extern rwlock_t tipc_nametbl_lock;
+extern spinlock_t tipc_nametbl_lock;
 extern struct name_table *tipc_nametbl;
 
 int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
-- 
cgit v1.2.3-58-ga151


From 0c7aac854f52d3302b88fd599216a810f490ab1f Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@dev.mellanox.co.il>
Date: Tue, 2 Dec 2014 12:26:14 +0200
Subject: net/mlx5_core: Remove unused dev cap enum fields

These enumerations are not used so remove them.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/device.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 1d67fd32e71c..ea4f1c46f761 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -219,23 +219,15 @@ enum {
 };
 
 enum {
-	MLX5_DEV_CAP_FLAG_RC		= 1LL <<  0,
-	MLX5_DEV_CAP_FLAG_UC		= 1LL <<  1,
-	MLX5_DEV_CAP_FLAG_UD		= 1LL <<  2,
 	MLX5_DEV_CAP_FLAG_XRC		= 1LL <<  3,
-	MLX5_DEV_CAP_FLAG_SRQ		= 1LL <<  6,
 	MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1LL <<  8,
 	MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1LL <<  9,
 	MLX5_DEV_CAP_FLAG_APM		= 1LL << 17,
 	MLX5_DEV_CAP_FLAG_ATOMIC	= 1LL << 18,
 	MLX5_DEV_CAP_FLAG_BLOCK_MCAST	= 1LL << 23,
-	MLX5_DEV_CAP_FLAG_ON_DMND_PG	= 1LL << 24,
 	MLX5_DEV_CAP_FLAG_CQ_MODER	= 1LL << 29,
 	MLX5_DEV_CAP_FLAG_RESIZE_CQ	= 1LL << 30,
-	MLX5_DEV_CAP_FLAG_RESIZE_SRQ	= 1LL << 32,
 	MLX5_DEV_CAP_FLAG_DCT		= 1LL << 37,
-	MLX5_DEV_CAP_FLAG_REMOTE_FENCE	= 1LL << 38,
-	MLX5_DEV_CAP_FLAG_TLP_HINTS	= 1LL << 39,
 	MLX5_DEV_CAP_FLAG_SIG_HAND_OVER	= 1LL << 40,
 	MLX5_DEV_CAP_FLAG_CMDIF_CSUM	= 3LL << 46,
 };
-- 
cgit v1.2.3-58-ga151


From 9c0c112422a2a6b06fcddcaf21957676490cebba Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 3 Dec 2014 08:17:33 -0800
Subject: net: Add functions for handling padding frame and adding to length

This patch adds two new helper functions skb_put_padto and eth_skb_pad.
These functions deviate from the standard skb_pad or skb_padto in that they
will also update the length and tail pointers so that they reflect the
padding added to the frame.

The eth_skb_pad helper is meant to be used with Ethernet devices to update
either Rx or Tx frames so that they report the correct size.  The
skb_put_padto helper is meant to be used primarily in the transmit path for
network devices that need frames to be padded up to some minimum size and
don't wish to simply update the length somewhere external to the frame.

The motivation behind this is that there are a number of implementations
throughout the network device drivers that are all doing the same thing,
but each a little bit differently and as a result several implementations
contain bugs such as updating the length without updating the tail offset
and other similar issues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 12 ++++++++++++
 include/linux/skbuff.h      | 24 +++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 733980fce8e3..41c891d05f04 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -392,4 +392,16 @@ static inline unsigned long compare_ether_header(const void *a, const void *b)
 #endif
 }
 
+/**
+ * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame
+ * @skb: Buffer to pad
+ *
+ * An Ethernet frame should have a minimum size of 60 bytes.  This function
+ * takes short frames and pads them with zeros up to the 60 byte limit.
+ */
+static inline int eth_skb_pad(struct sk_buff *skb)
+{
+	return skb_put_padto(skb, ETH_ZLEN);
+}
+
 #endif	/* _LINUX_ETHERDEVICE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7691ad5b4771..d1e2575000b9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2461,7 +2461,6 @@ static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
  *	is untouched. Otherwise it is extended. Returns zero on
  *	success. The skb is freed on error.
  */
- 
 static inline int skb_padto(struct sk_buff *skb, unsigned int len)
 {
 	unsigned int size = skb->len;
@@ -2470,6 +2469,29 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len)
 	return skb_pad(skb, len - size);
 }
 
+/**
+ *	skb_put_padto - increase size and pad an skbuff up to a minimal size
+ *	@skb: buffer to pad
+ *	@len: minimal length
+ *
+ *	Pads up a buffer to ensure the trailing bytes exist and are
+ *	blanked. If the buffer already contains sufficient data it
+ *	is untouched. Otherwise it is extended. Returns zero on
+ *	success. The skb is freed on error.
+ */
+static inline int skb_put_padto(struct sk_buff *skb, unsigned int len)
+{
+	unsigned int size = skb->len;
+
+	if (unlikely(size < len)) {
+		len -= size;
+		if (skb_pad(skb, len))
+			return -ENOMEM;
+		__skb_put(skb, len);
+	}
+	return 0;
+}
+
 static inline int skb_add_data(struct sk_buff *skb,
 			       char __user *from, int copy)
 {
-- 
cgit v1.2.3-58-ga151


From 892311f66f2411b813ca631009356891a0c2b0a1 Mon Sep 17 00:00:00 2001
From: Eyal Perry <eyalpe@mellanox.com>
Date: Tue, 2 Dec 2014 18:12:10 +0200
Subject: ethtool: Support for configurable RSS hash function

This patch extends the set/get_rxfh ethtool-options for getting or
setting the RSS hash function.

It modifies drivers implementation of set/get_rxfh accordingly.

This change also delegates the responsibility of checking whether a
modification to a certain RX flow hash parameter is supported to the
driver implementation of set_rxfh.

User-kernel API is done through the new hfunc bitmask field in the
ethtool_rxfh struct. A bit set in the hfunc field is corresponding to an
index in the new string-set ETH_SS_RSS_HASH_FUNCS.

Got approval from most of the relevant driver maintainers that their
driver is using Toeplitz, and for the few that didn't answered, also
assumed it is Toeplitz.

Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Ariel Elior <ariel.elior@qlogic.com>
Cc: Prashant Sreedharan <prashant@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Cc: Hariprasad S <hariprasad@chelsio.com>
Cc: Sathya Perla <sathya.perla@emulex.com>
Cc: Subbu Seetharaman <subbu.seetharaman@emulex.com>
Cc: Ajit Khaparde <ajit.khaparde@emulex.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Bruce Allan <bruce.w.allan@intel.com>
Cc: Carolyn Wyborny <carolyn.wyborny@intel.com>
Cc: Don Skidmore <donald.c.skidmore@intel.com>
Cc: Greg Rose <gregory.v.rose@intel.com>
Cc: Matthew Vick <matthew.vick@intel.com>
Cc: John Ronciak <john.ronciak@intel.com>
Cc: Mitch Williams <mitch.a.williams@intel.com>
Cc: Amir Vadai <amirv@mellanox.com>
Cc: Solarflare linux maintainers <linux-net-drivers@solarflare.com>
Cc: Shradha Shah <sshah@solarflare.com>
Cc: Shreyas Bhatewara <sbhatewara@vmware.com>
Cc: "VMware, Inc." <pv-drivers@vmware.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Eyal Perry <eyalpe@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c       | 11 +++-
 .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c    | 20 ++++++-
 drivers/net/ethernet/broadcom/tg3.c                | 20 ++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    | 18 +++++-
 drivers/net/ethernet/emulex/benet/be_ethtool.c     | 12 +++-
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c   | 12 +++-
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 17 +++++-
 drivers/net/ethernet/intel/igb/igb_ethtool.c       | 16 ++++-
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c    | 13 +++-
 drivers/net/ethernet/sfc/ethtool.c                 | 18 ++++--
 drivers/net/vmxnet3/vmxnet3_ethtool.c              | 15 ++++-
 include/linux/ethtool.h                            | 42 +++++++++----
 include/uapi/linux/ethtool.h                       | 10 +++-
 net/core/ethtool.c                                 | 69 ++++++++++++----------
 14 files changed, 223 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 95d44538357f..ebf489351555 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -511,7 +511,8 @@ static u32 xgbe_get_rxfh_indir_size(struct net_device *netdev)
 	return ARRAY_SIZE(pdata->rss_table);
 }
 
-static int xgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key)
+static int xgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+			 u8 *hfunc)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	unsigned int i;
@@ -525,16 +526,22 @@ static int xgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key)
 	if (key)
 		memcpy(key, pdata->rss_key, sizeof(pdata->rss_key));
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+
 	return 0;
 }
 
 static int xgbe_set_rxfh(struct net_device *netdev, const u32 *indir,
-			 const u8 *key)
+			 const u8 *key, const u8 hfunc)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	struct xgbe_hw_if *hw_if = &pdata->hw_if;
 	unsigned int ret;
 
+	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+		return -EOPNOTSUPP;
+
 	if (indir) {
 		ret = hw_if->set_rss_lookup_table(pdata, indir);
 		if (ret)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 1edc931b1458..ffe4e003e636 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -3358,12 +3358,18 @@ static u32 bnx2x_get_rxfh_indir_size(struct net_device *dev)
 	return T_ETH_INDIRECTION_TABLE_SIZE;
 }
 
-static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key)
+static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
+			  u8 *hfunc)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 	u8 ind_table[T_ETH_INDIRECTION_TABLE_SIZE] = {0};
 	size_t i;
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+	if (!indir)
+		return 0;
+
 	/* Get the current configuration of the RSS indirection table */
 	bnx2x_get_rss_ind_table(&bp->rss_conf_obj, ind_table);
 
@@ -3383,11 +3389,21 @@ static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key)
 }
 
 static int bnx2x_set_rxfh(struct net_device *dev, const u32 *indir,
-			  const u8 *key)
+			  const u8 *key, const u8 hfunc)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 	size_t i;
 
+	/* We require at least one supported parameter to be changed and no
+	 * change in any of the unsupported parameters
+	 */
+	if (key ||
+	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+		return -EOPNOTSUPP;
+
+	if (!indir)
+		return 0;
+
 	for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++) {
 		/*
 		 * The same as in bnx2x_get_rxfh: we can't use a memcpy()
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 43fd1b72c1ea..bb48a610b72a 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -12561,22 +12561,38 @@ static u32 tg3_get_rxfh_indir_size(struct net_device *dev)
 	return size;
 }
 
-static int tg3_get_rxfh(struct net_device *dev, u32 *indir, u8 *key)
+static int tg3_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
 {
 	struct tg3 *tp = netdev_priv(dev);
 	int i;
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+	if (!indir)
+		return 0;
+
 	for (i = 0; i < TG3_RSS_INDIR_TBL_SIZE; i++)
 		indir[i] = tp->rss_ind_tbl[i];
 
 	return 0;
 }
 
-static int tg3_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key)
+static int tg3_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key,
+			const u8 hfunc)
 {
 	struct tg3 *tp = netdev_priv(dev);
 	size_t i;
 
+	/* We require at least one supported parameter to be changed and no
+	 * change in any of the unsupported parameters
+	 */
+	if (key ||
+	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+		return -EOPNOTSUPP;
+
+	if (!indir)
+		return 0;
+
 	for (i = 0; i < TG3_RSS_INDIR_TBL_SIZE; i++)
 		tp->rss_ind_tbl[i] = indir[i];
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 3aea82bb9039..e7342bc85026 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2923,21 +2923,35 @@ static u32 get_rss_table_size(struct net_device *dev)
 	return pi->rss_size;
 }
 
-static int get_rss_table(struct net_device *dev, u32 *p, u8 *key)
+static int get_rss_table(struct net_device *dev, u32 *p, u8 *key, u8 *hfunc)
 {
 	const struct port_info *pi = netdev_priv(dev);
 	unsigned int n = pi->rss_size;
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+	if (!p)
+		return 0;
 	while (n--)
 		p[n] = pi->rss[n];
 	return 0;
 }
 
-static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key)
+static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key,
+			 const u8 hfunc)
 {
 	unsigned int i;
 	struct port_info *pi = netdev_priv(dev);
 
+	/* We require at least one supported parameter to be changed and no
+	 * change in any of the unsupported parameters
+	 */
+	if (key ||
+	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+		return -EOPNOTSUPP;
+	if (!p)
+		return 0;
+
 	for (i = 0; i < pi->rss_size; i++)
 		pi->rss[i] = p[i];
 	if (pi->adapter->flags & FULL_INIT_DONE)
diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index e42a791c1835..73a500ccbf69 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -1171,7 +1171,8 @@ static u32 be_get_rxfh_key_size(struct net_device *netdev)
 	return RSS_HASH_KEY_LEN;
 }
 
-static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey)
+static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey,
+		       u8 *hfunc)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	int i;
@@ -1185,16 +1186,23 @@ static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey)
 	if (hkey)
 		memcpy(hkey, rss->rss_hkey, RSS_HASH_KEY_LEN);
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+
 	return 0;
 }
 
 static int be_set_rxfh(struct net_device *netdev, const u32 *indir,
-		       const u8 *hkey)
+		       const u8 *hkey, const u8 hfunc)
 {
 	int rc = 0, i, j;
 	struct be_adapter *adapter = netdev_priv(netdev);
 	u8 rsstable[RSS_INDIR_TABLE_LEN];
 
+	/* We do not allow change in unsupported parameters */
+	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+		return -EOPNOTSUPP;
+
 	if (indir) {
 		struct be_rx_obj *rxo;
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 2d04464e6aa3..651f53bc7376 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -916,11 +916,15 @@ static u32 fm10k_get_rssrk_size(struct net_device *netdev)
 	return FM10K_RSSRK_SIZE * FM10K_RSSRK_ENTRIES_PER_REG;
 }
 
-static int fm10k_get_rssh(struct net_device *netdev, u32 *indir, u8 *key)
+static int fm10k_get_rssh(struct net_device *netdev, u32 *indir, u8 *key,
+			  u8 *hfunc)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
 	int i, err;
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+
 	err = fm10k_get_reta(netdev, indir);
 	if (err || !key)
 		return err;
@@ -932,12 +936,16 @@ static int fm10k_get_rssh(struct net_device *netdev, u32 *indir, u8 *key)
 }
 
 static int fm10k_set_rssh(struct net_device *netdev, const u32 *indir,
-			  const u8 *key)
+			  const u8 *key, const u8 hfunc)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
 	struct fm10k_hw *hw = &interface->hw;
 	int i, err;
 
+	/* We do not allow change in unsupported parameters */
+	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+		return -EOPNOTSUPP;
+
 	err = fm10k_set_reta(netdev, indir);
 	if (err || !key)
 		return err;
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
index 69a269b23be6..69b97bac182c 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
@@ -627,13 +627,19 @@ static u32 i40evf_get_rxfh_indir_size(struct net_device *netdev)
  *
  * Reads the indirection table directly from the hardware. Always returns 0.
  **/
-static int i40evf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key)
+static int i40evf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+			   u8 *hfunc)
 {
 	struct i40evf_adapter *adapter = netdev_priv(netdev);
 	struct i40e_hw *hw = &adapter->hw;
 	u32 hlut_val;
 	int i, j;
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+	if (!indir)
+		return 0;
+
 	for (i = 0, j = 0; i <= I40E_VFQF_HLUT_MAX_INDEX; i++) {
 		hlut_val = rd32(hw, I40E_VFQF_HLUT(i));
 		indir[j++] = hlut_val & 0xff;
@@ -654,13 +660,20 @@ static int i40evf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key)
  * returns 0 after programming the table.
  **/
 static int i40evf_set_rxfh(struct net_device *netdev, const u32 *indir,
-			   const u8 *key)
+			   const u8 *key, const u8 hfunc)
 {
 	struct i40evf_adapter *adapter = netdev_priv(netdev);
 	struct i40e_hw *hw = &adapter->hw;
 	u32 hlut_val;
 	int i, j;
 
+	/* We do not allow change in unsupported parameters */
+	if (key ||
+	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+		return -EOPNOTSUPP;
+	if (!indir)
+		return 0;
+
 	for (i = 0, j = 0; i <= I40E_VFQF_HLUT_MAX_INDEX; i++) {
 		hlut_val = indir[j++];
 		hlut_val |= indir[j++] << 8;
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 02cfd3b14762..d5673eb90c54 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2842,11 +2842,16 @@ static u32 igb_get_rxfh_indir_size(struct net_device *netdev)
 	return IGB_RETA_SIZE;
 }
 
-static int igb_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key)
+static int igb_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+			u8 *hfunc)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	int i;
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+	if (!indir)
+		return 0;
 	for (i = 0; i < IGB_RETA_SIZE; i++)
 		indir[i] = adapter->rss_indir_tbl[i];
 
@@ -2889,13 +2894,20 @@ void igb_write_rss_indir_tbl(struct igb_adapter *adapter)
 }
 
 static int igb_set_rxfh(struct net_device *netdev, const u32 *indir,
-			const u8 *key)
+			const u8 *key, const u8 hfunc)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
 	int i;
 	u32 num_queues;
 
+	/* We do not allow change in unsupported parameters */
+	if (key ||
+	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+		return -EOPNOTSUPP;
+	if (!indir)
+		return 0;
+
 	num_queues = adapter->rss_queues;
 
 	switch (hw->mac.type) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index c45e06abc073..28c3fc5a0791 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -978,7 +978,8 @@ static u32 mlx4_en_get_rxfh_key_size(struct net_device *netdev)
 	return MLX4_EN_RSS_KEY_SIZE;
 }
 
-static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key)
+static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key,
+			    u8 *hfunc)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
@@ -990,16 +991,20 @@ static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key)
 	rss_rings = 1 << ilog2(rss_rings);
 
 	while (n--) {
+		if (!ring_index)
+			break;
 		ring_index[n] = rss_map->qps[n % rss_rings].qpn -
 			rss_map->base_qpn;
 	}
 	if (key)
 		memcpy(key, priv->rss_key, MLX4_EN_RSS_KEY_SIZE);
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
 	return err;
 }
 
 static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index,
-			    const u8 *key)
+			    const u8 *key, const u8 hfunc)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_en_dev *mdev = priv->mdev;
@@ -1008,6 +1013,10 @@ static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index,
 	int i;
 	int rss_rings = 0;
 
+	/* We do not allow change in unsupported parameters */
+	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+		return -EOPNOTSUPP;
+
 	/* Calculate RSS table size and make sure flows are spread evenly
 	 * between rings
 	 */
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index cad258a78708..4835bc0d0de8 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -1086,19 +1086,29 @@ static u32 efx_ethtool_get_rxfh_indir_size(struct net_device *net_dev)
 		0 : ARRAY_SIZE(efx->rx_indir_table));
 }
 
-static int efx_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key)
+static int efx_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key,
+				u8 *hfunc)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 
-	memcpy(indir, efx->rx_indir_table, sizeof(efx->rx_indir_table));
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+	if (indir)
+		memcpy(indir, efx->rx_indir_table, sizeof(efx->rx_indir_table));
 	return 0;
 }
 
-static int efx_ethtool_set_rxfh(struct net_device *net_dev,
-				const u32 *indir, const u8 *key)
+static int efx_ethtool_set_rxfh(struct net_device *net_dev, const u32 *indir,
+				const u8 *key, const u8 hfunc)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 
+	/* We do not allow change in unsupported parameters */
+	if (key ||
+	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+		return -EOPNOTSUPP;
+	if (!indir)
+		return 0;
 	memcpy(efx->rx_indir_table, indir, sizeof(efx->rx_indir_table));
 	efx->type->rx_push_rss_config(efx);
 	return 0;
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index b725fd9e7803..b7b53329d575 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -583,12 +583,16 @@ vmxnet3_get_rss_indir_size(struct net_device *netdev)
 }
 
 static int
-vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key)
+vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key, u8 *hfunc)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	struct UPT1_RSSConf *rssConf = adapter->rss_conf;
 	unsigned int n = rssConf->indTableSize;
 
+	if (hfunc)
+		*hfunc = ETH_RSS_HASH_TOP;
+	if (!p)
+		return 0;
 	while (n--)
 		p[n] = rssConf->indTable[n];
 	return 0;
@@ -596,13 +600,20 @@ vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key)
 }
 
 static int
-vmxnet3_set_rss(struct net_device *netdev, const u32 *p, const u8 *key)
+vmxnet3_set_rss(struct net_device *netdev, const u32 *p, const u8 *key,
+		const u8 hfunc)
 {
 	unsigned int i;
 	unsigned long flags;
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	struct UPT1_RSSConf *rssConf = adapter->rss_conf;
 
+	/* We do not allow change in unsupported parameters */
+	if (key ||
+	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+		return -EOPNOTSUPP;
+	if (!p)
+		return 0;
 	for (i = 0; i < rssConf->indTableSize; i++)
 		rssConf->indTable[i] = p[i];
 
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c1a2d60dfb82..653dc9c4ebac 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -59,6 +59,26 @@ enum ethtool_phys_id_state {
 	ETHTOOL_ID_OFF
 };
 
+enum {
+	ETH_RSS_HASH_TOP_BIT, /* Configurable RSS hash function - Toeplitz */
+	ETH_RSS_HASH_XOR_BIT, /* Configurable RSS hash function - Xor */
+
+	/*
+	 * Add your fresh new hash function bits above and remember to update
+	 * rss_hash_func_strings[] in ethtool.c
+	 */
+	ETH_RSS_HASH_FUNCS_COUNT
+};
+
+#define __ETH_RSS_HASH_BIT(bit)	((u32)1 << (bit))
+#define __ETH_RSS_HASH(name)	__ETH_RSS_HASH_BIT(ETH_RSS_HASH_##name##_BIT)
+
+#define ETH_RSS_HASH_TOP	__ETH_RSS_HASH(TOP)
+#define ETH_RSS_HASH_XOR	__ETH_RSS_HASH(XOR)
+
+#define ETH_RSS_HASH_UNKNOWN	0
+#define ETH_RSS_HASH_NO_CHANGE	0
+
 struct net_device;
 
 /* Some generic methods drivers may use in their ethtool_ops */
@@ -158,17 +178,14 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  *	Returns zero if not supported for this specific device.
  * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table.
  *	Returns zero if not supported for this specific device.
- * @get_rxfh: Get the contents of the RX flow hash indirection table and hash
- *	key.
- *	Will only be called if one or both of @get_rxfh_indir_size and
- *	@get_rxfh_key_size are implemented and return non-zero.
- *	Returns a negative error code or zero.
- * @set_rxfh: Set the contents of the RX flow hash indirection table and/or
- *	hash key.  In case only the indirection table or hash key is to be
- *	changed, the other argument will be %NULL.
- *	Will only be called if one or both of @get_rxfh_indir_size and
- *	@get_rxfh_key_size are implemented and return non-zero.
+ * @get_rxfh: Get the contents of the RX flow hash indirection table, hash key
+ *	and/or hash function.
  *	Returns a negative error code or zero.
+ * @set_rxfh: Set the contents of the RX flow hash indirection table, hash
+ *	key, and/or hash function.  Arguments which are set to %NULL or zero
+ *	will remain unchanged.
+ *	Returns a negative error code or zero. An error code must be returned
+ *	if at least one unsupported change was requested.
  * @get_channels: Get number of channels.
  * @set_channels: Set number of channels.  Returns a negative error code or
  *	zero.
@@ -241,9 +258,10 @@ struct ethtool_ops {
 	int	(*reset)(struct net_device *, u32 *);
 	u32	(*get_rxfh_key_size)(struct net_device *);
 	u32	(*get_rxfh_indir_size)(struct net_device *);
-	int	(*get_rxfh)(struct net_device *, u32 *indir, u8 *key);
+	int	(*get_rxfh)(struct net_device *, u32 *indir, u8 *key,
+			    u8 *hfunc);
 	int	(*set_rxfh)(struct net_device *, const u32 *indir,
-			    const u8 *key);
+			    const u8 *key, const u8 hfunc);
 	void	(*get_channels)(struct net_device *, struct ethtool_channels *);
 	int	(*set_channels)(struct net_device *, struct ethtool_channels *);
 	int	(*get_dump_flag)(struct net_device *, struct ethtool_dump *);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index eb2095b42fbb..5f66d9c2889d 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -534,6 +534,7 @@ struct ethtool_pauseparam {
  * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE;
  *	now deprecated
  * @ETH_SS_FEATURES: Device feature names
+ * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names
  */
 enum ethtool_stringset {
 	ETH_SS_TEST		= 0,
@@ -541,6 +542,7 @@ enum ethtool_stringset {
 	ETH_SS_PRIV_FLAGS,
 	ETH_SS_NTUPLE_FILTERS,
 	ETH_SS_FEATURES,
+	ETH_SS_RSS_HASH_FUNCS,
 };
 
 /**
@@ -884,6 +886,8 @@ struct ethtool_rxfh_indir {
  * @key_size: On entry, the array size of the user buffer for the hash key,
  *	which may be zero.  On return from %ETHTOOL_GRSSH, the size of the
  *	hardware hash key.
+ * @hfunc: Defines the current RSS hash function used by HW (or to be set to).
+ *	Valid values are one of the %ETH_RSS_HASH_*.
  * @rsvd:	Reserved for future extensions.
  * @rss_config: RX ring/queue index for each hash value i.e., indirection table
  *	of @indir_size __u32 elements, followed by hash key of @key_size
@@ -893,14 +897,16 @@ struct ethtool_rxfh_indir {
  * size should be returned.  For %ETHTOOL_SRSSH, an @indir_size of
  * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested
  * and a @indir_size of zero means the indir table should be reset to default
- * values.
+ * values. An hfunc of zero means that hash function setting is not requested.
  */
 struct ethtool_rxfh {
 	__u32   cmd;
 	__u32	rss_context;
 	__u32   indir_size;
 	__u32   key_size;
-	__u32	rsvd[2];
+	__u8	hfunc;
+	__u8	rsvd8[3];
+	__u32	rsvd32;
 	__u32   rss_config[0];
 };
 #define ETH_RXFH_INDIR_NO_CHANGE	0xffffffff
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 715f51f321e9..550892cd6b3f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",
 };
 
+static const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
+	[ETH_RSS_HASH_TOP_BIT] =	"toeplitz",
+	[ETH_RSS_HASH_XOR_BIT] =	"xor",
+};
+
 static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_gfeatures cmd = {
@@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
 	if (sset == ETH_SS_FEATURES)
 		return ARRAY_SIZE(netdev_features_strings);
 
+	if (sset == ETH_SS_RSS_HASH_FUNCS)
+		return ARRAY_SIZE(rss_hash_func_strings);
+
 	if (ops->get_sset_count && ops->get_strings)
 		return ops->get_sset_count(dev, sset);
 	else
@@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev,
 	if (stringset == ETH_SS_FEATURES)
 		memcpy(data, netdev_features_strings,
 			sizeof(netdev_features_strings));
+	else if (stringset == ETH_SS_RSS_HASH_FUNCS)
+		memcpy(data, rss_hash_func_strings,
+		       sizeof(rss_hash_func_strings));
 	else
 		/* ops->get_strings is valid because checked earlier */
 		ops->get_strings(dev, stringset, data);
@@ -618,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
 	if (!indir)
 		return -ENOMEM;
 
-	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);
+	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
 	if (ret)
 		goto out;
 
@@ -679,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
 			goto out;
 	}
 
-	ret = ops->set_rxfh(dev, indir, NULL);
+	ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
 
 out:
 	kfree(indir);
@@ -697,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	u32 total_size;
 	u32 indir_bytes;
 	u32 *indir = NULL;
+	u8 dev_hfunc = 0;
 	u8 *hkey = NULL;
 	u8 *rss_config;
 
-	if (!(dev->ethtool_ops->get_rxfh_indir_size ||
-	      dev->ethtool_ops->get_rxfh_key_size) ||
-	      !dev->ethtool_ops->get_rxfh)
+	if (!ops->get_rxfh)
 		return -EOPNOTSUPP;
 
 	if (ops->get_rxfh_indir_size)
@@ -710,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	if (ops->get_rxfh_key_size)
 		dev_key_size = ops->get_rxfh_key_size(dev);
 
-	if ((dev_key_size + dev_indir_size) == 0)
-		return -EOPNOTSUPP;
-
 	if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
 		return -EFAULT;
 	user_indir_size = rxfh.indir_size;
 	user_key_size = rxfh.key_size;
 
 	/* Check that reserved fields are 0 for now */
-	if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+	if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
+	    rxfh.rsvd8[2] || rxfh.rsvd32)
 		return -EINVAL;
 
 	rxfh.indir_size = dev_indir_size;
@@ -727,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
 		return -EFAULT;
 
-	/* If the user buffer size is 0, this is just a query for the
-	 * device table size and key size.  Otherwise, if the User size is
-	 * not equal to device table size or key size it's an error.
-	 */
-	if (!user_indir_size && !user_key_size)
-		return 0;
-
 	if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
 	    (user_key_size && (user_key_size != dev_key_size)))
 		return -EINVAL;
@@ -750,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	if (user_key_size)
 		hkey = rss_config + indir_bytes;
 
-	ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey);
-	if (!ret) {
-		if (copy_to_user(useraddr +
-				 offsetof(struct ethtool_rxfh, rss_config[0]),
-				 rss_config, total_size))
-			ret = -EFAULT;
-	}
+	ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+	if (ret)
+		goto out;
 
+	if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
+			 &dev_hfunc, sizeof(rxfh.hfunc))) {
+		ret = -EFAULT;
+	} else if (copy_to_user(useraddr +
+			      offsetof(struct ethtool_rxfh, rss_config[0]),
+			      rss_config, total_size)) {
+		ret = -EFAULT;
+	}
+out:
 	kfree(rss_config);
 
 	return ret;
@@ -776,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	u8 *rss_config;
 	u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
 
-	if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) ||
-	    !ops->get_rxnfc || !ops->set_rxfh)
+	if (!ops->get_rxnfc || !ops->set_rxfh)
 		return -EOPNOTSUPP;
 
 	if (ops->get_rxfh_indir_size)
 		dev_indir_size = ops->get_rxfh_indir_size(dev);
 	if (ops->get_rxfh_key_size)
 		dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
-	if ((dev_key_size + dev_indir_size) == 0)
-		return -EOPNOTSUPP;
 
 	if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
 		return -EFAULT;
 
 	/* Check that reserved fields are 0 for now */
-	if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+	if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
+	    rxfh.rsvd8[2] || rxfh.rsvd32)
 		return -EINVAL;
 
-	/* If either indir or hash key is valid, proceed further.
-	 * It is not valid to request that both be unchanged.
+	/* If either indir, hash key or function is valid, proceed further.
+	 * Must request at least one change: indir size, hash key or function.
 	 */
 	if ((rxfh.indir_size &&
 	     rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
 	     rxfh.indir_size != dev_indir_size) ||
 	    (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
 	    (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
-	     rxfh.key_size == 0))
+	     rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE))
 		return -EINVAL;
 
 	if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
@@ -845,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		}
 	}
 
-	ret = ops->set_rxfh(dev, indir, hkey);
+	ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
 
 out:
 	kfree(rss_config);
-- 
cgit v1.2.3-58-ga151


From 395eea6ccf2b253f81b4718ffbcae67d36fe2e69 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Wed, 3 Dec 2014 13:46:24 -0800
Subject: rtnetlink: delay RTM_DELLINK notification until after ndo_uninit()

The commit 56bfa7ee7c ("unregister_netdevice : move RTM_DELLINK to
until after ndo_uninit") tried to do this ealier but while doing so
it created a problem. Unfortunately the delayed rtmsg_ifinfo() also
delayed call to fill_info(). So this translated into asking driver
to remove private state and then query it's private state. This
could have catastropic consequences.

This change breaks the rtmsg_ifinfo() into two parts - one takes the
precise snapshot of the device by called fill_info() before calling
the ndo_uninit() and the second part sends the notification using
collected snapshot.

It was brought to notice when last link is deleted from an ipvlan device
when it has free-ed the port and the subsequent .fill_info() call is
trying to get the info from the port.

kernel: [  255.139429] ------------[ cut here ]------------
kernel: [  255.139439] WARNING: CPU: 12 PID: 11173 at net/core/rtnetlink.c:2238 rtmsg_ifinfo+0x100/0x110()
kernel: [  255.139493] Modules linked in: ipvlan bonding w1_therm ds2482 wire cdc_acm ehci_pci ehci_hcd i2c_dev i2c_i801 i2c_core msr cpuid bnx2x ptp pps_core mdio libcrc32c
kernel: [  255.139513] CPU: 12 PID: 11173 Comm: ip Not tainted 3.18.0-smp-DEV #167
kernel: [  255.139514] Hardware name: Intel RML,PCH/Ibis_QC_18, BIOS 1.0.10 05/15/2012
kernel: [  255.139515]  0000000000000009 ffff880851b6b828 ffffffff815d87f4 00000000000000e0
kernel: [  255.139516]  0000000000000000 ffff880851b6b868 ffffffff8109c29c 0000000000000000
kernel: [  255.139518]  00000000ffffffa6 00000000000000d0 ffffffff81aaf580 0000000000000011
kernel: [  255.139520] Call Trace:
kernel: [  255.139527]  [<ffffffff815d87f4>] dump_stack+0x46/0x58
kernel: [  255.139531]  [<ffffffff8109c29c>] warn_slowpath_common+0x8c/0xc0
kernel: [  255.139540]  [<ffffffff8109c2ea>] warn_slowpath_null+0x1a/0x20
kernel: [  255.139544]  [<ffffffff8150d570>] rtmsg_ifinfo+0x100/0x110
kernel: [  255.139547]  [<ffffffff814f78b5>] rollback_registered_many+0x1d5/0x2d0
kernel: [  255.139549]  [<ffffffff814f79cf>] unregister_netdevice_many+0x1f/0xb0
kernel: [  255.139551]  [<ffffffff8150acab>] rtnl_dellink+0xbb/0x110
kernel: [  255.139553]  [<ffffffff8150da90>] rtnetlink_rcv_msg+0xa0/0x240
kernel: [  255.139557]  [<ffffffff81329283>] ? rhashtable_lookup_compare+0x43/0x80
kernel: [  255.139558]  [<ffffffff8150d9f0>] ? __rtnl_unlock+0x20/0x20
kernel: [  255.139562]  [<ffffffff8152cb11>] netlink_rcv_skb+0xb1/0xc0
kernel: [  255.139563]  [<ffffffff8150a495>] rtnetlink_rcv+0x25/0x40
kernel: [  255.139565]  [<ffffffff8152c398>] netlink_unicast+0x178/0x230
kernel: [  255.139567]  [<ffffffff8152c75f>] netlink_sendmsg+0x30f/0x420
kernel: [  255.139571]  [<ffffffff814e0b0c>] sock_sendmsg+0x9c/0xd0
kernel: [  255.139575]  [<ffffffff811d1d7f>] ? rw_copy_check_uvector+0x6f/0x130
kernel: [  255.139577]  [<ffffffff814e11c9>] ? copy_msghdr_from_user+0x139/0x1b0
kernel: [  255.139578]  [<ffffffff814e1774>] ___sys_sendmsg+0x304/0x310
kernel: [  255.139581]  [<ffffffff81198723>] ? handle_mm_fault+0xca3/0xde0
kernel: [  255.139585]  [<ffffffff811ebc4c>] ? destroy_inode+0x3c/0x70
kernel: [  255.139589]  [<ffffffff8108e6ec>] ? __do_page_fault+0x20c/0x500
kernel: [  255.139597]  [<ffffffff811e8336>] ? dput+0xb6/0x190
kernel: [  255.139606]  [<ffffffff811f05f6>] ? mntput+0x26/0x40
kernel: [  255.139611]  [<ffffffff811d2b94>] ? __fput+0x174/0x1e0
kernel: [  255.139613]  [<ffffffff814e2129>] __sys_sendmsg+0x49/0x90
kernel: [  255.139615]  [<ffffffff814e2182>] SyS_sendmsg+0x12/0x20
kernel: [  255.139617]  [<ffffffff815df092>] system_call_fastpath+0x12/0x17
kernel: [  255.139619] ---[ end trace 5e6703e87d984f6b ]---

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Reported-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Roopa Prabhu <roopa@cumulusnetworks.com>
Cc: David S. Miller <davem@davemloft.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |  5 +++++
 net/core/dev.c            | 12 +++++++++---
 net/core/rtnetlink.c      | 25 +++++++++++++++++++++----
 3 files changed, 35 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 3b0419072f88..5db76a32fcab 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -17,6 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
 			      u32 id, long expires, u32 error);
 
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
+struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
+				       unsigned change, gfp_t flags);
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
+		       gfp_t flags);
+
 
 /* RTNL is used as a global lock for all changes to network configuration  */
 extern void rtnl_lock(void);
diff --git a/net/core/dev.c b/net/core/dev.c
index 0814a560e5f3..dd3bf582e6f0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5925,6 +5925,8 @@ static void rollback_registered_many(struct list_head *head)
 	synchronize_net();
 
 	list_for_each_entry(dev, head, unreg_list) {
+		struct sk_buff *skb = NULL;
+
 		/* Shutdown queueing discipline. */
 		dev_shutdown(dev);
 
@@ -5934,6 +5936,11 @@ static void rollback_registered_many(struct list_head *head)
 		*/
 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
+		if (!dev->rtnl_link_ops ||
+		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+						     GFP_KERNEL);
+
 		/*
 		 *	Flush the unicast and multicast chains
 		 */
@@ -5943,9 +5950,8 @@ static void rollback_registered_many(struct list_head *head)
 		if (dev->netdev_ops->ndo_uninit)
 			dev->netdev_ops->ndo_uninit(dev);
 
-		if (!dev->rtnl_link_ops ||
-		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
-			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+		if (skb)
+			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 
 		/* Notifier chain MUST detach us all upper devices. */
 		WARN_ON(netdev_has_any_upper_dev(dev));
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 61cb7e7cc3c7..a9be2c161702 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2245,8 +2245,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
-		  gfp_t flags)
+struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
+				       unsigned int change, gfp_t flags)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
@@ -2264,11 +2264,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
-	return;
+	return skb;
 errout:
 	if (err < 0)
 		rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+	return NULL;
+}
+
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
+{
+	struct net *net = dev_net(dev);
+
+	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
+}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+		  gfp_t flags)
+{
+	struct sk_buff *skb;
+
+	skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
+	if (skb)
+		rtmsg_ifinfo_send(skb, dev, flags);
 }
 EXPORT_SYMBOL(rtmsg_ifinfo);
 
-- 
cgit v1.2.3-58-ga151


From 6ffe75eb53564953e75c051e1c28676e1e56f385 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 3 Dec 2014 17:04:39 -0800
Subject: net: avoid two atomic operations in fast clones

Commit ce1a4ea3f125 ("net: avoid one atomic operation in skb_clone()")
took the wrong way to save one atomic operation.

It is actually possible to avoid two atomic operations, if we
do not change skb->fclone values, and only rely on clone_ref
content to signal if the clone is available or not.

skb_clone() can simply use the fast clone if clone_ref is 1.

kfree_skbmem() can avoid the atomic_dec_and_test() if clone_ref is 1.

Note that because we usually free the clone before the original skb,
this particular attempt is only done for the original skb to have better
branch prediction.

SKB_FCLONE_FREE is removed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Chris Mason <clm@fb.com>
Cc: Sabrina Dubroca <sd@queasysnail.net>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  3 +--
 net/core/skbuff.c      | 35 ++++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d1e2575000b9..e9281b5b7f59 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -344,7 +344,6 @@ enum {
 	SKB_FCLONE_UNAVAILABLE,	/* skb has no fclone (from head_cache) */
 	SKB_FCLONE_ORIG,	/* orig skb (from fclone_cache) */
 	SKB_FCLONE_CLONE,	/* companion fclone skb (from fclone_cache) */
-	SKB_FCLONE_FREE,	/* this companion fclone skb is available */
 };
 
 enum {
@@ -818,7 +817,7 @@ static inline bool skb_fclone_busy(const struct sock *sk,
 	fclones = container_of(skb, struct sk_buff_fclones, skb1);
 
 	return skb->fclone == SKB_FCLONE_ORIG &&
-	       fclones->skb2.fclone == SKB_FCLONE_CLONE &&
+	       atomic_read(&fclones->fclone_ref) > 1 &&
 	       fclones->skb2.sk == sk;
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 92116dfe827c..7a338fb55cc4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		skb->fclone = SKB_FCLONE_ORIG;
 		atomic_set(&fclones->fclone_ref, 1);
 
-		fclones->skb2.fclone = SKB_FCLONE_FREE;
+		fclones->skb2.fclone = SKB_FCLONE_CLONE;
 		fclones->skb2.pfmemalloc = pfmemalloc;
 	}
 out:
@@ -541,26 +541,27 @@ static void kfree_skbmem(struct sk_buff *skb)
 	switch (skb->fclone) {
 	case SKB_FCLONE_UNAVAILABLE:
 		kmem_cache_free(skbuff_head_cache, skb);
-		break;
+		return;
 
 	case SKB_FCLONE_ORIG:
 		fclones = container_of(skb, struct sk_buff_fclones, skb1);
-		if (atomic_dec_and_test(&fclones->fclone_ref))
-			kmem_cache_free(skbuff_fclone_cache, fclones);
-		break;
-
-	case SKB_FCLONE_CLONE:
-		fclones = container_of(skb, struct sk_buff_fclones, skb2);
 
-		/* The clone portion is available for
-		 * fast-cloning again.
+		/* We usually free the clone (TX completion) before original skb
+		 * This test would have no chance to be true for the clone,
+		 * while here, branch prediction will be good.
 		 */
-		skb->fclone = SKB_FCLONE_FREE;
+		if (atomic_read(&fclones->fclone_ref) == 1)
+			goto fastpath;
+		break;
 
-		if (atomic_dec_and_test(&fclones->fclone_ref))
-			kmem_cache_free(skbuff_fclone_cache, fclones);
+	default: /* SKB_FCLONE_CLONE */
+		fclones = container_of(skb, struct sk_buff_fclones, skb2);
 		break;
 	}
+	if (!atomic_dec_and_test(&fclones->fclone_ref))
+		return;
+fastpath:
+	kmem_cache_free(skbuff_fclone_cache, fclones);
 }
 
 static void skb_release_head_state(struct sk_buff *skb)
@@ -872,15 +873,15 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	struct sk_buff_fclones *fclones = container_of(skb,
 						       struct sk_buff_fclones,
 						       skb1);
-	struct sk_buff *n = &fclones->skb2;
+	struct sk_buff *n;
 
 	if (skb_orphan_frags(skb, gfp_mask))
 		return NULL;
 
 	if (skb->fclone == SKB_FCLONE_ORIG &&
-	    n->fclone == SKB_FCLONE_FREE) {
-		n->fclone = SKB_FCLONE_CLONE;
-		atomic_inc(&fclones->fclone_ref);
+	    atomic_read(&fclones->fclone_ref) == 1) {
+		n = &fclones->skb2;
+		atomic_set(&fclones->fclone_ref, 2);
 	} else {
 		if (skb_pfmemalloc(skb))
 			gfp_mask |= __GFP_MEMALLOC;
-- 
cgit v1.2.3-58-ga151


From dbfc4fb7d578d4f224faa6b60deb40804dfdc1b1 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sat, 6 Dec 2014 19:19:42 +0100
Subject: dst: no need to take reference on DST_NOCACHE dsts

Since commit f8864972126899 ("ipv4: fix dst race in sk_dst_get()")
DST_NOCACHE dst_entries get freed by RCU. So there is no need to get a
reference on them when we are in rcu protected sections.

Cc: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          | 23 ++---------------------
 net/core/dst.c                  | 24 ------------------------
 net/netfilter/ipvs/ip_vs_xmit.c |  4 ++--
 3 files changed, 4 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e9281b5b7f59..ef64cec42804 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -717,9 +717,6 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 	skb->_skb_refdst = (unsigned long)dst;
 }
 
-void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst,
-			 bool force);
-
 /**
  * skb_dst_set_noref - sets skb dst, hopefully, without taking reference
  * @skb: buffer
@@ -732,24 +729,8 @@ void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst,
  */
 static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
 {
-	__skb_dst_set_noref(skb, dst, false);
-}
-
-/**
- * skb_dst_set_noref_force - sets skb dst, without taking reference
- * @skb: buffer
- * @dst: dst entry
- *
- * Sets skb dst, assuming a reference was not taken on dst.
- * No reference is taken and no dst_release will be called. While for
- * cached dsts deferred reclaim is a basic feature, for entries that are
- * not cached it is caller's job to guarantee that last dst_release for
- * provided dst happens when nobody uses it, eg. after a RCU grace period.
- */
-static inline void skb_dst_set_noref_force(struct sk_buff *skb,
-					   struct dst_entry *dst)
-{
-	__skb_dst_set_noref(skb, dst, true);
+	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
 }
 
 /**
diff --git a/net/core/dst.c b/net/core/dst.c
index a028409ee438..e956ce6d1378 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -327,30 +327,6 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
 }
 EXPORT_SYMBOL(__dst_destroy_metrics_generic);
 
-/**
- * __skb_dst_set_noref - sets skb dst, without a reference
- * @skb: buffer
- * @dst: dst entry
- * @force: if force is set, use noref version even for DST_NOCACHE entries
- *
- * Sets skb dst, assuming a reference was not taken on dst
- * skb_dst_drop() should not dst_release() this dst
- */
-void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
-{
-	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
-	/* If dst not in cache, we must take a reference, because
-	 * dst_release() will destroy dst as soon as its refcount becomes zero
-	 */
-	if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
-		dst_hold(dst);
-		skb_dst_set(skb, dst);
-	} else {
-		skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
-	}
-}
-EXPORT_SYMBOL(__skb_dst_set_noref);
-
 /* Dirty hack. We did it in 2.2 (in __dst_free),
  * we have _very_ good reasons not to repeat
  * this mistake in 2.3, but we have no choice
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 1f933136155a..3aedbda7658a 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -343,7 +343,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
 	skb_dst_drop(skb);
 	if (noref) {
 		if (!local)
-			skb_dst_set_noref_force(skb, &rt->dst);
+			skb_dst_set_noref(skb, &rt->dst);
 		else
 			skb_dst_set(skb, dst_clone(&rt->dst));
 	} else
@@ -487,7 +487,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
 	skb_dst_drop(skb);
 	if (noref) {
 		if (!local)
-			skb_dst_set_noref_force(skb, &rt->dst);
+			skb_dst_set_noref(skb, &rt->dst);
 		else
 			skb_dst_set(skb, dst_clone(&rt->dst));
 	} else
-- 
cgit v1.2.3-58-ga151


From 2f33e7d59cb09d9b20d6648086b314f0ba61fb03 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Sat, 6 Dec 2014 15:53:04 -0800
Subject: netdevice: Add a function to check macvlan port

Similar to a check for macvlan device, netif_is_macvlan(), add
another function to check if a device is used as macvlan port.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 29c92ee9ed56..1f49aac258f9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3641,6 +3641,11 @@ static inline bool netif_is_macvlan(struct net_device *dev)
 	return dev->priv_flags & IFF_MACVLAN;
 }
 
+static inline bool netif_is_macvlan_port(struct net_device *dev)
+{
+	return dev->priv_flags & IFF_MACVLAN_PORT;
+}
+
 static inline bool netif_is_bond_master(struct net_device *dev)
 {
 	return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
-- 
cgit v1.2.3-58-ga151


From 5933fea7aa7237ba52d67c069c39ad5c3ab7a036 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Sat, 6 Dec 2014 15:53:33 -0800
Subject: ipvlan: move the device check function into netdevice.h

Move the port check [ipvlan_dev_master()] and device check
[ipvlan_dev_slave()] functions to netdevice.h and rename them
netif_is_ipvlan_port() and netif_is_ipvlan() resp. to be
consistent with macvlan api naming.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ipvlan/ipvlan.h      | 10 ----------
 drivers/net/ipvlan/ipvlan_main.c | 10 +++++-----
 include/linux/netdevice.h        | 10 ++++++++++
 3 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index c44d29eca6c0..2729f64b3e7e 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -107,16 +107,6 @@ static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
 	return rtnl_dereference(d->rx_handler_data);
 }
 
-static inline bool ipvlan_dev_master(struct net_device *d)
-{
-	return d->priv_flags & IFF_IPVLAN_MASTER;
-}
-
-static inline bool ipvlan_dev_slave(struct net_device *d)
-{
-	return d->priv_flags & IFF_IPVLAN_SLAVE;
-}
-
 void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev);
 void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval);
 void ipvlan_init_secret(void);
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index a66ff8197008..4f4099d5603d 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -446,11 +446,11 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
 	if (!phy_dev)
 		return -ENODEV;
 
-	if (ipvlan_dev_slave(phy_dev)) {
+	if (netif_is_ipvlan(phy_dev)) {
 		struct ipvl_dev *tmp = netdev_priv(phy_dev);
 
 		phy_dev = tmp->phy_dev;
-	} else if (!ipvlan_dev_master(phy_dev)) {
+	} else if (!netif_is_ipvlan_port(phy_dev)) {
 		err = ipvlan_port_create(phy_dev);
 		if (err < 0)
 			return err;
@@ -560,7 +560,7 @@ static int ipvlan_device_event(struct notifier_block *unused,
 	struct ipvl_port *port;
 	LIST_HEAD(lst_kill);
 
-	if (!ipvlan_dev_master(dev))
+	if (!netif_is_ipvlan_port(dev))
 		return NOTIFY_DONE;
 
 	port = ipvlan_port_get_rtnl(dev);
@@ -651,7 +651,7 @@ static int ipvlan_addr6_event(struct notifier_block *unused,
 	struct net_device *dev = (struct net_device *)if6->idev->dev;
 	struct ipvl_dev *ipvlan = netdev_priv(dev);
 
-	if (!ipvlan_dev_slave(dev))
+	if (!netif_is_ipvlan(dev))
 		return NOTIFY_DONE;
 
 	if (!ipvlan || !ipvlan->port)
@@ -723,7 +723,7 @@ static int ipvlan_addr4_event(struct notifier_block *unused,
 	struct ipvl_dev *ipvlan = netdev_priv(dev);
 	struct in_addr ip4_addr;
 
-	if (!ipvlan_dev_slave(dev))
+	if (!netif_is_ipvlan(dev))
 		return NOTIFY_DONE;
 
 	if (!ipvlan || !ipvlan->port)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1f49aac258f9..c31f74d76ebd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3646,6 +3646,16 @@ static inline bool netif_is_macvlan_port(struct net_device *dev)
 	return dev->priv_flags & IFF_MACVLAN_PORT;
 }
 
+static inline bool netif_is_ipvlan(struct net_device *dev)
+{
+	return dev->priv_flags & IFF_IPVLAN_SLAVE;
+}
+
+static inline bool netif_is_ipvlan_port(struct net_device *dev)
+{
+	return dev->priv_flags & IFF_IPVLAN_MASTER;
+}
+
 static inline bool netif_is_bond_master(struct net_device *dev)
 {
 	return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
-- 
cgit v1.2.3-58-ga151


From f4362a2c9524678f0459cf410403f8595e5cfce5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Nov 2014 13:26:06 -0500
Subject: switch tcp_sock->ucopy from iovec (ucopy.iov) to msghdr (ucopy.msg)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/tcp.h  | 2 +-
 net/ipv4/tcp.c       | 2 +-
 net/ipv4/tcp_input.c | 7 +++----
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f566b8567892..5d9cc9cd2855 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -162,7 +162,7 @@ struct tcp_sock {
 	struct {
 		struct sk_buff_head	prequeue;
 		struct task_struct	*task;
-		struct iovec		*iov;
+		struct msghdr		*msg;
 		int			memory;
 		int			len;
 	} ucopy;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dc13a3657e8e..4a96f3730170 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1729,7 +1729,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
 				user_recv = current;
 				tp->ucopy.task = user_recv;
-				tp->ucopy.iov = msg->msg_iov;
+				tp->ucopy.msg = msg;
 			}
 
 			tp->ucopy.len = len;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 69de1a1c05c9..075ab4d5af5e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4421,7 +4421,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			__set_current_state(TASK_RUNNING);
 
 			local_bh_enable();
-			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
+			if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
 				tp->ucopy.len -= chunk;
 				tp->copied_seq += chunk;
 				eaten = (chunk == skb->len);
@@ -4941,10 +4941,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 
 	local_bh_enable();
 	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
+		err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
 	else
-		err = skb_copy_and_csum_datagram_iovec(skb, hlen,
-						       tp->ucopy.iov);
+		err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
 
 	if (!err) {
 		tp->ucopy.len -= chunk;
-- 
cgit v1.2.3-58-ga151


From d838df2e5dcbb6ed4d82854869e9a30f9aeef6da Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Nov 2014 19:32:50 -0500
Subject: vmci: propagate msghdr all way down to __qp_memcpy_from_queue()

... and switch it to memcpy_to_msg()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/misc/vmw_vmci/vmci_queue_pair.c | 17 +++++++++--------
 include/linux/vmw_vmci_api.h            |  5 +++--
 net/vmw_vsock/vmci_transport.c          |  4 ++--
 3 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 1b7b303085d2..7aaaf51e1596 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -27,6 +27,7 @@
 #include <linux/uio.h>
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
+#include <linux/skbuff.h>
 
 #include "vmci_handle_array.h"
 #include "vmci_queue_pair.h"
@@ -429,11 +430,11 @@ static int __qp_memcpy_from_queue(void *dest,
 			to_copy = size - bytes_copied;
 
 		if (is_iovec) {
-			struct iovec *iov = (struct iovec *)dest;
+			struct msghdr *msg = dest;
 			int err;
 
 			/* The iovec will track bytes_copied internally. */
-			err = memcpy_toiovec(iov, (u8 *)va + page_offset,
+			err = memcpy_to_msg(msg, (u8 *)va + page_offset,
 					     to_copy);
 			if (err != 0) {
 				if (kernel_if->host)
@@ -3264,13 +3265,13 @@ EXPORT_SYMBOL_GPL(vmci_qpair_enquev);
  * of bytes dequeued or < 0 on error.
  */
 ssize_t vmci_qpair_dequev(struct vmci_qp *qpair,
-			  void *iov,
+			  struct msghdr *msg,
 			  size_t iov_size,
 			  int buf_type)
 {
 	ssize_t result;
 
-	if (!qpair || !iov)
+	if (!qpair)
 		return VMCI_ERROR_INVALID_ARGS;
 
 	qp_lock(qpair);
@@ -3279,7 +3280,7 @@ ssize_t vmci_qpair_dequev(struct vmci_qp *qpair,
 		result = qp_dequeue_locked(qpair->produce_q,
 					   qpair->consume_q,
 					   qpair->consume_q_size,
-					   iov, iov_size,
+					   msg, iov_size,
 					   qp_memcpy_from_queue_iov,
 					   true);
 
@@ -3308,13 +3309,13 @@ EXPORT_SYMBOL_GPL(vmci_qpair_dequev);
  * of bytes peeked or < 0 on error.
  */
 ssize_t vmci_qpair_peekv(struct vmci_qp *qpair,
-			 void *iov,
+			 struct msghdr *msg,
 			 size_t iov_size,
 			 int buf_type)
 {
 	ssize_t result;
 
-	if (!qpair || !iov)
+	if (!qpair)
 		return VMCI_ERROR_INVALID_ARGS;
 
 	qp_lock(qpair);
@@ -3323,7 +3324,7 @@ ssize_t vmci_qpair_peekv(struct vmci_qp *qpair,
 		result = qp_dequeue_locked(qpair->produce_q,
 					   qpair->consume_q,
 					   qpair->consume_q_size,
-					   iov, iov_size,
+					   msg, iov_size,
 					   qp_memcpy_from_queue_iov,
 					   false);
 
diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h
index 023430e265fe..5691f752ce8f 100644
--- a/include/linux/vmw_vmci_api.h
+++ b/include/linux/vmw_vmci_api.h
@@ -24,6 +24,7 @@
 #define VMCI_KERNEL_API_VERSION_2 2
 #define VMCI_KERNEL_API_VERSION   VMCI_KERNEL_API_VERSION_2
 
+struct msghdr;
 typedef void (vmci_device_shutdown_fn) (void *device_registration,
 					void *user_data);
 
@@ -75,8 +76,8 @@ ssize_t vmci_qpair_peek(struct vmci_qp *qpair, void *buf, size_t buf_size,
 ssize_t vmci_qpair_enquev(struct vmci_qp *qpair,
 			  void *iov, size_t iov_size, int mode);
 ssize_t vmci_qpair_dequev(struct vmci_qp *qpair,
-			  void *iov, size_t iov_size, int mode);
-ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, void *iov, size_t iov_size,
+			  struct msghdr *msg, size_t iov_size, int mode);
+ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size,
 			 int mode);
 
 #endif /* !__VMW_VMCI_API_H__ */
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index c1c038952973..20a0ba3bfff6 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1840,9 +1840,9 @@ static ssize_t vmci_transport_stream_dequeue(
 	int flags)
 {
 	if (flags & MSG_PEEK)
-		return vmci_qpair_peekv(vmci_trans(vsk)->qpair, msg->msg_iov, len, 0);
+		return vmci_qpair_peekv(vmci_trans(vsk)->qpair, msg, len, 0);
 	else
-		return vmci_qpair_dequev(vmci_trans(vsk)->qpair, msg->msg_iov, len, 0);
+		return vmci_qpair_dequev(vmci_trans(vsk)->qpair, msg, len, 0);
 }
 
 static ssize_t vmci_transport_stream_enqueue(
-- 
cgit v1.2.3-58-ga151


From c0371da6047abd261bc483c744dbc7d81a116172 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Nov 2014 10:42:55 -0500
Subject: put iov_iter into msghdr

Note that the code _using_ ->msg_iter at that point will be very
unhappy with anything other than unshifted iovec-backed iov_iter.
We still need to convert users to proper primitives.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 crypto/algif_hash.c            |  4 ++--
 crypto/algif_skcipher.c        |  4 ++--
 drivers/net/macvtap.c          |  8 ++------
 drivers/net/tun.c              |  8 ++------
 drivers/vhost/net.c            |  8 +++-----
 fs/afs/rxrpc.c                 | 14 ++++++--------
 include/linux/skbuff.h         | 16 ++++++++++------
 include/linux/socket.h         |  3 +--
 include/net/bluetooth/l2cap.h  |  2 +-
 include/net/udplite.h          |  3 ++-
 net/atm/common.c               |  5 +----
 net/bluetooth/6lowpan.c        |  6 +++---
 net/bluetooth/a2mp.c           |  3 +--
 net/bluetooth/smp.c            |  3 +--
 net/caif/caif_socket.c         |  2 +-
 net/compat.c                   | 10 ++++++----
 net/ipv4/ip_output.c           |  6 ++++--
 net/ipv4/ping.c                |  3 ++-
 net/ipv4/raw.c                 |  3 ++-
 net/ipv4/tcp.c                 |  6 +++---
 net/ipv4/tcp_output.c          |  2 +-
 net/ipv6/ping.c                |  3 ++-
 net/ipv6/raw.c                 |  3 ++-
 net/netlink/af_netlink.c       |  2 +-
 net/packet/af_packet.c         |  7 ++-----
 net/rds/recv.c                 |  7 ++++---
 net/rds/send.c                 |  4 +---
 net/rxrpc/ar-output.c          |  8 +++-----
 net/sctp/socket.c              |  5 +----
 net/socket.c                   | 27 ++++++++++++---------------
 net/tipc/msg.c                 |  4 ++--
 net/unix/af_unix.c             | 10 ++--------
 net/vmw_vsock/vmci_transport.c |  3 ++-
 33 files changed, 90 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 35c93ff11f35..83cd2cc49c9f 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -42,7 +42,7 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
 	struct alg_sock *ask = alg_sk(sk);
 	struct hash_ctx *ctx = ask->private;
 	unsigned long iovlen;
-	struct iovec *iov;
+	const struct iovec *iov;
 	long copied = 0;
 	int err;
 
@@ -58,7 +58,7 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
 
 	ctx->more = 0;
 
-	for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0;
+	for (iov = msg->msg_iter.iov, iovlen = msg->msg_iter.nr_segs; iovlen > 0;
 	     iovlen--, iov++) {
 		unsigned long seglen = iov->iov_len;
 		char __user *from = iov->iov_base;
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index c3b482bee208..4f45dab24648 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -429,13 +429,13 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 	struct skcipher_sg_list *sgl;
 	struct scatterlist *sg;
 	unsigned long iovlen;
-	struct iovec *iov;
+	const struct iovec *iov;
 	int err = -EAGAIN;
 	int used;
 	long copied = 0;
 
 	lock_sock(sk);
-	for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0;
+	for (iov = msg->msg_iter.iov, iovlen = msg->msg_iter.nr_segs; iovlen > 0;
 	     iovlen--, iov++) {
 		unsigned long seglen = iov->iov_len;
 		char __user *from = iov->iov_base;
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index ba1e5db2152e..2c157cced81f 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -1095,9 +1095,7 @@ static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
 			   struct msghdr *m, size_t total_len)
 {
 	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	struct iov_iter from;
-	iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, total_len);
-	return macvtap_get_user(q, m, &from, m->msg_flags & MSG_DONTWAIT);
+	return macvtap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
 }
 
 static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
@@ -1105,12 +1103,10 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
 			   int flags)
 {
 	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	struct iov_iter to;
 	int ret;
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
 		return -EINVAL;
-	iov_iter_init(&to, READ, m->msg_iov, m->msg_iovlen, total_len);
-	ret = macvtap_do_read(q, &to, flags & MSG_DONTWAIT);
+	ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9c58286b8a42..f3e992ed87ac 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1449,13 +1449,11 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
 	int ret;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = __tun_get(tfile);
-	struct iov_iter from;
 
 	if (!tun)
 		return -EBADFD;
 
-	iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, total_len);
-	ret = tun_get_user(tun, tfile, m->msg_control, &from,
+	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT);
 	tun_put(tun);
 	return ret;
@@ -1467,7 +1465,6 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = __tun_get(tfile);
-	struct iov_iter to;
 	int ret;
 
 	if (!tun)
@@ -1482,8 +1479,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	iov_iter_init(&to, READ, m->msg_iov, m->msg_iovlen, total_len);
-	ret = tun_do_read(tun, tfile, &to, flags & MSG_DONTWAIT);
+	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8dae2f724a35..9f06e70a2631 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -342,7 +342,6 @@ static void handle_tx(struct vhost_net *net)
 		.msg_namelen = 0,
 		.msg_control = NULL,
 		.msg_controllen = 0,
-		.msg_iov = vq->iov,
 		.msg_flags = MSG_DONTWAIT,
 	};
 	size_t len, total_len = 0;
@@ -396,8 +395,8 @@ static void handle_tx(struct vhost_net *net)
 		}
 		/* Skip header. TODO: support TSO. */
 		s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out);
-		msg.msg_iovlen = out;
 		len = iov_length(vq->iov, out);
+		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for TX: "
@@ -562,7 +561,6 @@ static void handle_rx(struct vhost_net *net)
 		.msg_namelen = 0,
 		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
 		.msg_controllen = 0,
-		.msg_iov = vq->iov,
 		.msg_flags = MSG_DONTWAIT,
 	};
 	struct virtio_net_hdr_mrg_rxbuf hdr = {
@@ -600,7 +598,7 @@ static void handle_rx(struct vhost_net *net)
 			break;
 		/* On overrun, truncate and discard */
 		if (unlikely(headcount > UIO_MAXIOV)) {
-			msg.msg_iovlen = 1;
+			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
 			err = sock->ops->recvmsg(NULL, sock, &msg,
 						 1, MSG_DONTWAIT | MSG_TRUNC);
 			pr_debug("Discarded rx packet: len %zd\n", sock_len);
@@ -626,7 +624,7 @@ static void handle_rx(struct vhost_net *net)
 			/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
 			 * needed because recvmsg can modify msg_iov. */
 			copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in);
-		msg.msg_iovlen = in;
+		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, sock_len);
 		err = sock->ops->recvmsg(NULL, sock, &msg,
 					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
 		/* Userspace might have consumed the packet meanwhile:
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 03a3beb17004..06e14bfb3496 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
 
 			_debug("- range %u-%u%s",
 			       offset, to, msg->msg_flags ? " [more]" : "");
-			msg->msg_iov = (struct iovec *) iov;
-			msg->msg_iovlen = 1;
+			iov_iter_init(&msg->msg_iter, WRITE,
+				      (struct iovec *) iov, 1, to - offset);
 
 			/* have to change the state *before* sending the last
 			 * packet as RxRPC might give us the reply before it
@@ -384,8 +384,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	msg.msg_iov		= (struct iovec *) iov;
-	msg.msg_iovlen		= 1;
+	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+		      call->request_size);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= (call->send_pages ? MSG_MORE : 0);
@@ -778,8 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	iov[0].iov_len		= 0;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	msg.msg_iov		= iov;
-	msg.msg_iovlen		= 0;
+	iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0);	/* WTF? */
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
@@ -815,8 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	iov[0].iov_len		= len;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	msg.msg_iov		= iov;
-	msg.msg_iovlen		= 1;
+	iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ef64cec42804..52cf1bdac0d8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2646,22 +2646,24 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait);
 int skb_copy_datagram_iovec(const struct sk_buff *from, int offset,
 			    struct iovec *to, int size);
+int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
+			   struct iov_iter *to, int size);
 static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
 					struct msghdr *msg, int size)
 {
-	return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size);
+	/* XXX: stripping const */
+	return skb_copy_datagram_iovec(from, offset, (struct iovec *)msg->msg_iter.iov, size);
 }
 int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen,
 				     struct iovec *iov);
 static inline int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
 			    struct msghdr *msg)
 {
-	return skb_copy_and_csum_datagram_iovec(skb, hlen, msg->msg_iov);
+	/* XXX: stripping const */
+	return skb_copy_and_csum_datagram_iovec(skb, hlen, (struct iovec *)msg->msg_iter.iov);
 }
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
-int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
-			   struct iov_iter *to, int size);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
@@ -2689,12 +2691,14 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
 
 static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
 {
-	return memcpy_fromiovec(data, msg->msg_iov, len);
+	/* XXX: stripping const */
+	return memcpy_fromiovec(data, (struct iovec *)msg->msg_iter.iov, len);
 }
 
 static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
 {
-	return memcpy_toiovec(msg->msg_iov, data, len);
+	/* XXX: stripping const */
+	return memcpy_toiovec((struct iovec *)msg->msg_iter.iov, data, len);
 }
 
 struct skb_checksum_ops {
diff --git a/include/linux/socket.h b/include/linux/socket.h
index de5222832be4..048d6d6eed6d 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -47,8 +47,7 @@ struct linger {
 struct msghdr {
 	void		*msg_name;	/* ptr to socket address structure */
 	int		msg_namelen;	/* size of socket address structure */
-	struct iovec	*msg_iov;	/* scatter/gather array */
-	__kernel_size_t	msg_iovlen;	/* # elements in msg_iov */
+	struct iov_iter	msg_iter;	/* data */
 	void		*msg_control;	/* ancillary data */
 	__kernel_size_t	msg_controllen;	/* ancillary data buffer length */
 	unsigned int	msg_flags;	/* flags on received message */
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 4e23674d3649..bca6fc0a3196 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -911,7 +911,7 @@ static inline int l2cap_chan_no_memcpy_fromiovec(struct l2cap_chan *chan,
 	/* Following is safe since for compiler definitions of kvec and
 	 * iovec are identical, yielding the same in-core layout and alignment
 	 */
-	struct kvec *vec = (struct kvec *)msg->msg_iov;
+	struct kvec *vec = (struct kvec *)msg->msg_iter.iov;
 
 	while (len > 0) {
 		if (vec->iov_len) {
diff --git a/include/net/udplite.h b/include/net/udplite.h
index d5baaba65b46..ae7c8d1fbcad 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -20,7 +20,8 @@ static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
 				      int len, int odd, struct sk_buff *skb)
 {
 	struct msghdr *msg = from;
-	return memcpy_fromiovecend(to, msg->msg_iov, offset, len);
+	/* XXX: stripping const */
+	return memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len);
 }
 
 /* Designate sk as UDP-Lite socket */
diff --git a/net/atm/common.c b/net/atm/common.c
index f59112944c91..b84057e41bd6 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -577,9 +577,6 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
 	struct atm_vcc *vcc;
 	struct sk_buff *skb;
 	int eff, error;
-	struct iov_iter from;
-
-	iov_iter_init(&from, WRITE, m->msg_iov, m->msg_iovlen, size);
 
 	lock_sock(sk);
 	if (sock->state != SS_CONNECTED) {
@@ -634,7 +631,7 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
 		goto out;
 	skb->dev = NULL; /* for paths shared with net_device interfaces */
 	ATM_SKB(skb)->atm_options = vcc->atm_options;
-	if (copy_from_iter(skb_put(skb, size), size, &from) != size) {
+	if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) {
 		kfree_skb(skb);
 		error = -EFAULT;
 		goto out;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index bdcaefd2db12..d8c67a5e7a02 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -537,12 +537,12 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb,
 	 */
 	chan->data = skb;
 
-	memset(&msg, 0, sizeof(msg));
-	msg.msg_iov = (struct iovec *) &iv;
-	msg.msg_iovlen = 1;
 	iv.iov_base = skb->data;
 	iv.iov_len = skb->len;
 
+	memset(&msg, 0, sizeof(msg));
+	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *) &iv, 1, skb->len);
+
 	err = l2cap_chan_send(chan, &msg, skb->len);
 	if (err > 0) {
 		netdev->stats.tx_bytes += err;
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
index 5dcade511fdb..716d2a388858 100644
--- a/net/bluetooth/a2mp.c
+++ b/net/bluetooth/a2mp.c
@@ -60,8 +60,7 @@ void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data)
 
 	memset(&msg, 0, sizeof(msg));
 
-	msg.msg_iov = (struct iovec *) &iv;
-	msg.msg_iovlen = 1;
+	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)&iv, 1, total_len);
 
 	l2cap_chan_send(chan, &msg, total_len);
 
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 069b76e03b57..21f555b4df17 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -268,8 +268,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data)
 
 	memset(&msg, 0, sizeof(msg));
 
-	msg.msg_iov = (struct iovec *) &iv;
-	msg.msg_iovlen = 2;
+	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iv, 2, 1 + len);
 
 	l2cap_chan_send(chan, &msg, 1 + len);
 
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index ac618b0b8a4f..769b185fefbd 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -535,7 +535,7 @@ static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto err;
 
 	ret = -EINVAL;
-	if (unlikely(msg->msg_iov->iov_base == NULL))
+	if (unlikely(msg->msg_iter.iov->iov_base == NULL))
 		goto err;
 	noblock = msg->msg_flags & MSG_DONTWAIT;
 
diff --git a/net/compat.c b/net/compat.c
index 062f157d2a6b..3236b4167a32 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -37,13 +37,14 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg,
 			  struct iovec **iov)
 {
 	compat_uptr_t uaddr, uiov, tmp3;
+	compat_size_t nr_segs;
 	ssize_t err;
 
 	if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
 	    __get_user(uaddr, &umsg->msg_name) ||
 	    __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
 	    __get_user(uiov, &umsg->msg_iov) ||
-	    __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
+	    __get_user(nr_segs, &umsg->msg_iovlen) ||
 	    __get_user(tmp3, &umsg->msg_control) ||
 	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
 	    __get_user(kmsg->msg_flags, &umsg->msg_flags))
@@ -68,14 +69,15 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg,
 		kmsg->msg_namelen = 0;
 	}
 
-	if (kmsg->msg_iovlen > UIO_MAXIOV)
+	if (nr_segs > UIO_MAXIOV)
 		return -EMSGSIZE;
 
 	err = compat_rw_copy_check_uvector(save_addr ? READ : WRITE,
-					   compat_ptr(uiov), kmsg->msg_iovlen,
+					   compat_ptr(uiov), nr_segs,
 					   UIO_FASTIOV, *iov, iov);
 	if (err >= 0)
-		kmsg->msg_iov = *iov;
+		iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE,
+			      *iov, nr_segs, err);
 	return err;
 }
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cdedcf144463..b50861b22b6b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -755,11 +755,13 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
 	struct msghdr *msg = from;
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		if (memcpy_fromiovecend(to, msg->msg_iov, offset, len) < 0)
+		/* XXX: stripping const */
+		if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0)
 			return -EFAULT;
 	} else {
 		__wsum csum = 0;
-		if (csum_partial_copy_fromiovecend(to, msg->msg_iov, offset, len, &csum) < 0)
+		/* XXX: stripping const */
+		if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0)
 			return -EFAULT;
 		skb->csum = csum_block_add(skb->csum, csum, odd);
 	}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8dd4ae0424fc..c0d82f78d364 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -811,7 +811,8 @@ back_from_confirm:
 	pfh.icmph.checksum = 0;
 	pfh.icmph.un.echo.id = inet->inet_sport;
 	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
-	pfh.iov = msg->msg_iov;
+	/* XXX: stripping const */
+	pfh.iov = (struct iovec *)msg->msg_iter.iov;
 	pfh.wcheck = 0;
 	pfh.family = AF_INET;
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5d83bd2fcedb..0bb68df5055d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -625,7 +625,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 back_from_confirm:
 
 	if (inet->hdrincl)
-		err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len,
+		/* XXX: stripping const */
+		err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len,
 				      &rt, msg->msg_flags);
 
 	 else {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4a96f3730170..54ba6209eea9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1085,7 +1085,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
-	struct iovec *iov;
+	const struct iovec *iov;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int iovlen, flags, err, copied = 0;
@@ -1136,8 +1136,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	mss_now = tcp_send_mss(sk, &size_goal, flags);
 
 	/* Ok commence sending. */
-	iovlen = msg->msg_iovlen;
-	iov = msg->msg_iov;
+	iovlen = msg->msg_iter.nr_segs;
+	iov = msg->msg_iter.iov;
 	copied = 0;
 
 	err = -EPIPE;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f5bd4bd3f7e6..3e225b03eb95 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3050,7 +3050,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	syn_data->ip_summed = CHECKSUM_PARTIAL;
 	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
 	if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
-					 fo->data->msg_iov, 0, space))) {
+					 fo->data->msg_iter.iov, 0, space))) {
 		kfree_skb(syn_data);
 		goto fallback;
 	}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 5b7a1ed2aba9..2d3148378a1f 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -163,7 +163,8 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	pfh.icmph.checksum = 0;
 	pfh.icmph.un.echo.id = inet->inet_sport;
 	pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence;
-	pfh.iov = msg->msg_iov;
+	/* XXX: stripping const */
+	pfh.iov = (struct iovec *)msg->msg_iter.iov;
 	pfh.wcheck = 0;
 	pfh.family = AF_INET6;
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 11a9283fda51..ee25631f8c29 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -886,7 +886,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 
 back_from_confirm:
 	if (inet->hdrincl)
-		err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags);
+		/* XXX: stripping const */
+		err = rawv6_send_hdrinc(sk, (struct iovec *)msg->msg_iter.iov, len, &fl6, &dst, msg->msg_flags);
 	else {
 		lock_sock(sk);
 		err = ip6_append_data(sk, raw6_getfrag, &rfv,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 63aa5c8acf12..cc9bcf008b03 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2305,7 +2305,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 
 	if (netlink_tx_is_mmaped(sk) &&
-	    msg->msg_iov->iov_base == NULL) {
+	    msg->msg_iter.iov->iov_base == NULL) {
 		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
 					   siocb);
 		goto out;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index efa844501136..ed2e620b61df 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2408,11 +2408,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	unsigned short gso_type = 0;
 	int hlen, tlen;
 	int extra_len = 0;
-	struct iov_iter from;
 	ssize_t n;
 
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
-
 	/*
 	 *	Get and verify the address.
 	 */
@@ -2451,7 +2448,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 		len -= vnet_hdr_len;
 
 		err = -EFAULT;
-		n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &from);
+		n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
 		if (n != vnet_hdr_len)
 			goto out_unlock;
 
@@ -2522,7 +2519,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	}
 
 	/* Returns -EFAULT on error */
-	err = skb_copy_datagram_from_iter(skb, offset, &from, len);
+	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
 	if (err)
 		goto out_free;
 
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 47d7b1029b33..f9ec1acd801c 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -404,7 +404,6 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
 	struct rds_incoming *inc = NULL;
-	struct iov_iter to;
 
 	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
 	timeo = sock_rcvtimeo(sk, nonblock);
@@ -415,6 +414,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 
 	while (1) {
+		struct iov_iter save;
 		/* If there are pending notifications, do those - and nothing else */
 		if (!list_empty(&rs->rs_notify_queue)) {
 			ret = rds_notify_queue_get(rs, msg);
@@ -450,8 +450,8 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
 			 &inc->i_conn->c_faddr,
 			 ntohs(inc->i_hdr.h_sport));
-		iov_iter_init(&to, READ, msg->msg_iov, msg->msg_iovlen, size);
-		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &to);
+		save = msg->msg_iter;
+		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
 		if (ret < 0)
 			break;
 
@@ -464,6 +464,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 			rds_inc_put(inc);
 			inc = NULL;
 			rds_stats_inc(s_recv_deliver_raced);
+			msg->msg_iter = save;
 			continue;
 		}
 
diff --git a/net/rds/send.c b/net/rds/send.c
index 4de62ead1c71..40a5629a0a13 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -934,9 +934,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	int queued = 0, allocated_mr = 0;
 	int nonblock = msg->msg_flags & MSG_DONTWAIT;
 	long timeo = sock_sndtimeo(sk, nonblock);
-	struct iov_iter from;
 
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, payload_len);
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
 	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
@@ -984,7 +982,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 			ret = -ENOMEM;
 			goto out;
 		}
-		ret = rds_message_copy_from_user(rm, &from);
+		ret = rds_message_copy_from_user(rm, &msg->msg_iter);
 		if (ret)
 			goto out;
 	}
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 0b4b9a79f5ab..86e0f10aa2c0 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -531,14 +531,12 @@ static int rxrpc_send_data(struct kiocb *iocb,
 	struct rxrpc_skb_priv *sp;
 	unsigned char __user *from;
 	struct sk_buff *skb;
-	struct iovec *iov;
+	const struct iovec *iov;
 	struct sock *sk = &rx->sk;
 	long timeo;
 	bool more;
 	int ret, ioc, segment, copied;
 
-	_enter(",,,{%zu},%zu", msg->msg_iovlen, len);
-
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
 	/* this should be in poll */
@@ -547,8 +545,8 @@ static int rxrpc_send_data(struct kiocb *iocb,
 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 		return -EPIPE;
 
-	iov = msg->msg_iov;
-	ioc = msg->msg_iovlen - 1;
+	iov = msg->msg_iter.iov;
+	ioc = msg->msg_iter.nr_segs - 1;
 	from = iov->iov_base;
 	segment = iov->iov_len;
 	iov++;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0397ac9fd98c..c92f96cda699 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1609,9 +1609,6 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
 	__u16 sinfo_flags = 0;
 	long timeo;
 	int err;
-	struct iov_iter from;
-
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, msg_len);
 
 	err = 0;
 	sp = sctp_sk(sk);
@@ -1950,7 +1947,7 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	/* Break the message into multiple chunks of maximum size. */
-	datamsg = sctp_datamsg_from_user(asoc, sinfo, &from);
+	datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter);
 	if (IS_ERR(datamsg)) {
 		err = PTR_ERR(datamsg);
 		goto out_free;
diff --git a/net/socket.c b/net/socket.c
index f676ac4a3701..8809afccf7fa 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -689,8 +689,7 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 	 * the following is safe, since for compiler definitions of kvec and
 	 * iovec are identical, yielding the same in-core layout and alignment
 	 */
-	msg->msg_iov = (struct iovec *)vec;
-	msg->msg_iovlen = num;
+	iov_iter_init(&msg->msg_iter, WRITE, (struct iovec *)vec, num, size);
 	result = sock_sendmsg(sock, msg, size);
 	set_fs(oldfs);
 	return result;
@@ -853,7 +852,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 	 * the following is safe, since for compiler definitions of kvec and
 	 * iovec are identical, yielding the same in-core layout and alignment
 	 */
-	msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num;
+	iov_iter_init(&msg->msg_iter, READ, (struct iovec *)vec, num, size);
 	result = sock_recvmsg(sock, msg, size, flags);
 	set_fs(oldfs);
 	return result;
@@ -913,8 +912,7 @@ static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
 	msg->msg_namelen = 0;
 	msg->msg_control = NULL;
 	msg->msg_controllen = 0;
-	msg->msg_iov = (struct iovec *)iov;
-	msg->msg_iovlen = nr_segs;
+	iov_iter_init(&msg->msg_iter, READ, iov, nr_segs, size);
 	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 
 	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
@@ -953,8 +951,7 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
 	msg->msg_namelen = 0;
 	msg->msg_control = NULL;
 	msg->msg_controllen = 0;
-	msg->msg_iov = (struct iovec *)iov;
-	msg->msg_iovlen = nr_segs;
+	iov_iter_init(&msg->msg_iter, WRITE, iov, nr_segs, size);
 	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 	if (sock->type == SOCK_SEQPACKET)
 		msg->msg_flags |= MSG_EOR;
@@ -1798,8 +1795,7 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
 	iov.iov_base = buff;
 	iov.iov_len = len;
 	msg.msg_name = NULL;
-	msg.msg_iov = &iov;
-	msg.msg_iovlen = 1;
+	iov_iter_init(&msg.msg_iter, WRITE, &iov, 1, len);
 	msg.msg_control = NULL;
 	msg.msg_controllen = 0;
 	msg.msg_namelen = 0;
@@ -1856,10 +1852,9 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
 
 	msg.msg_control = NULL;
 	msg.msg_controllen = 0;
-	msg.msg_iovlen = 1;
-	msg.msg_iov = &iov;
 	iov.iov_len = size;
 	iov.iov_base = ubuf;
+	iov_iter_init(&msg.msg_iter, READ, &iov, 1, size);
 	/* Save some cycles and don't copy the address if not needed */
 	msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
 	/* We assume all kernel code knows the size of sockaddr_storage */
@@ -1993,13 +1988,14 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg,
 {
 	struct sockaddr __user *uaddr;
 	struct iovec __user *uiov;
+	size_t nr_segs;
 	ssize_t err;
 
 	if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
 	    __get_user(uaddr, &umsg->msg_name) ||
 	    __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
 	    __get_user(uiov, &umsg->msg_iov) ||
-	    __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
+	    __get_user(nr_segs, &umsg->msg_iovlen) ||
 	    __get_user(kmsg->msg_control, &umsg->msg_control) ||
 	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
 	    __get_user(kmsg->msg_flags, &umsg->msg_flags))
@@ -2029,14 +2025,15 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg,
 		kmsg->msg_namelen = 0;
 	}
 
-	if (kmsg->msg_iovlen > UIO_MAXIOV)
+	if (nr_segs > UIO_MAXIOV)
 		return -EMSGSIZE;
 
 	err = rw_copy_check_uvector(save_addr ? READ : WRITE,
-				    uiov, kmsg->msg_iovlen,
+				    uiov, nr_segs,
 				    UIO_FASTIOV, *iov, iov);
 	if (err >= 0)
-		kmsg->msg_iov = *iov;
+		iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE,
+			      *iov, nr_segs, err);
 	return err;
 }
 
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 5b0659791c07..a687b30a699c 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -194,7 +194,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
 		__skb_queue_tail(list, skb);
 		skb_copy_to_linear_data(skb, mhdr, mhsz);
 		pktpos = skb->data + mhsz;
-		if (!dsz || !memcpy_fromiovecend(pktpos, m->msg_iov, offset,
+		if (!dsz || !memcpy_fromiovecend(pktpos, m->msg_iter.iov, offset,
 						 dsz))
 			return dsz;
 		rc = -EFAULT;
@@ -224,7 +224,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
 		if (drem < pktrem)
 			pktrem = drem;
 
-		if (memcpy_fromiovecend(pktpos, m->msg_iov, offset, pktrem)) {
+		if (memcpy_fromiovecend(pktpos, m->msg_iter.iov, offset, pktrem)) {
 			rc = -EFAULT;
 			goto error;
 		}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4450d6226602..8e1b10274b02 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1459,9 +1459,6 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct scm_cookie tmp_scm;
 	int max_level;
 	int data_len = 0;
-	struct iov_iter from;
-
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1519,7 +1516,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	skb_put(skb, len - data_len);
 	skb->data_len = data_len;
 	skb->len = len;
-	err = skb_copy_datagram_from_iter(skb, 0, &from, len);
+	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
 	if (err)
 		goto out_free;
 
@@ -1641,9 +1638,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	bool fds_sent = false;
 	int max_level;
 	int data_len;
-	struct iov_iter from;
-
-	iov_iter_init(&from, WRITE, msg->msg_iov, msg->msg_iovlen, len);
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1700,7 +1694,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		skb_put(skb, size - data_len);
 		skb->data_len = data_len;
 		skb->len = size;
-		err = skb_copy_datagram_from_iter(skb, 0, &from, size);
+		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
 		if (err) {
 			kfree_skb(skb);
 			goto out_err;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 20a0ba3bfff6..02d2e5229240 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1850,7 +1850,8 @@ static ssize_t vmci_transport_stream_enqueue(
 	struct msghdr *msg,
 	size_t len)
 {
-	return vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg->msg_iov, len, 0);
+	/* XXX: stripping const */
+	return vmci_qpair_enquev(vmci_trans(vsk)->qpair, (struct iovec *)msg->msg_iter.iov, len, 0);
 }
 
 static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk)
-- 
cgit v1.2.3-58-ga151


From e5a4b0bb803b39a36478451eae53a880d2663d5b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Nov 2014 18:17:55 -0500
Subject: switch memcpy_to_msg() and skb_copy{,_and_csum}_datagram_msg() to
 primitives

... making both non-draining.  That means that tcp_recvmsg() becomes
non-draining.  And _that_ would break iscsit_do_rx_data() unless we
	a) make sure tcp_recvmsg() is uniformly non-draining (it is)
	b) make sure it copes with arbitrary (including shifted)
iov_iter (it does, all it uses is iov_iter primitives)
	c) make iscsit_do_rx_data() initialize ->msg_iter only once.

Fortunately, (c) is doable with minimal work and we are rid of one
the two places where kernel send/recvmsg users would be unhappy with
non-draining behaviour.

Actually, that makes all but one of ->recvmsg() instances iov_iter-clean.
The exception is skcipher_recvmsg() and it also isn't hard to convert
to primitives (iov_iter_get_pages() is needed there).  That'll wait
a bit - there's some interplay with ->sendmsg() path for that one.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/target/iscsi/iscsi_target_util.c | 12 +++----
 include/linux/skbuff.h                   | 16 +++-------
 net/core/datagram.c                      | 54 +++++++++++---------------------
 3 files changed, 28 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index ce87ce9bdb9c..7c6a95bcb35e 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -1326,21 +1326,19 @@ static int iscsit_do_rx_data(
 	struct iscsi_conn *conn,
 	struct iscsi_data_count *count)
 {
-	int data = count->data_length, rx_loop = 0, total_rx = 0, iov_len;
-	struct kvec *iov_p;
+	int data = count->data_length, rx_loop = 0, total_rx = 0;
 	struct msghdr msg;
 
 	if (!conn || !conn->sock || !conn->conn_ops)
 		return -1;
 
 	memset(&msg, 0, sizeof(struct msghdr));
-
-	iov_p = count->iov;
-	iov_len	= count->iov_count;
+	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC,
+		      count->iov, count->iov_count, data);
 
 	while (total_rx < data) {
-		rx_loop = kernel_recvmsg(conn->sock, &msg, iov_p, iov_len,
-					(data - total_rx), MSG_WAITALL);
+		rx_loop = sock_recvmsg(conn->sock, &msg,
+				      (data - total_rx), MSG_WAITALL);
 		if (rx_loop <= 0) {
 			pr_debug("rx_loop: %d total_rx: %d\n",
 				rx_loop, total_rx);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 52cf1bdac0d8..4902f2df90c8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2651,17 +2651,10 @@ int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
 					struct msghdr *msg, int size)
 {
-	/* XXX: stripping const */
-	return skb_copy_datagram_iovec(from, offset, (struct iovec *)msg->msg_iter.iov, size);
-}
-int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen,
-				     struct iovec *iov);
-static inline int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
-			    struct msghdr *msg)
-{
-	/* XXX: stripping const */
-	return skb_copy_and_csum_datagram_iovec(skb, hlen, (struct iovec *)msg->msg_iter.iov);
+	return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size);
 }
+int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
+				   struct msghdr *msg);
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
@@ -2697,8 +2690,7 @@ static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
 
 static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
 {
-	/* XXX: stripping const */
-	return memcpy_toiovec((struct iovec *)msg->msg_iter.iov, data, len);
+	return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT;
 }
 
 struct skb_checksum_ops {
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b6e303b0f01f..41075ed6bb52 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -615,27 +615,25 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
 static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
-				      u8 __user *to, int len,
+				      struct iov_iter *to, int len,
 				      __wsum *csump)
 {
 	int start = skb_headlen(skb);
 	int i, copy = start - offset;
 	struct sk_buff *frag_iter;
 	int pos = 0;
+	int n;
 
 	/* Copy header. */
 	if (copy > 0) {
-		int err = 0;
 		if (copy > len)
 			copy = len;
-		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
-					       *csump, &err);
-		if (err)
+		n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+		if (n != copy)
 			goto fault;
 		if ((len -= copy) == 0)
 			return 0;
 		offset += copy;
-		to += copy;
 		pos = copy;
 	}
 
@@ -647,26 +645,22 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 
 		end = start + skb_frag_size(frag);
 		if ((copy = end - offset) > 0) {
-			__wsum csum2;
-			int err = 0;
-			u8  *vaddr;
+			__wsum csum2 = 0;
 			struct page *page = skb_frag_page(frag);
+			u8  *vaddr = kmap(page);
 
 			if (copy > len)
 				copy = len;
-			vaddr = kmap(page);
-			csum2 = csum_and_copy_to_user(vaddr +
-							frag->page_offset +
-							offset - start,
-						      to, copy, 0, &err);
+			n = csum_and_copy_to_iter(vaddr + frag->page_offset +
+						  offset - start, copy,
+						  &csum2, to);
 			kunmap(page);
-			if (err)
+			if (n != copy)
 				goto fault;
 			*csump = csum_block_add(*csump, csum2, pos);
 			if (!(len -= copy))
 				return 0;
 			offset += copy;
-			to += copy;
 			pos += copy;
 		}
 		start = end;
@@ -691,7 +685,6 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 			if ((len -= copy) == 0)
 				return 0;
 			offset += copy;
-			to += copy;
 			pos += copy;
 		}
 		start = end;
@@ -744,20 +737,19 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
 EXPORT_SYMBOL(__skb_checksum_complete);
 
 /**
- *	skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
+ *	skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
  *	@skb: skbuff
  *	@hlen: hardware length
- *	@iov: io vector
+ *	@msg: destination
  *
  *	Caller _must_ check that skb will fit to this iovec.
  *
  *	Returns: 0       - success.
  *		 -EINVAL - checksum failure.
- *		 -EFAULT - fault during copy. Beware, in this case iovec
- *			   can be modified!
+ *		 -EFAULT - fault during copy.
  */
-int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
-				     int hlen, struct iovec *iov)
+int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
+				   int hlen, struct msghdr *msg)
 {
 	__wsum csum;
 	int chunk = skb->len - hlen;
@@ -765,28 +757,20 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
 	if (!chunk)
 		return 0;
 
-	/* Skip filled elements.
-	 * Pretty silly, look at memcpy_toiovec, though 8)
-	 */
-	while (!iov->iov_len)
-		iov++;
-
-	if (iov->iov_len < chunk) {
+	if (iov_iter_count(&msg->msg_iter) < chunk) {
 		if (__skb_checksum_complete(skb))
 			goto csum_error;
-		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
+		if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
 			goto fault;
 	} else {
 		csum = csum_partial(skb->data, hlen, skb->csum);
-		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
+		if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
 					       chunk, &csum))
 			goto fault;
 		if (csum_fold(csum))
 			goto csum_error;
 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
 			netdev_rx_csum_fault(skb->dev);
-		iov->iov_len -= chunk;
-		iov->iov_base += chunk;
 	}
 	return 0;
 csum_error:
@@ -794,7 +778,7 @@ csum_error:
 fault:
 	return -EFAULT;
 }
-EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
 
 /**
  * 	datagram_poll - generic datagram poll
-- 
cgit v1.2.3-58-ga151


From d3a9632f09153bc46a8077844e05e179f1c10c3f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Nov 2014 18:29:54 -0500
Subject: skb_copy_datagram_iovec() can die

no callers other than itself.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/skbuff.h |  2 --
 net/core/datagram.c    | 84 --------------------------------------------------
 2 files changed, 86 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4902f2df90c8..ab0bc43c82a4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2644,8 +2644,6 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
 unsigned int datagram_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait);
-int skb_copy_datagram_iovec(const struct sk_buff *from, int offset,
-			    struct iovec *to, int size);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 41075ed6bb52..df493d68330c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -309,90 +309,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 }
 EXPORT_SYMBOL(skb_kill_datagram);
 
-/**
- *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
- *	@skb: buffer to copy
- *	@offset: offset in the buffer to start copying from
- *	@to: io vector to copy to
- *	@len: amount of data to copy from buffer to iovec
- *
- *	Note: the iovec is modified during the copy.
- */
-int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
-			    struct iovec *to, int len)
-{
-	int start = skb_headlen(skb);
-	int i, copy = start - offset;
-	struct sk_buff *frag_iter;
-
-	trace_skb_copy_datagram_iovec(skb, len);
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		if (memcpy_toiovec(to, skb->data + offset, copy))
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		offset += copy;
-	}
-
-	/* Copy paged appendix. Hmm... why does this look so complicated? */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			int err;
-			u8  *vaddr;
-			struct page *page = skb_frag_page(frag);
-
-			if (copy > len)
-				copy = len;
-			vaddr = kmap(page);
-			err = memcpy_toiovec(to, vaddr + frag->page_offset +
-					     offset - start, copy);
-			kunmap(page);
-			if (err)
-				goto fault;
-			if (!(len -= copy))
-				return 0;
-			offset += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			if (copy > len)
-				copy = len;
-			if (skb_copy_datagram_iovec(frag_iter,
-						    offset - start,
-						    to, copy))
-				goto fault;
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_iovec);
-
 /**
  *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
  *	@skb: buffer to copy
-- 
cgit v1.2.3-58-ga151


From 218321e7a0838c2be974539f0a5341b398d4432b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Nov 2014 19:45:05 -0500
Subject: bury memcpy_toiovec()

no users left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  1 -
 lib/iovec.c         | 25 -------------------------
 2 files changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index bd8569a14c4a..a41e252396c0 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -131,7 +131,6 @@ size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 
 int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
-int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len);
 int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
 			int offset, int len);
 int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
diff --git a/lib/iovec.c b/lib/iovec.c
index df3abd1eaa4a..2d99cb4a5006 100644
--- a/lib/iovec.c
+++ b/lib/iovec.c
@@ -27,31 +27,6 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
 }
 EXPORT_SYMBOL(memcpy_fromiovec);
 
-/*
- *	Copy kernel to iovec. Returns -EFAULT on error.
- *
- *	Note: this modifies the original iovec.
- */
-
-int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
-{
-	while (len > 0) {
-		if (iov->iov_len) {
-			int copy = min_t(unsigned int, iov->iov_len, len);
-			if (copy_to_user(iov->iov_base, kdata, copy))
-				return -EFAULT;
-			kdata += copy;
-			len -= copy;
-			iov->iov_len -= copy;
-			iov->iov_base += copy;
-		}
-		iov++;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(memcpy_toiovec);
-
 /*
  *	Copy kernel to iovec. Returns -EFAULT on error.
  */
-- 
cgit v1.2.3-58-ga151


From 605ad7f184b60cfaacbc038aa6c55ee68dee3c89 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 7 Dec 2014 12:22:18 -0800
Subject: tcp: refine TSO autosizing

Commit 95bd09eb2750 ("tcp: TSO packets automatic sizing") tried to
control TSO size, but did this at the wrong place (sendmsg() time)

At sendmsg() time, we might have a pessimistic view of flow rate,
and we end up building very small skbs (with 2 MSS per skb).

This is bad because :

 - It sends small TSO packets even in Slow Start where rate quickly
   increases.
 - It tends to make socket write queue very big, increasing tcp_ack()
   processing time, but also increasing memory needs, not necessarily
   accounted for, as fast clones overhead is currently ignored.
 - Lower GRO efficiency and more ACK packets.

Servers with a lot of small lived connections suffer from this.

Lets instead fill skbs as much as possible (64KB of payload), but split
them at xmit time, when we have a precise idea of the flow rate.
skb split is actually quite efficient.

Patch looks bigger than necessary, because TCP Small Queue decision now
has to take place after the eventual split.

As Neal suggested, introduce a new tcp_tso_autosize() helper, so that
tcp_tso_should_defer() can be synchronized on same goal.

Rename tp->xmit_size_goal_segs to tp->gso_segs, as this variable
contains number of mss that we can put in GSO packet, and is not
related to the autosizing goal anymore.

Tested:

40 ms rtt link

nstat >/dev/null
netperf -H remote -l -2000000 -- -s 1000000
nstat | egrep "IpInReceives|IpOutRequests|TcpOutSegs|IpExtOutOctets"

Before patch :

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/s

 87380 2000000 2000000    0.36         44.22
IpInReceives                    600                0.0
IpOutRequests                   599                0.0
TcpOutSegs                      1397               0.0
IpExtOutOctets                  2033249            0.0

After patch :

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380 2000000 2000000    0.36       44.27
IpInReceives                    221                0.0
IpOutRequests                   232                0.0
TcpOutSegs                      1397               0.0
IpExtOutOctets                  2013953            0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  2 +-
 net/ipv4/tcp.c        | 60 ++++++++++++++++++---------------------------------
 net/ipv4/tcp_output.c | 59 ++++++++++++++++++++++++++++++++++----------------
 3 files changed, 63 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f566b8567892..3fa0a9669a3a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -130,7 +130,7 @@ struct tcp_sock {
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock	inet_conn;
 	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
-	u16	xmit_size_goal_segs; /* Goal for segmenting output packets */
+	u16	gso_segs;	/* Max number of segs per GSO packet	*/
 
 /*
  *	Header prediction flags
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dc13a3657e8e..427aee33ffc0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 				       int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 xmit_size_goal, old_size_goal;
-
-	xmit_size_goal = mss_now;
-
-	if (large_allowed && sk_can_gso(sk)) {
-		u32 gso_size, hlen;
-
-		/* Maybe we should/could use sk->sk_prot->max_header here ? */
-		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
-		       inet_csk(sk)->icsk_ext_hdr_len +
-		       tp->tcp_header_len;
-
-		/* Goal is to send at least one packet per ms,
-		 * not one big TSO packet every 100 ms.
-		 * This preserves ACK clocking and is consistent
-		 * with tcp_tso_should_defer() heuristic.
-		 */
-		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
-		gso_size = max_t(u32, gso_size,
-				 sysctl_tcp_min_tso_segs * mss_now);
-
-		xmit_size_goal = min_t(u32, gso_size,
-				       sk->sk_gso_max_size - 1 - hlen);
-
-		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
-
-		/* We try hard to avoid divides here */
-		old_size_goal = tp->xmit_size_goal_segs * mss_now;
-
-		if (likely(old_size_goal <= xmit_size_goal &&
-			   old_size_goal + mss_now > xmit_size_goal)) {
-			xmit_size_goal = old_size_goal;
-		} else {
-			tp->xmit_size_goal_segs =
-				min_t(u16, xmit_size_goal / mss_now,
-				      sk->sk_gso_max_segs);
-			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
-		}
+	u32 new_size_goal, size_goal, hlen;
+
+	if (!large_allowed || !sk_can_gso(sk))
+		return mss_now;
+
+	/* Maybe we should/could use sk->sk_prot->max_header here ? */
+	hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+	       inet_csk(sk)->icsk_ext_hdr_len +
+	       tp->tcp_header_len;
+
+	new_size_goal = sk->sk_gso_max_size - 1 - hlen;
+	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+
+	/* We try hard to avoid divides here */
+	size_goal = tp->gso_segs * mss_now;
+	if (unlikely(new_size_goal < size_goal ||
+		     new_size_goal >= size_goal + mss_now)) {
+		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+				     sk->sk_gso_max_segs);
+		size_goal = tp->gso_segs * mss_now;
 	}
 
-	return max(xmit_size_goal, mss_now);
+	return max(size_goal, mss_now);
 }
 
 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f5bd4bd3f7e6..f37ecf53ee8a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1524,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
 		((nonagle & TCP_NAGLE_CORK) ||
 		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+	u32 bytes, segs;
+
+	bytes = min(sk->sk_pacing_rate >> 10,
+		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+	/* Goal is to send at least one packet per ms,
+	 * not one big TSO packet every 100 ms.
+	 * This preserves ACK clocking and is consistent
+	 * with tcp_tso_should_defer() heuristic.
+	 */
+	segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+	return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
 /* Returns the portion of skb which can be sent right away */
 static unsigned int tcp_mss_split_point(const struct sock *sk,
 					const struct sk_buff *skb,
@@ -1731,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  * This algorithm is from John Heffner.
  */
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-				 bool *is_cwnd_limited)
+				 bool *is_cwnd_limited, u32 max_segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1761,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   tp->xmit_size_goal_segs * tp->mss_cache))
+	if (limit >= max_segs * tp->mss_cache)
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
@@ -1959,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	int cwnd_quota;
 	int result;
 	bool is_cwnd_limited = false;
+	u32 max_segs;
 
 	sent_pkts = 0;
 
@@ -1972,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		}
 	}
 
+	max_segs = tcp_tso_autosize(sk, mss_now);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
@@ -2004,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		} else {
 			if (!push_one &&
-			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+						 max_segs))
 				break;
 		}
 
+		limit = mss_now;
+		if (tso_segs > 1 && !tcp_urg_mode(tp))
+			limit = tcp_mss_split_point(sk, skb, mss_now,
+						    min_t(unsigned int,
+							  cwnd_quota,
+							  max_segs),
+						    nonagle);
+
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+			break;
+
 		/* TCP Small Queues :
 		 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 		 * This allows for :
@@ -2018,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		 * of queued bytes to ensure line rate.
 		 * One example is wifi aggregation (802.11 AMPDU)
 		 */
-		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
-			      sk->sk_pacing_rate >> 10);
+		limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+		limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 
 		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -2032,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 
-		limit = mss_now;
-		if (tso_segs > 1 && !tcp_urg_mode(tp))
-			limit = tcp_mss_split_point(sk, skb, mss_now,
-						    min_t(unsigned int,
-							  cwnd_quota,
-							  sk->sk_gso_max_segs),
-						    nonagle);
-
-		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
-			break;
-
 		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
 			break;
 
-- 
cgit v1.2.3-58-ga151


From ffde7328a36d16e626bae8468571858d71cd010b Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Tue, 9 Dec 2014 19:40:42 -0800
Subject: net: Split netdev_alloc_frag into __alloc_page_frag and add
 __napi_alloc_frag

This patch splits the netdev_alloc_frag function up so that it can be used
on one of two page frag pools instead of being fixed on the
netdev_alloc_cache.  By doing this we can add a NAPI specific function
__napi_alloc_frag that accesses a pool that is only used from softirq
context.  The advantage to this is that we do not need to call
local_irq_save/restore which can be a significant savings.

I also took the opportunity to refactor the core bits that were placed in
__alloc_page_frag.  First I updated the allocation to do either a 32K
allocation or an order 0 page.  This is based on the changes in commmit
d9b2938aa where it was found that latencies could be reduced in case of
failures.  Then I also rewrote the logic to work from the end of the page to
the start.  By doing this the size value doesn't have to be used unless we
have run out of space for page fragments.  Finally I cleaned up the atomic
bits so that we just do an atomic_sub_and_test and if that returns true then
we set the page->_count via an atomic_set.  This way we can remove the extra
conditional for the atomic_read since it would have led to an atomic_inc in
the case of success anyway.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |   2 +
 net/core/skbuff.c      | 117 ++++++++++++++++++++++++++++++++-----------------
 2 files changed, 79 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ab0bc43c82a4..736cc99f3f6c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2164,6 +2164,8 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 	return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
 }
 
+void *napi_alloc_frag(unsigned int fragsz);
+
 /**
  * __dev_alloc_pages - allocate page for network Rx
  * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7a338fb55cc4..56ed17cd2151 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -336,59 +336,85 @@ struct netdev_alloc_cache {
 	unsigned int		pagecnt_bias;
 };
 static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
 
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
+				       gfp_t gfp_mask)
 {
-	struct netdev_alloc_cache *nc;
-	void *data = NULL;
-	int order;
-	unsigned long flags;
+	const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
+	struct page *page = NULL;
+	gfp_t gfp = gfp_mask;
+
+	if (order) {
+		gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+		page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+		nc->frag.size = PAGE_SIZE << (page ? order : 0);
+	}
 
-	local_irq_save(flags);
-	nc = this_cpu_ptr(&netdev_alloc_cache);
-	if (unlikely(!nc->frag.page)) {
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+	nc->frag.page = page;
+
+	return page;
+}
+
+static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
+			       unsigned int fragsz, gfp_t gfp_mask)
+{
+	struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
+	struct page *page = nc->frag.page;
+	unsigned int size;
+	int offset;
+
+	if (unlikely(!page)) {
 refill:
-		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
-			gfp_t gfp = gfp_mask;
+		page = __page_frag_refill(nc, gfp_mask);
+		if (!page)
+			return NULL;
+
+		/* if size can vary use frag.size else just use PAGE_SIZE */
+		size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
 
-			if (order)
-				gfp |= __GFP_COMP | __GFP_NOWARN;
-			nc->frag.page = alloc_pages(gfp, order);
-			if (likely(nc->frag.page))
-				break;
-			if (--order < 0)
-				goto end;
-		}
-		nc->frag.size = PAGE_SIZE << order;
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
-		atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1,
-			   &nc->frag.page->_count);
-		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
-		nc->frag.offset = 0;
+		atomic_add(size - 1, &page->_count);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = size;
+		nc->frag.offset = size;
 	}
 
-	if (nc->frag.offset + fragsz > nc->frag.size) {
-		if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) {
-			if (!atomic_sub_and_test(nc->pagecnt_bias,
-						 &nc->frag.page->_count))
-				goto refill;
-			/* OK, page count is 0, we can safely set it */
-			atomic_set(&nc->frag.page->_count,
-				   NETDEV_PAGECNT_MAX_BIAS);
-		} else {
-			atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias,
-				   &nc->frag.page->_count);
-		}
-		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
-		nc->frag.offset = 0;
+	offset = nc->frag.offset - fragsz;
+	if (unlikely(offset < 0)) {
+		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+			goto refill;
+
+		/* if size can vary use frag.size else just use PAGE_SIZE */
+		size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
+
+		/* OK, page count is 0, we can safely set it */
+		atomic_set(&page->_count, size);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = size;
+		offset = size - fragsz;
 	}
 
-	data = page_address(nc->frag.page) + nc->frag.offset;
-	nc->frag.offset += fragsz;
 	nc->pagecnt_bias--;
-end:
+	nc->frag.offset = offset;
+
+	return page_address(page) + offset;
+}
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+	unsigned long flags;
+	void *data;
+
+	local_irq_save(flags);
+	data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
 	local_irq_restore(flags);
 	return data;
 }
@@ -406,6 +432,17 @@ void *netdev_alloc_frag(unsigned int fragsz)
 }
 EXPORT_SYMBOL(netdev_alloc_frag);
 
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+	return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+	return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(napi_alloc_frag);
+
 /**
  *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
  *	@dev: network device to receive on
-- 
cgit v1.2.3-58-ga151


From fd11a83dd3630ec6a60f8a702446532c5c7e1991 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Tue, 9 Dec 2014 19:40:49 -0800
Subject: net: Pull out core bits of __netdev_alloc_skb and add
 __napi_alloc_skb

This change pulls the core functionality out of __netdev_alloc_skb and
places them in a new function named __alloc_rx_skb.  The reason for doing
this is to make these bits accessible to a new function __napi_alloc_skb.
In addition __alloc_rx_skb now has a new flags value that is used to
determine which page frag pool to allocate from.  If the SKB_ALLOC_NAPI
flag is set then the NAPI pool is used.  The advantage of this is that we
do not have to use local_irq_save/restore when accessing the NAPI pool from
NAPI context.

In my test setup I saw at least 11ns of savings using the napi_alloc_skb
function versus the netdev_alloc_skb function, most of this being due to
the fact that we didn't have to call local_irq_save/restore.

The main use case for napi_alloc_skb would be for things such as copybreak
or page fragment based receive paths where an skb is allocated after the
data has been received instead of before.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  9 ++++++
 net/core/dev.c         |  2 +-
 net/core/skbuff.c      | 74 +++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 77 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 736cc99f3f6c..85ab7d72b54c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -151,6 +151,7 @@ struct net_device;
 struct scatterlist;
 struct pipe_inode_info;
 struct iov_iter;
+struct napi_struct;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -673,6 +674,7 @@ struct sk_buff {
 
 #define SKB_ALLOC_FCLONE	0x01
 #define SKB_ALLOC_RX		0x02
+#define SKB_ALLOC_NAPI		0x04
 
 /* Returns true if the skb was allocated from PFMEMALLOC reserves */
 static inline bool skb_pfmemalloc(const struct sk_buff *skb)
@@ -2165,6 +2167,13 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 }
 
 void *napi_alloc_frag(unsigned int fragsz);
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
+				 unsigned int length, gfp_t gfp_mask);
+static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
+					     unsigned int length)
+{
+	return __napi_alloc_skb(napi, length, GFP_ATOMIC);
+}
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index 3f191da383f6..80f798da3d9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4172,7 +4172,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 	struct sk_buff *skb = napi->skb;
 
 	if (!skb) {
-		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 		napi->skb = skb;
 	}
 	return skb;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 56ed17cd2151..ae13ef6b3ea7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -444,10 +444,13 @@ void *napi_alloc_frag(unsigned int fragsz)
 EXPORT_SYMBOL(napi_alloc_frag);
 
 /**
- *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
- *	@dev: network device to receive on
+ *	__alloc_rx_skb - allocate an skbuff for rx
  *	@length: length to allocate
  *	@gfp_mask: get_free_pages mask, passed to alloc_skb
+ *	@flags:	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ *		allocations in case we have to fallback to __alloc_skb()
+ *		If SKB_ALLOC_NAPI is set, page fragment will be allocated
+ *		from napi_cache instead of netdev_cache.
  *
  *	Allocate a new &sk_buff and assign it a usage count of one. The
  *	buffer has unspecified headroom built in. Users should allocate
@@ -456,11 +459,11 @@ EXPORT_SYMBOL(napi_alloc_frag);
  *
  *	%NULL is returned if there is no free memory.
  */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
-				   unsigned int length, gfp_t gfp_mask)
+static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
+				      int flags)
 {
 	struct sk_buff *skb = NULL;
-	unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
+	unsigned int fragsz = SKB_DATA_ALIGN(length) +
 			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
@@ -469,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 		if (sk_memalloc_socks())
 			gfp_mask |= __GFP_MEMALLOC;
 
-		data = __netdev_alloc_frag(fragsz, gfp_mask);
+		data = (flags & SKB_ALLOC_NAPI) ?
+			__napi_alloc_frag(fragsz, gfp_mask) :
+			__netdev_alloc_frag(fragsz, gfp_mask);
 
 		if (likely(data)) {
 			skb = build_skb(data, fragsz);
@@ -477,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 				put_page(virt_to_head_page(data));
 		}
 	} else {
-		skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+		skb = __alloc_skb(length, gfp_mask,
 				  SKB_ALLOC_RX, NUMA_NO_NODE);
 	}
+	return skb;
+}
+
+/**
+ *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *	@dev: network device to receive on
+ *	@length: length to allocate
+ *	@gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ *	Allocate a new &sk_buff and assign it a usage count of one. The
+ *	buffer has NET_SKB_PAD headroom built in. Users should allocate
+ *	the headroom they think they need without accounting for the
+ *	built in space. The built in space is used for optimisations.
+ *
+ *	%NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+				   unsigned int length, gfp_t gfp_mask)
+{
+	struct sk_buff *skb;
+
+	length += NET_SKB_PAD;
+	skb = __alloc_rx_skb(length, gfp_mask, 0);
+
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
 	}
+
 	return skb;
 }
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
+/**
+ *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ *	@napi: napi instance this buffer was allocated for
+ *	@length: length to allocate
+ *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
+ *
+ *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
+ *	attempt to allocate the head from a special reserved region used
+ *	only for NAPI Rx allocation.  By doing this we can save several
+ *	CPU cycles by avoiding having to disable and re-enable IRQs.
+ *
+ *	%NULL is returned if there is no free memory.
+ */
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
+				 unsigned int length, gfp_t gfp_mask)
+{
+	struct sk_buff *skb;
+
+	length += NET_SKB_PAD + NET_IP_ALIGN;
+	skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+
+	if (likely(skb)) {
+		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+		skb->dev = napi->dev;
+	}
+
+	return skb;
+}
+EXPORT_SYMBOL(__napi_alloc_skb);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		     int size, unsigned int truesize)
 {
-- 
cgit v1.2.3-58-ga151


From 0cb6c969ed9de43687abdfc63714b6fe4385d2fc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 10 Dec 2014 16:33:12 +0100
Subject: net, lib: kill arch_fast_hash library bits

As there are now no remaining users of arch_fast_hash(), lets kill
it entirely.

This basically reverts commit 71ae8aac3e19 ("lib: introduce arch
optimized hash library") and follow-up work, that is f.e., commit
237217546d44 ("lib: hash: follow-up fixups for arch hash"),
commit e3fec2f74f7f ("lib: Add missing arch generic-y entries for
asm-generic/hash.h") and last but not least commit 6a02652df511
("perf tools: Fix include for non x86 architectures").

Cc: Francesco Fusco <fusco@ntop.org>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/Kbuild      |  1 -
 arch/arc/include/asm/Kbuild        |  1 -
 arch/arm/include/asm/Kbuild        |  1 -
 arch/arm64/include/asm/Kbuild      |  1 -
 arch/avr32/include/asm/Kbuild      |  1 -
 arch/blackfin/include/asm/Kbuild   |  1 -
 arch/c6x/include/asm/Kbuild        |  1 -
 arch/cris/include/asm/Kbuild       |  1 -
 arch/frv/include/asm/Kbuild        |  1 -
 arch/hexagon/include/asm/Kbuild    |  1 -
 arch/ia64/include/asm/Kbuild       |  1 -
 arch/m32r/include/asm/Kbuild       |  1 -
 arch/m68k/include/asm/Kbuild       |  1 -
 arch/metag/include/asm/Kbuild      |  1 -
 arch/microblaze/include/asm/Kbuild |  1 -
 arch/mips/include/asm/Kbuild       |  1 -
 arch/mn10300/include/asm/Kbuild    |  1 -
 arch/openrisc/include/asm/Kbuild   |  1 -
 arch/parisc/include/asm/Kbuild     |  1 -
 arch/powerpc/include/asm/Kbuild    |  1 -
 arch/s390/include/asm/Kbuild       |  1 -
 arch/score/include/asm/Kbuild      |  1 -
 arch/sh/include/asm/Kbuild         |  1 -
 arch/sparc/include/asm/Kbuild      |  1 -
 arch/tile/include/asm/Kbuild       |  1 -
 arch/um/include/asm/Kbuild         |  1 -
 arch/unicore32/include/asm/Kbuild  |  1 -
 arch/x86/include/asm/hash.h        |  7 ---
 arch/x86/lib/Makefile              |  2 +-
 arch/x86/lib/hash.c                | 92 --------------------------------------
 arch/xtensa/include/asm/Kbuild     |  1 -
 include/asm-generic/hash.h         |  9 ----
 include/linux/hash.h               | 35 ---------------
 lib/Makefile                       |  2 +-
 lib/hash.c                         | 39 ----------------
 tools/perf/util/include/asm/hash.h |  6 ---
 36 files changed, 2 insertions(+), 218 deletions(-)
 delete mode 100644 arch/x86/include/asm/hash.h
 delete mode 100644 arch/x86/lib/hash.c
 delete mode 100644 include/asm-generic/hash.h
 delete mode 100644 lib/hash.c
 delete mode 100644 tools/perf/util/include/asm/hash.h

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 25b49725df07..76aeb8fa551a 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -3,7 +3,6 @@
 generic-y += clkdev.h
 generic-y += cputime.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index b8fffc1a2ac2..be0c39e76f7c 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -12,7 +12,6 @@ generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
 generic-y += hardirq.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 70cd84eb7fda..fe74c0d1e485 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -7,7 +7,6 @@ generic-y += current.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
 generic-y += irq_regs.h
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index dc770bd4f5a5..6b61091c7f4c 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -14,7 +14,6 @@ generic-y += early_ioremap.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += ftrace.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild
index 2a71b1cb9848..528d70d47a54 100644
--- a/arch/avr32/include/asm/Kbuild
+++ b/arch/avr32/include/asm/Kbuild
@@ -7,7 +7,6 @@ generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += futex.h
-generic-y += hash.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += local.h
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index 46ed6bb9c679..4bd3c3cfc9ab 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -10,7 +10,6 @@ generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += fb.h
 generic-y += futex.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index e77e0c1dbe75..2de73391b81e 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -15,7 +15,6 @@ generic-y += exec.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += futex.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += io.h
 generic-y += ioctl.h
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index 2ca489eaadd3..d5f124832fd1 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -7,7 +7,6 @@ generic-y += barrier.h
 generic-y += clkdev.h
 generic-y += cputime.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += kvm_para.h
 generic-y += linkage.h
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index 3caf05cabfc5..e3f81b53578e 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -2,7 +2,6 @@
 generic-y += clkdev.h
 generic-y += cputime.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 5f234a5a2320..c7a99f860b40 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -16,7 +16,6 @@ generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
 generic-y += hardirq.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 747320be9d0e..9b41b4bcc073 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -1,7 +1,6 @@
 
 generic-y += clkdev.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index 3796801d6e0c..2edc793372fc 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -2,7 +2,6 @@
 generic-y += clkdev.h
 generic-y += cputime.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += module.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index dbaf9f3065e8..9b6c691874bd 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -6,7 +6,6 @@ generic-y += device.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild
index 7b8111c8f937..0bf5d525b945 100644
--- a/arch/metag/include/asm/Kbuild
+++ b/arch/metag/include/asm/Kbuild
@@ -13,7 +13,6 @@ generic-y += fb.h
 generic-y += fcntl.h
 generic-y += futex.h
 generic-y += hardirq.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 448143b8cabd..ab564a6db5c3 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -4,7 +4,6 @@ generic-y += clkdev.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 72e1cf1cab00..200efeac4181 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -3,7 +3,6 @@ generic-y += cputime.h
 generic-y += current.h
 generic-y += dma-contiguous.h
 generic-y += emergency-restart.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index 54a062cb9f2c..f892d9de47d9 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -3,7 +3,6 @@ generic-y += barrier.h
 generic-y += clkdev.h
 generic-y += cputime.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 89b61d7dc790..91f1f360a7c4 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -25,7 +25,6 @@ generic-y += fcntl.h
 generic-y += ftrace.h
 generic-y += futex.h
 generic-y += hardirq.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index ffb024b8423f..8686237a3c3c 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -7,7 +7,6 @@ generic-y += device.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 31e8f59aff38..382b28e364dc 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -1,6 +1,5 @@
 
 generic-y += clkdev.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 773f86676588..c631f98fd524 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -1,7 +1,6 @@
 
 
 generic-y += clkdev.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index 46461c19f284..83ed116d414c 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -5,7 +5,6 @@ header-y +=
 generic-y += barrier.h
 generic-y += clkdev.h
 generic-y += cputime.h
-generic-y += hash.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 5a6c9acff0d2..654ebb6bd5d8 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -8,7 +8,6 @@ generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
 generic-y += fcntl.h
-generic-y += hash.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
 generic-y += irq_regs.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index f5f94ce1692c..94f36e7086a7 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -6,7 +6,6 @@ generic-y += cputime.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += exec.h
-generic-y += hash.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += linkage.h
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index e6462b8a6284..b4c488b65745 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -11,7 +11,6 @@ generic-y += errno.h
 generic-y += exec.h
 generic-y += fb.h
 generic-y += fcntl.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 244b12c8cb39..9176fa11d49b 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -10,7 +10,6 @@ generic-y += exec.h
 generic-y += ftrace.h
 generic-y += futex.h
 generic-y += hardirq.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += io.h
 generic-y += irq_regs.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 5a2bb53faa42..3e0c19d0f4c5 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -16,7 +16,6 @@ generic-y += fcntl.h
 generic-y += ftrace.h
 generic-y += futex.h
 generic-y += hardirq.h
-generic-y += hash.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
deleted file mode 100644
index e8c58f88b1d4..000000000000
--- a/arch/x86/include/asm/hash.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ASM_X86_HASH_H
-#define _ASM_X86_HASH_H
-
-struct fast_hash_ops;
-extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
-
-#endif /* _ASM_X86_HASH_H */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index db92793b7e23..1530afb07c85 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -23,7 +23,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
deleted file mode 100644
index ff4fa51a5b1f..000000000000
--- a/arch/x86/lib/hash.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Some portions derived from code covered by the following notice:
- *
- * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in
- *     the documentation and/or other materials provided with the
- *     distribution.
- *   * Neither the name of Intel Corporation nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/hash.h>
-#include <linux/init.h>
-
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include <asm/hash.h>
-
-static inline u32 crc32_u32(u32 crc, u32 val)
-{
-#ifdef CONFIG_AS_CRC32
-	asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
-#else
-	asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
-#endif
-	return crc;
-}
-
-static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
-{
-	const u32 *p32 = (const u32 *) data;
-	u32 i, tmp = 0;
-
-	for (i = 0; i < len / 4; i++)
-		seed = crc32_u32(seed, *p32++);
-
-	switch (len & 3) {
-	case 3:
-		tmp |= *((const u8 *) p32 + 2) << 16;
-		/* fallthrough */
-	case 2:
-		tmp |= *((const u8 *) p32 + 1) << 8;
-		/* fallthrough */
-	case 1:
-		tmp |= *((const u8 *) p32);
-		seed = crc32_u32(seed, tmp);
-		break;
-	}
-
-	return seed;
-}
-
-static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
-{
-	const u32 *p32 = (const u32 *) data;
-	u32 i;
-
-	for (i = 0; i < len; i++)
-		seed = crc32_u32(seed, *p32++);
-
-	return seed;
-}
-
-void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
-{
-	if (cpu_has_xmm4_2) {
-		ops->hash  = intel_crc4_2_hash;
-		ops->hash2 = intel_crc4_2_hash2;
-	}
-}
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 105d38922c44..86a9ab2e2ca9 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -9,7 +9,6 @@ generic-y += errno.h
 generic-y += exec.h
 generic-y += fcntl.h
 generic-y += hardirq.h
-generic-y += hash.h
 generic-y += ioctl.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
diff --git a/include/asm-generic/hash.h b/include/asm-generic/hash.h
deleted file mode 100644
index b6312843dbd9..000000000000
--- a/include/asm-generic/hash.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ASM_GENERIC_HASH_H
-#define __ASM_GENERIC_HASH_H
-
-struct fast_hash_ops;
-static inline void setup_arch_fast_hash(struct fast_hash_ops *ops)
-{
-}
-
-#endif /* __ASM_GENERIC_HASH_H */
diff --git a/include/linux/hash.h b/include/linux/hash.h
index d0494c399392..1afde47e1528 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -15,7 +15,6 @@
  */
 
 #include <asm/types.h>
-#include <asm/hash.h>
 #include <linux/compiler.h>
 
 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
@@ -84,38 +83,4 @@ static inline u32 hash32_ptr(const void *ptr)
 	return (u32)val;
 }
 
-struct fast_hash_ops {
-	u32 (*hash)(const void *data, u32 len, u32 seed);
-	u32 (*hash2)(const u32 *data, u32 len, u32 seed);
-};
-
-/**
- *	arch_fast_hash - Caclulates a hash over a given buffer that can have
- *			 arbitrary size. This function will eventually use an
- *			 architecture-optimized hashing implementation if
- *			 available, and trades off distribution for speed.
- *
- *	@data: buffer to hash
- *	@len: length of buffer in bytes
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-extern u32 arch_fast_hash(const void *data, u32 len, u32 seed);
-
-/**
- *	arch_fast_hash2 - Caclulates a hash over a given buffer that has a
- *			  size that is of a multiple of 32bit words. This
- *			  function will eventually use an architecture-
- *			  optimized hashing implementation if available,
- *			  and trades off distribution for speed.
- *
- *	@data: buffer to hash (must be 32bit padded)
- *	@len: number of 32bit words
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-extern u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed);
-
 #endif /* _LINUX_HASH_H */
diff --git a/lib/Makefile b/lib/Makefile
index 0211d2bd5e17..4b9baa45a4d9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o percpu_ida.o hash.o rhashtable.o reciprocal_div.o
+	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
diff --git a/lib/hash.c b/lib/hash.c
deleted file mode 100644
index fea973f4bd57..000000000000
--- a/lib/hash.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* General purpose hashing library
- *
- * That's a start of a kernel hashing library, which can be extended
- * with further algorithms in future. arch_fast_hash{2,}() will
- * eventually resolve to an architecture optimized implementation.
- *
- * Copyright 2013 Francesco Fusco <ffusco@redhat.com>
- * Copyright 2013 Daniel Borkmann <dborkman@redhat.com>
- * Copyright 2013 Thomas Graf <tgraf@redhat.com>
- * Licensed under the GNU General Public License, version 2.0 (GPLv2)
- */
-
-#include <linux/jhash.h>
-#include <linux/hash.h>
-#include <linux/cache.h>
-
-static struct fast_hash_ops arch_hash_ops __read_mostly = {
-	.hash  = jhash,
-	.hash2 = jhash2,
-};
-
-u32 arch_fast_hash(const void *data, u32 len, u32 seed)
-{
-	return arch_hash_ops.hash(data, len, seed);
-}
-EXPORT_SYMBOL_GPL(arch_fast_hash);
-
-u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
-{
-	return arch_hash_ops.hash2(data, len, seed);
-}
-EXPORT_SYMBOL_GPL(arch_fast_hash2);
-
-static int __init hashlib_init(void)
-{
-	setup_arch_fast_hash(&arch_hash_ops);
-	return 0;
-}
-early_initcall(hashlib_init);
diff --git a/tools/perf/util/include/asm/hash.h b/tools/perf/util/include/asm/hash.h
deleted file mode 100644
index d82b170bb216..000000000000
--- a/tools/perf/util/include/asm/hash.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_GENERIC_HASH_H
-#define __ASM_GENERIC_HASH_H
-
-/* Stub */
-
-#endif /* __ASM_GENERIC_HASH_H */
-- 
cgit v1.2.3-58-ga151


From f95b414edb18de59940dcebbefb49cf25c6d505c Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Thu, 11 Dec 2014 11:22:04 +0800
Subject: net: introduce helper macro for_each_cmsghdr

Introduce helper macro for_each_cmsghdr as a wrapper of the enumerating
cmsghdr from msghdr, just cleanup.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/af_alg.c        | 2 +-
 include/linux/socket.h | 4 ++++
 net/core/scm.c         | 3 +--
 net/dccp/proto.c       | 5 ++---
 net/ipv4/ip_sockglue.c | 2 +-
 net/ipv6/datagram.c    | 2 +-
 net/iucv/af_iucv.c     | 4 +---
 net/rds/send.c         | 4 ++--
 net/rxrpc/ar-output.c  | 2 +-
 net/sctp/socket.c      | 3 +--
 10 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 6a3ad8011585..bc21f520d489 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -399,7 +399,7 @@ int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con)
 {
 	struct cmsghdr *cmsg;
 
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 		if (cmsg->cmsg_level != SOL_ALG)
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 048d6d6eed6d..6e49a14365dc 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -103,6 +103,10 @@ struct cmsghdr {
 			     (cmsg)->cmsg_len <= (unsigned long) \
 			     ((mhdr)->msg_controllen - \
 			      ((char *)(cmsg) - (char *)(mhdr)->msg_control)))
+#define for_each_cmsghdr(cmsg, msg) \
+	for (cmsg = CMSG_FIRSTHDR(msg); \
+	     cmsg; \
+	     cmsg = CMSG_NXTHDR(msg, cmsg))
 
 /*
  *	Get the next cmsg header
diff --git a/net/core/scm.c b/net/core/scm.c
index b442e7e25e60..3b6899b7d810 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -129,8 +129,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 	struct cmsghdr *cmsg;
 	int err;
 
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
-	{
+	for_each_cmsghdr(cmsg, msg) {
 		err = -EINVAL;
 
 		/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 19f038739087..e171b780b499 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -703,7 +703,7 @@ EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
 
 static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
 {
-	struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+	struct cmsghdr *cmsg;
 
 	/*
 	 * Assign an (opaque) qpolicy priority value to skb->priority.
@@ -717,8 +717,7 @@ static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
 	 */
 	skb->priority = 0;
 
-	for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
-
+	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 640f26c6a9fe..8a89c738b7a3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -192,7 +192,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
 	int err, val;
 	struct cmsghdr *cmsg;
 
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2464a00e36ab..100c589a2a6c 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -657,7 +657,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 	int len;
 	int err = 0;
 
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+	for_each_cmsghdr(cmsg, msg) {
 		int addr_type;
 
 		if (!CMSG_OK(msg, cmsg)) {
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 1cd3f8107239..2e9953b2db84 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1070,9 +1070,7 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	txmsg.class = 0;
 
 	/* iterate over control messages */
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg;
-		cmsg = CMSG_NXTHDR(msg, cmsg)) {
-
+	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg)) {
 			err = -EINVAL;
 			goto out;
diff --git a/net/rds/send.c b/net/rds/send.c
index 40a5629a0a13..42f65d4305c8 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -826,7 +826,7 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 	int cmsg_groups = 0;
 	int retval;
 
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 
@@ -878,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 	struct cmsghdr *cmsg;
 	int ret = 0;
 
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index 86e0f10aa2c0..e1a9373e5979 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -45,7 +45,7 @@ static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg,
 	if (msg->msg_controllen == 0)
 		return -EINVAL;
 
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c92f96cda699..2625eccb77d5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6592,8 +6592,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
 	struct cmsghdr *cmsg;
 	struct msghdr *my_msg = (struct msghdr *)msg;
 
-	for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
-	     cmsg = CMSG_NXTHDR(my_msg, cmsg)) {
+	for_each_cmsghdr(cmsg, my_msg) {
 		if (!CMSG_OK(my_msg, cmsg))
 			return -EINVAL;
 
-- 
cgit v1.2.3-58-ga151


From 3dca0f42c7baaa4e01699629da13d6556f001ebe Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 11 Dec 2014 10:57:53 +0200
Subject: net/mlx4_core: Use tasklet for user-space CQ completion events

Previously, we've fired all our completion callbacks straight from our ISR.

Some of those callbacks were lightweight (for example, mlx4_en's and
IPoIB napi callbacks), but some of them did more work (for example,
the user-space RDMA stack uverbs' completion handler). Besides that,
doing more than the minimal work in ISR is generally considered wrong,
it could even lead to a hard lockup of the system. Since when a lot
of completion events are generated by the hardware, the loop over those
events could be so long, that we'll get into a hard lockup by the system
watchdog.

In order to avoid that, add a new way of invoking completion events
callbacks. In the interrupt itself, we add the CQs which receive completion
event to a per-EQ list and schedule a tasklet. In the tasklet context
we loop over all the CQs in the list and invoke the user callback.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/cq.c           |  5 +++-
 drivers/net/ethernet/mellanox/mlx4/cq.c   | 50 +++++++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/eq.c   | 16 +++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h | 12 ++++++++
 include/linux/mlx4/device.h               |  5 ++++
 5 files changed, 86 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 1066eec854a9..a3b70f6c4035 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -233,7 +233,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector
 	if (err)
 		goto err_dbmap;
 
-	cq->mcq.comp  = mlx4_ib_cq_comp;
+	if (context)
+		cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp;
+	else
+		cq->mcq.comp = mlx4_ib_cq_comp;
 	cq->mcq.event = mlx4_ib_cq_event;
 
 	if (context)
diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c
index 56022d647837..e71f31387ac6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cq.c
@@ -52,6 +52,51 @@
 #define MLX4_CQ_STATE_ARMED_SOL		( 6 <<  8)
 #define MLX4_EQ_STATE_FIRED		(10 <<  8)
 
+#define TASKLET_MAX_TIME 2
+#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME)
+
+void mlx4_cq_tasklet_cb(unsigned long data)
+{
+	unsigned long flags;
+	unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES;
+	struct mlx4_eq_tasklet *ctx = (struct mlx4_eq_tasklet *)data;
+	struct mlx4_cq *mcq, *temp;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	list_splice_tail_init(&ctx->list, &ctx->process_list);
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	list_for_each_entry_safe(mcq, temp, &ctx->process_list, tasklet_ctx.list) {
+		list_del_init(&mcq->tasklet_ctx.list);
+		mcq->tasklet_ctx.comp(mcq);
+		if (atomic_dec_and_test(&mcq->refcount))
+			complete(&mcq->free);
+		if (time_after(jiffies, end))
+			break;
+	}
+
+	if (!list_empty(&ctx->process_list))
+		tasklet_schedule(&ctx->task);
+}
+
+static void mlx4_add_cq_to_tasklet(struct mlx4_cq *cq)
+{
+	unsigned long flags;
+	struct mlx4_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
+
+	spin_lock_irqsave(&tasklet_ctx->lock, flags);
+	/* When migrating CQs between EQs will be implemented, please note
+	 * that you need to sync this point. It is possible that
+	 * while migrating a CQ, completions on the old EQs could
+	 * still arrive.
+	 */
+	if (list_empty_careful(&cq->tasklet_ctx.list)) {
+		atomic_inc(&cq->refcount);
+		list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list);
+	}
+	spin_unlock_irqrestore(&tasklet_ctx->lock, flags);
+}
+
 void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
 {
 	struct mlx4_cq *cq;
@@ -292,6 +337,11 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
 	cq->uar        = uar;
 	atomic_set(&cq->refcount, 1);
 	init_completion(&cq->free);
+	cq->comp = mlx4_add_cq_to_tasklet;
+	cq->tasklet_ctx.priv =
+		&priv->eq_table.eq[cq->vector].tasklet_ctx;
+	INIT_LIST_HEAD(&cq->tasklet_ctx.list);
+
 
 	cq->irq = priv->eq_table.eq[cq->vector].irq;
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
index d68b264cee4d..3d275fbaf0eb 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -450,7 +450,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_eqe *eqe;
-	int cqn;
+	int cqn = -1;
 	int eqes_found = 0;
 	int set_ci = 0;
 	int port;
@@ -758,6 +758,13 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
 
 	eq_set_ci(eq, 1);
 
+	/* cqn is 24bit wide but is initialized such that its higher bits
+	 * are ones too. Thus, if we got any event, cqn's high bits should be off
+	 * and we need to schedule the tasklet.
+	 */
+	if (!(cqn & ~0xffffff))
+		tasklet_schedule(&eq->tasklet_ctx.task);
+
 	return eqes_found;
 }
 
@@ -971,6 +978,12 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
 
 	eq->cons_index = 0;
 
+	INIT_LIST_HEAD(&eq->tasklet_ctx.list);
+	INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
+	spin_lock_init(&eq->tasklet_ctx.lock);
+	tasklet_init(&eq->tasklet_ctx.task, mlx4_cq_tasklet_cb,
+		     (unsigned long)&eq->tasklet_ctx);
+
 	return err;
 
 err_out_free_mtt:
@@ -1027,6 +1040,7 @@ static void mlx4_free_eq(struct mlx4_dev *dev,
 		}
 	}
 	synchronize_irq(eq->irq);
+	tasklet_disable(&eq->tasklet_ctx.task);
 
 	mlx4_mtt_cleanup(dev, &eq->mtt);
 	for (i = 0; i < npages; ++i)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index f48e7c3eecf8..b67ef488c30c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -43,6 +43,8 @@
 #include <linux/timer.h>
 #include <linux/semaphore.h>
 #include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/driver.h>
@@ -373,6 +375,14 @@ struct mlx4_srq_context {
 	__be64			db_rec_addr;
 };
 
+struct mlx4_eq_tasklet {
+	struct list_head list;
+	struct list_head process_list;
+	struct tasklet_struct task;
+	/* lock on completion tasklet list */
+	spinlock_t lock;
+};
+
 struct mlx4_eq {
 	struct mlx4_dev	       *dev;
 	void __iomem	       *doorbell;
@@ -383,6 +393,7 @@ struct mlx4_eq {
 	int			nent;
 	struct mlx4_buf_list   *page_list;
 	struct mlx4_mtt		mtt;
+	struct mlx4_eq_tasklet	tasklet_ctx;
 };
 
 struct mlx4_slave_eqe {
@@ -1146,6 +1157,7 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev);
 int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
 		  unsigned long timeout);
 
+void mlx4_cq_tasklet_cb(unsigned long data);
 void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
 void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type);
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index cf09e65c2901..3951b5368d7e 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -621,6 +621,11 @@ struct mlx4_cq {
 
 	atomic_t		refcount;
 	struct completion	free;
+	struct {
+		struct list_head list;
+		void (*comp)(struct mlx4_cq *);
+		void		*priv;
+	} tasklet_ctx;
 };
 
 struct mlx4_qp {
-- 
cgit v1.2.3-58-ga151


From ddae0349fdb78bcc5e7219061847012aa1a29069 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.co.il>
Date: Thu, 11 Dec 2014 10:57:54 +0200
Subject: net/mlx4: Change QP allocation scheme

When using BF (Blue-Flame), the QPN overrides the VLAN, CV, and SV fields
in the WQE. Thus, BF may only be used for QPNs with bits 6,7 unset.

The current Ethernet driver code reserves a Tx QP range with 256b alignment.

This is wrong because if there are more than 64 Tx QPs in use,
QPNs >= base + 65 will have bits 6/7 set.

This problem is not specific for the Ethernet driver, any entity that
tries to reserve more than 64 BF-enabled QPs should fail. Also, using
ranges is not necessary here and is wasteful.

The new mechanism introduced here will support reservation for
"Eth QPs eligible for BF" for all drivers: bare-metal, multi-PF, and VFs
(when hypervisors support WC in VMs). The flow we use is:

1. In mlx4_en, allocate Tx QPs one by one instead of a range allocation,
   and request "BF enabled QPs" if BF is supported for the function

2. In the ALLOC_RES FW command, change param1 to:
a. param1[23:0]  - number of QPs
b. param1[31-24] - flags controlling QPs reservation

Bit 31 refers to Eth blueflame supported QPs. Those QPs must have
bits 6 and 7 unset in order to be used in Ethernet.

Bits 24-30 of the flags are currently reserved.

When a function tries to allocate a QP, it states the required attributes
for this QP. Those attributes are considered "best-effort". If an attribute,
such as Ethernet BF enabled QP, is a must-have attribute, the function has
to check that attribute is supported before trying to do the allocation.

In a lower layer of the code, mlx4_qp_reserve_range masks out the bits
which are unsupported. If SRIOV is used, the PF validates those attributes
and masks out unsupported attributes as well. In order to notify VFs which
attributes are supported, the VF uses QUERY_FUNC_CAP command. This command's
mailbox is filled by the PF, which notifies which QP allocation attributes
it supports.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.co.il>
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/main.c                  |  2 +-
 drivers/infiniband/hw/mlx4/qp.c                    | 11 ++++--
 drivers/net/ethernet/mellanox/mlx4/alloc.c         | 43 +++++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     | 10 +----
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |  4 +-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c         | 14 +++++--
 drivers/net/ethernet/mellanox/mlx4/fw.c            | 20 +++++++++-
 drivers/net/ethernet/mellanox/mlx4/fw.h            |  1 +
 drivers/net/ethernet/mellanox/mlx4/main.c          | 11 +++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |  5 ++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h       |  2 +-
 drivers/net/ethernet/mellanox/mlx4/qp.c            | 24 +++++++++---
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |  7 +++-
 include/linux/mlx4/device.h                        | 21 ++++++++++-
 14 files changed, 137 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 0c3375524a64..57ecc5b204f3 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2227,7 +2227,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
 		err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
 					    MLX4_IB_UC_STEER_QPN_ALIGN,
-					    &ibdev->steer_qpn_base);
+					    &ibdev->steer_qpn_base, 0);
 		if (err)
 			goto err_counter;
 
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 9c5150c3cb31..506d1bdad227 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -802,16 +802,19 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			}
 		}
 	} else {
-		/* Raw packet QPNs must be aligned to 8 bits. If not, the WQE
-		 * BlueFlame setup flow wrongly causes VLAN insertion. */
+		/* Raw packet QPNs may not have bits 6,7 set in their qp_num;
+		 * otherwise, the WQE BlueFlame setup flow wrongly causes
+		 * VLAN insertion. */
 		if (init_attr->qp_type == IB_QPT_RAW_PACKET)
-			err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn);
+			err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn,
+						    init_attr->cap.max_send_wr ?
+						    MLX4_RESERVE_ETH_BF_QP : 0);
 		else
 			if (qp->flags & MLX4_IB_QP_NETIF)
 				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
 			else
 				err = mlx4_qp_reserve_range(dev->dev, 1, 1,
-							    &qpn);
+							    &qpn, 0);
 		if (err)
 			goto err_proxy;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
index b0297da50304..91a8acc191bb 100644
--- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -76,22 +76,53 @@ void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr)
 	mlx4_bitmap_free_range(bitmap, obj, 1, use_rr);
 }
 
-u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
+static unsigned long find_aligned_range(unsigned long *bitmap,
+					u32 start, u32 nbits,
+					int len, int align, u32 skip_mask)
+{
+	unsigned long end, i;
+
+again:
+	start = ALIGN(start, align);
+
+	while ((start < nbits) && (test_bit(start, bitmap) ||
+				   (start & skip_mask)))
+		start += align;
+
+	if (start >= nbits)
+		return -1;
+
+	end = start+len;
+	if (end > nbits)
+		return -1;
+
+	for (i = start + 1; i < end; i++) {
+		if (test_bit(i, bitmap) || ((u32)i & skip_mask)) {
+			start = i + 1;
+			goto again;
+		}
+	}
+
+	return start;
+}
+
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+			    int align, u32 skip_mask)
 {
 	u32 obj;
 
-	if (likely(cnt == 1 && align == 1))
+	if (likely(cnt == 1 && align == 1 && !skip_mask))
 		return mlx4_bitmap_alloc(bitmap);
 
 	spin_lock(&bitmap->lock);
 
-	obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max,
-				bitmap->last, cnt, align - 1);
+	obj = find_aligned_range(bitmap->table, bitmap->last,
+				 bitmap->max, cnt, align, skip_mask);
 	if (obj >= bitmap->max) {
 		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
 				& bitmap->mask;
-		obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max,
-						0, cnt, align - 1);
+		obj = find_aligned_range(bitmap->table, 0, bitmap->max,
+					 cnt, align, skip_mask);
 	}
 
 	if (obj < bitmap->max) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index dccf0e1f86be..c67effb05b2f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -595,7 +595,7 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv)
 		return 0;
 	}
 
-	err = mlx4_qp_reserve_range(dev, 1, 1, qpn);
+	err = mlx4_qp_reserve_range(dev, 1, 1, qpn, 0);
 	en_dbg(DRV, priv, "Reserved qp %d\n", *qpn);
 	if (err) {
 		en_err(priv, "Failed to reserve qp for mac registration\n");
@@ -1974,15 +1974,8 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
 {
 	struct mlx4_en_port_profile *prof = priv->prof;
 	int i;
-	int err;
 	int node;
 
-	err = mlx4_qp_reserve_range(priv->mdev->dev, priv->tx_ring_num, 256, &priv->base_tx_qpn);
-	if (err) {
-		en_err(priv, "failed reserving range for TX rings\n");
-		return err;
-	}
-
 	/* Create tx Rings */
 	for (i = 0; i < priv->tx_ring_num; i++) {
 		node = cpu_to_node(i % num_online_cpus());
@@ -1991,7 +1984,6 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
 			goto err;
 
 		if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i],
-					   priv->base_tx_qpn + i,
 					   prof->tx_ring_size, TXBB_SIZE,
 					   node, i))
 			goto err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 69a2e1b88ea8..a850f24fabdf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1131,7 +1131,7 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
 	int err;
 	u32 qpn;
 
-	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn);
+	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, 0);
 	if (err) {
 		en_err(priv, "Failed reserving drop qpn\n");
 		return err;
@@ -1174,7 +1174,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
 	en_dbg(DRV, priv, "Configuring rss steering\n");
 	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
 				    priv->rx_ring_num,
-				    &rss_map->base_qpn);
+				    &rss_map->base_qpn, 0);
 	if (err) {
 		en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index d0cecbdd9ba8..a308d41e4de0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -46,7 +46,7 @@
 #include "mlx4_en.h"
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
-			   struct mlx4_en_tx_ring **pring, int qpn, u32 size,
+			   struct mlx4_en_tx_ring **pring, u32 size,
 			   u16 stride, int node, int queue_index)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
@@ -112,11 +112,17 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 	       ring, ring->buf, ring->size, ring->buf_size,
 	       (unsigned long long) ring->wqres.buf.direct.map);
 
-	ring->qpn = qpn;
+	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
+				    MLX4_RESERVE_ETH_BF_QP);
+	if (err) {
+		en_err(priv, "failed reserving qp for TX ring\n");
+		goto err_map;
+	}
+
 	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
 	if (err) {
 		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
-		goto err_map;
+		goto err_reserve;
 	}
 	ring->qp.event = mlx4_en_sqp_event;
 
@@ -143,6 +149,8 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 	*pring = ring;
 	return 0;
 
+err_reserve:
+	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
 err_map:
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
 err_hwq_res:
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 5089f76f060b..1469b5b5be64 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -266,10 +266,15 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 #define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET		0x64
 #define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET		0x68
 
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET	0x6c
+
 #define QUERY_FUNC_CAP_FMR_FLAG			0x80
 #define QUERY_FUNC_CAP_FLAG_RDMA		0x40
 #define QUERY_FUNC_CAP_FLAG_ETH			0x80
 #define QUERY_FUNC_CAP_FLAG_QUOTAS		0x10
+#define QUERY_FUNC_CAP_FLAG_VALID_MAILBOX	0x04
+
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG	(1UL << 31)
 
 /* when opcode modifier = 1 */
 #define QUERY_FUNC_CAP_PHYS_PORT_OFFSET		0x3
@@ -339,7 +344,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 			mlx4_get_active_ports(dev, slave);
 		/* enable rdma and ethernet interfaces, and new quota locations */
 		field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA |
-			 QUERY_FUNC_CAP_FLAG_QUOTAS);
+			 QUERY_FUNC_CAP_FLAG_QUOTAS | QUERY_FUNC_CAP_FLAG_VALID_MAILBOX);
 		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET);
 
 		field = min(
@@ -401,6 +406,8 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
 
+		size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
 	} else
 		err = -EINVAL;
 
@@ -493,6 +500,17 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 		MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
 		func_cap->reserved_eq = size & 0xFFFFFF;
 
+		func_cap->extra_flags = 0;
+
+		/* Mailbox data from 0x6c and onward should only be treated if
+		 * QUERY_FUNC_CAP_FLAG_VALID_MAILBOX is set in func_cap->flags
+		 */
+		if (func_cap->flags & QUERY_FUNC_CAP_FLAG_VALID_MAILBOX) {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
+			if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG)
+				func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_BF_RES_QP;
+		}
+
 		goto out;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 475215ee370f..0e910a452b02 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -144,6 +144,7 @@ struct mlx4_func_cap {
 	u8	port_flags;
 	u8	flags1;
 	u64	phys_port_id;
+	u32	extra_flags;
 };
 
 struct mlx4_func {
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 3044f9e623cb..6a9a941ddf58 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -466,8 +466,13 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	    mlx4_is_master(dev))
 		dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE;
 
-	if (!mlx4_is_slave(dev))
+	if (!mlx4_is_slave(dev)) {
 		mlx4_enable_cqe_eqe_stride(dev);
+		dev->caps.alloc_res_qp_mask =
+			(dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0);
+	} else {
+		dev->caps.alloc_res_qp_mask = 0;
+	}
 
 	return 0;
 }
@@ -817,6 +822,10 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 
 	slave_adjust_steering_mode(dev, &dev_cap, &hca_param);
 
+	if (func_cap.extra_flags & MLX4_QUERY_FUNC_FLAGS_BF_RES_QP &&
+	    dev->caps.bf_reg_size)
+		dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_ETH_BF_QP;
+
 	return 0;
 
 err_mem:
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index b67ef488c30c..6834da6c35ed 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -884,7 +884,8 @@ extern struct workqueue_struct *mlx4_wq;
 
 u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
 void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr);
-u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+			    int align, u32 skip_mask);
 void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
 			    int use_rr);
 u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap);
@@ -970,7 +971,7 @@ int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
 		     struct mlx4_cmd_mailbox *outbox,
 		     struct mlx4_cmd_info *cmd);
 int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
-			    int *base);
+			    int *base, u8 flags);
 void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
 int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac);
 void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index ac48a8d91501..944a112dff37 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -778,7 +778,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
 
 int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 			   struct mlx4_en_tx_ring **pring,
-			   int qpn, u32 size, u16 stride,
+			   u32 size, u16 stride,
 			   int node, int queue_index);
 void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
 			     struct mlx4_en_tx_ring **pring);
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 2301365c79c7..40e82edac99d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -42,6 +42,10 @@
 #include "mlx4.h"
 #include "icm.h"
 
+/* QP to support BF should have bits 6,7 cleared */
+#define MLX4_BF_QP_SKIP_MASK	0xc0
+#define MLX4_MAX_BF_QP_RANGE	0x40
+
 void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
 {
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
@@ -207,26 +211,36 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 EXPORT_SYMBOL_GPL(mlx4_qp_modify);
 
 int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
-				   int *base)
+			    int *base, u8 flags)
 {
+	int bf_qp = !!(flags & (u8)MLX4_RESERVE_ETH_BF_QP);
+
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_qp_table *qp_table = &priv->qp_table;
 
-	*base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align);
+	if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp)
+		return -ENOMEM;
+
+	*base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align,
+					bf_qp ? MLX4_BF_QP_SKIP_MASK : 0);
 	if (*base == -1)
 		return -ENOMEM;
 
 	return 0;
 }
 
-int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			  int *base, u8 flags)
 {
 	u64 in_param = 0;
 	u64 out_param;
 	int err;
 
+	/* Turn off all unsupported QP allocation flags */
+	flags &= dev->caps.alloc_res_qp_mask;
+
 	if (mlx4_is_mfunc(dev)) {
-		set_param_l(&in_param, cnt);
+		set_param_l(&in_param, (((u32)flags) << 24) | (u32)cnt);
 		set_param_h(&in_param, align);
 		err = mlx4_cmd_imm(dev, in_param, &out_param,
 				   RES_QP, RES_OP_RESERVE,
@@ -238,7 +252,7 @@ int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
 		*base = get_param_l(&out_param);
 		return 0;
 	}
-	return __mlx4_qp_reserve_range(dev, cnt, align, base);
+	return __mlx4_qp_reserve_range(dev, cnt, align, base, flags);
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 16f617b5749e..4efbd1eca611 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -1543,16 +1543,21 @@ static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
 	int align;
 	int base;
 	int qpn;
+	u8 flags;
 
 	switch (op) {
 	case RES_OP_RESERVE:
 		count = get_param_l(&in_param) & 0xffffff;
+		/* Turn off all unsupported QP allocation flags that the
+		 * slave tries to set.
+		 */
+		flags = (get_param_l(&in_param) >> 24) & dev->caps.alloc_res_qp_mask;
 		align = get_param_h(&in_param);
 		err = mlx4_grant_resource(dev, slave, RES_QP, count, 0);
 		if (err)
 			return err;
 
-		err = __mlx4_qp_reserve_range(dev, count, align, &base);
+		err = __mlx4_qp_reserve_range(dev, count, align, &base, flags);
 		if (err) {
 			mlx4_release_resource(dev, slave, RES_QP, count, 0);
 			return err;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 3951b5368d7e..272aa258c036 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -194,6 +194,22 @@ enum {
 	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18
 };
 
+enum {
+	MLX4_QUERY_FUNC_FLAGS_BF_RES_QP		= 1LL << 0
+};
+
+/* bit enums for an 8-bit flags field indicating special use
+ * QPs which require special handling in qp_reserve_range.
+ * Currently, this only includes QPs used by the ETH interface,
+ * where we expect to use blueflame.  These QPs must not have
+ * bits 6 and 7 set in their qp number.
+ *
+ * This enum may use only bits 0..7.
+ */
+enum {
+	MLX4_RESERVE_ETH_BF_QP	= 1 << 7,
+};
+
 enum {
 	MLX4_DEV_CAP_64B_EQE_ENABLED	= 1LL << 0,
 	MLX4_DEV_CAP_64B_CQE_ENABLED	= 1LL << 1,
@@ -501,6 +517,7 @@ struct mlx4_caps {
 	u64			phys_port_id[MLX4_MAX_PORTS + 1];
 	int			tunnel_offload_mode;
 	u8			rx_checksum_flags_port[MLX4_MAX_PORTS + 1];
+	u8			alloc_res_qp_mask;
 };
 
 struct mlx4_buf_list {
@@ -950,8 +967,8 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
 		  unsigned vector, int collapsed, int timestamp_en);
 void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
-
-int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			  int *base, u8 flags);
 void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
 
 int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp,
-- 
cgit v1.2.3-58-ga151


From d57febe1a47801ef8a55dbf10672850523dfaa60 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 11 Dec 2014 10:57:57 +0200
Subject: net/mlx4: Add A0 hybrid steering

A0 hybrid steering is a form of high performance flow steering.
By using this mode, mlx4 cards use a fast limited table based steering,
in order to enable fast steering of unicast packets to a QP.

In order to implement A0 hybrid steering we allocate resources
from different zones:
(1) General range
(2) Special MAC-assigned QPs [RSS, Raw-Ethernet] each has its own region.

When we create a rss QP or a raw ethernet (A0 steerable and BF ready) QP,
we try hard to allocate the QP from range (2). Otherwise, we try hard not
to allocate from this  range. However, when the system is pushed to its
limits and one needs every resource, the allocator uses every region it can.

Meaning, when we run out of raw-eth qps, the allocator allocates from the
general range (and the special-A0 area is no longer active). If we run out
of RSS qps, the mechanism tries to allocate from the raw-eth QP zone. If that
is also exhausted, the allocator will allocate from the general range
(and the A0 region is no longer active).

Note that if a raw-eth qp is allocated from the general range, it attempts
to allocate the range such that bits 6 and 7 (blueflame bits) in the
QP number are not set.

When the feature is used in SRIOV, the VF has to notify the PF what
kind of QP attributes it needs. In order to do that, along with the
"Eth QP blueflame" bit, we reserve a new "A0 steerable QP". According
to the combination of these bits, the PF tries to allocate a suitable QP.

In order to maintain backward compatibility (with older PFs), the PF
notifies which QP attributes it supports via QUERY_FUNC_CAP command.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/qp.c                |   6 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |   2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     |   3 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c        |   6 +-
 drivers/net/ethernet/mellanox/mlx4/main.c      |   8 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |  13 +-
 drivers/net/ethernet/mellanox/mlx4/qp.c        | 277 +++++++++++++++++++++++--
 include/linux/mlx4/device.h                    |  10 +-
 8 files changed, 300 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 506d1bdad227..cf000b7ad64f 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -807,8 +807,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 		 * VLAN insertion. */
 		if (init_attr->qp_type == IB_QPT_RAW_PACKET)
 			err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn,
-						    init_attr->cap.max_send_wr ?
-						    MLX4_RESERVE_ETH_BF_QP : 0);
+						    (init_attr->cap.max_send_wr ?
+						     MLX4_RESERVE_ETH_BF_QP : 0) |
+						    (init_attr->cap.max_recv_wr ?
+						     MLX4_RESERVE_A0_QP : 0));
 		else
 			if (qp->flags & MLX4_IB_QP_NETIF)
 				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index c67effb05b2f..568e1f41fdd4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -595,7 +595,7 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv)
 		return 0;
 	}
 
-	err = mlx4_qp_reserve_range(dev, 1, 1, qpn, 0);
+	err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP);
 	en_dbg(DRV, priv, "Reserved qp %d\n", *qpn);
 	if (err) {
 		en_err(priv, "Failed to reserve qp for mac registration\n");
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index a850f24fabdf..a0474eb94aa3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1131,7 +1131,8 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
 	int err;
 	u32 qpn;
 
-	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, 0);
+	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn,
+				    MLX4_RESERVE_A0_QP);
 	if (err) {
 		en_err(priv, "Failed reserving drop qpn\n");
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 1469b5b5be64..622bffaa9d78 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -275,6 +275,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 #define QUERY_FUNC_CAP_FLAG_VALID_MAILBOX	0x04
 
 #define QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG	(1UL << 31)
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG	(1UL << 30)
 
 /* when opcode modifier = 1 */
 #define QUERY_FUNC_CAP_PHYS_PORT_OFFSET		0x3
@@ -406,7 +407,8 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
 
-		size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG;
+		size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG |
+			QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG;
 		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
 	} else
 		err = -EINVAL;
@@ -509,6 +511,8 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 			MLX4_GET(size, outbox, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
 			if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG)
 				func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_BF_RES_QP;
+			if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG)
+				func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_A0_RES_QP;
 		}
 
 		goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 6a9a941ddf58..3bfe90b95f96 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -436,6 +436,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		(1 << dev->caps.log_num_vlans) *
 		dev->caps.num_ports;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] =
+		MLX4_A0_STEERING_TABLE_SIZE;
 
 	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
@@ -469,7 +471,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	if (!mlx4_is_slave(dev)) {
 		mlx4_enable_cqe_eqe_stride(dev);
 		dev->caps.alloc_res_qp_mask =
-			(dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0);
+			(dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0) |
+			MLX4_RESERVE_A0_QP;
 	} else {
 		dev->caps.alloc_res_qp_mask = 0;
 	}
@@ -826,6 +829,9 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 	    dev->caps.bf_reg_size)
 		dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_ETH_BF_QP;
 
+	if (func_cap.extra_flags & MLX4_QUERY_FUNC_FLAGS_A0_RES_QP)
+		dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_A0_QP;
+
 	return 0;
 
 err_mem:
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index bc1505efa436..cebd1180702b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -682,8 +682,19 @@ struct mlx4_srq_table {
 	struct mlx4_icm_table	cmpt_table;
 };
 
+enum mlx4_qp_table_zones {
+	MLX4_QP_TABLE_ZONE_GENERAL,
+	MLX4_QP_TABLE_ZONE_RSS,
+	MLX4_QP_TABLE_ZONE_RAW_ETH,
+	MLX4_QP_TABLE_ZONE_NUM
+};
+
+#define MLX4_A0_STEERING_TABLE_SIZE    256
+
 struct mlx4_qp_table {
-	struct mlx4_bitmap	bitmap;
+	struct mlx4_bitmap	*bitmap_gen;
+	struct mlx4_zone_allocator *zones;
+	u32			zones_uids[MLX4_QP_TABLE_ZONE_NUM];
 	u32			rdmarc_base;
 	int			rdmarc_shift;
 	spinlock_t		lock;
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 8720428c9807..d8d040c366f4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -213,6 +213,7 @@ EXPORT_SYMBOL_GPL(mlx4_qp_modify);
 int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
 			    int *base, u8 flags)
 {
+	u32 uid;
 	int bf_qp = !!(flags & (u8)MLX4_RESERVE_ETH_BF_QP);
 
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -221,8 +222,16 @@ int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
 	if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp)
 		return -ENOMEM;
 
-	*base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align,
-					bf_qp ? MLX4_BF_QP_SKIP_MASK : 0);
+	uid = MLX4_QP_TABLE_ZONE_GENERAL;
+	if (flags & (u8)MLX4_RESERVE_A0_QP) {
+		if (bf_qp)
+			uid = MLX4_QP_TABLE_ZONE_RAW_ETH;
+		else
+			uid = MLX4_QP_TABLE_ZONE_RSS;
+	}
+
+	*base = mlx4_zone_alloc_entries(qp_table->zones, uid, cnt, align,
+					bf_qp ? MLX4_BF_QP_SKIP_MASK : 0, NULL);
 	if (*base == -1)
 		return -ENOMEM;
 
@@ -263,7 +272,7 @@ void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
 
 	if (mlx4_is_qp_reserved(dev, (u32) base_qpn))
 		return;
-	mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, MLX4_USE_RR);
+	mlx4_zone_free_entries_unique(qp_table->zones, base_qpn, cnt);
 }
 
 void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
@@ -473,6 +482,227 @@ static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn)
 			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
 }
 
+#define MLX4_QP_TABLE_RSS_ETH_PRIORITY 2
+#define MLX4_QP_TABLE_RAW_ETH_PRIORITY 1
+#define MLX4_QP_TABLE_RAW_ETH_SIZE     256
+
+static int mlx4_create_zones(struct mlx4_dev *dev,
+			     u32 reserved_bottom_general,
+			     u32 reserved_top_general,
+			     u32 reserved_bottom_rss,
+			     u32 start_offset_rss,
+			     u32 max_table_offset)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	struct mlx4_bitmap (*bitmap)[MLX4_QP_TABLE_ZONE_NUM] = NULL;
+	int bitmap_initialized = 0;
+	u32 last_offset;
+	int k;
+	int err;
+
+	qp_table->zones = mlx4_zone_allocator_create(MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP);
+
+	if (NULL == qp_table->zones)
+		return -ENOMEM;
+
+	bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
+
+	if (NULL == bitmap) {
+		err = -ENOMEM;
+		goto free_zone;
+	}
+
+	err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_GENERAL, dev->caps.num_qps,
+			       (1 << 23) - 1, reserved_bottom_general,
+			       reserved_top_general);
+
+	if (err)
+		goto free_bitmap;
+
+	++bitmap_initialized;
+
+	err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_GENERAL,
+				MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO |
+				MLX4_ZONE_USE_RR, 0,
+				0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_GENERAL);
+
+	if (err)
+		goto free_bitmap;
+
+	err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_RSS,
+			       reserved_bottom_rss,
+			       reserved_bottom_rss - 1,
+			       dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+			       reserved_bottom_rss - start_offset_rss);
+
+	if (err)
+		goto free_bitmap;
+
+	++bitmap_initialized;
+
+	err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_RSS,
+				MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO |
+				MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO |
+				MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RSS_ETH_PRIORITY,
+				0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_RSS);
+
+	if (err)
+		goto free_bitmap;
+
+	last_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+	/*  We have a single zone for the A0 steering QPs area of the FW. This area
+	 *  needs to be split into subareas. One set of subareas is for RSS QPs
+	 *  (in which qp number bits 6 and/or 7 are set); the other set of subareas
+	 *  is for RAW_ETH QPs, which require that both bits 6 and 7 are zero.
+	 *  Currently, the values returned by the FW (A0 steering area starting qp number
+	 *  and A0 steering area size) are such that there are only two subareas -- one
+	 *  for RSS and one for RAW_ETH.
+	 */
+	for (k = MLX4_QP_TABLE_ZONE_RSS + 1; k < sizeof(*bitmap)/sizeof((*bitmap)[0]);
+	     k++) {
+		int size;
+		u32 offset = start_offset_rss;
+		u32 bf_mask;
+		u32 requested_size;
+
+		/* Assuming MLX4_BF_QP_SKIP_MASK is consecutive ones, this calculates
+		 * a mask of all LSB bits set until (and not including) the first
+		 * set bit of  MLX4_BF_QP_SKIP_MASK. For example, if MLX4_BF_QP_SKIP_MASK
+		 * is 0xc0, bf_mask will be 0x3f.
+		 */
+		bf_mask = (MLX4_BF_QP_SKIP_MASK & ~(MLX4_BF_QP_SKIP_MASK - 1)) - 1;
+		requested_size = min((u32)MLX4_QP_TABLE_RAW_ETH_SIZE, bf_mask + 1);
+
+		if (((last_offset & MLX4_BF_QP_SKIP_MASK) &&
+		     ((int)(max_table_offset - last_offset)) >=
+		     roundup_pow_of_two(MLX4_BF_QP_SKIP_MASK)) ||
+		    (!(last_offset & MLX4_BF_QP_SKIP_MASK) &&
+		     !((last_offset + requested_size - 1) &
+		       MLX4_BF_QP_SKIP_MASK)))
+			size = requested_size;
+		else {
+			u32 candidate_offset =
+				(last_offset | MLX4_BF_QP_SKIP_MASK | bf_mask) + 1;
+
+			if (last_offset & MLX4_BF_QP_SKIP_MASK)
+				last_offset = candidate_offset;
+
+			/* From this point, the BF bits are 0 */
+
+			if (last_offset > max_table_offset) {
+				/* need to skip */
+				size = -1;
+			} else {
+				size = min3(max_table_offset - last_offset,
+					    bf_mask - (last_offset & bf_mask),
+					    requested_size);
+				if (size < requested_size) {
+					int candidate_size;
+
+					candidate_size = min3(
+						max_table_offset - candidate_offset,
+						bf_mask - (last_offset & bf_mask),
+						requested_size);
+
+					/*  We will not take this path if last_offset was
+					 *  already set above to candidate_offset
+					 */
+					if (candidate_size > size) {
+						last_offset = candidate_offset;
+						size = candidate_size;
+					}
+				}
+			}
+		}
+
+		if (size > 0) {
+			/* mlx4_bitmap_alloc_range will find a contiguous range of "size"
+			 * QPs in which both bits 6 and 7 are zero, because we pass it the
+			 * MLX4_BF_SKIP_MASK).
+			 */
+			offset = mlx4_bitmap_alloc_range(
+					*bitmap + MLX4_QP_TABLE_ZONE_RSS,
+					size, 1,
+					MLX4_BF_QP_SKIP_MASK);
+
+			if (offset == (u32)-1) {
+				err = -ENOMEM;
+				break;
+			}
+
+			last_offset = offset + size;
+
+			err = mlx4_bitmap_init(*bitmap + k, roundup_pow_of_two(size),
+					       roundup_pow_of_two(size) - 1, 0,
+					       roundup_pow_of_two(size) - size);
+		} else {
+			/* Add an empty bitmap, we'll allocate from different zones (since
+			 * at least one is reserved)
+			 */
+			err = mlx4_bitmap_init(*bitmap + k, 1,
+					       MLX4_QP_TABLE_RAW_ETH_SIZE - 1, 0,
+					       0);
+			mlx4_bitmap_alloc_range(*bitmap + k, 1, 1, 0);
+		}
+
+		if (err)
+			break;
+
+		++bitmap_initialized;
+
+		err = mlx4_zone_add_one(qp_table->zones, *bitmap + k,
+					MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO |
+					MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO |
+					MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RAW_ETH_PRIORITY,
+					offset, qp_table->zones_uids + k);
+
+		if (err)
+			break;
+	}
+
+	if (err)
+		goto free_bitmap;
+
+	qp_table->bitmap_gen = *bitmap;
+
+	return err;
+
+free_bitmap:
+	for (k = 0; k < bitmap_initialized; k++)
+		mlx4_bitmap_cleanup(*bitmap + k);
+	kfree(bitmap);
+free_zone:
+	mlx4_zone_allocator_destroy(qp_table->zones);
+	return err;
+}
+
+static void mlx4_cleanup_qp_zones(struct mlx4_dev *dev)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+
+	if (qp_table->zones) {
+		int i;
+
+		for (i = 0;
+		     i < sizeof(qp_table->zones_uids)/sizeof(qp_table->zones_uids[0]);
+		     i++) {
+			struct mlx4_bitmap *bitmap =
+				mlx4_zone_get_bitmap(qp_table->zones,
+						     qp_table->zones_uids[i]);
+
+			mlx4_zone_remove_one(qp_table->zones, qp_table->zones_uids[i]);
+			if (NULL == bitmap)
+				continue;
+
+			mlx4_bitmap_cleanup(bitmap);
+		}
+		mlx4_zone_allocator_destroy(qp_table->zones);
+		kfree(qp_table->bitmap_gen);
+		qp_table->bitmap_gen = NULL;
+		qp_table->zones = NULL;
+	}
+}
+
 int mlx4_init_qp_table(struct mlx4_dev *dev)
 {
 	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
@@ -480,22 +710,33 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	int reserved_from_top = 0;
 	int reserved_from_bot;
 	int k;
+	int fixed_reserved_from_bot_rv = 0;
+	int bottom_reserved_for_rss_bitmap;
+	u32 max_table_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
+		MLX4_A0_STEERING_TABLE_SIZE;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
 	if (mlx4_is_slave(dev))
 		return 0;
 
-	/*
-	 * We reserve 2 extra QPs per port for the special QPs.  The
+	/* We reserve 2 extra QPs per port for the special QPs.  The
 	 * block of special QPs must be aligned to a multiple of 8, so
 	 * round up.
 	 *
 	 * We also reserve the MSB of the 24-bit QP number to indicate
 	 * that a QP is an XRC QP.
 	 */
-	dev->phys_caps.base_sqpn =
-		ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
+	for (k = 0; k <= MLX4_QP_REGION_BOTTOM; k++)
+		fixed_reserved_from_bot_rv += dev->caps.reserved_qps_cnt[k];
+
+	if (fixed_reserved_from_bot_rv < max_table_offset)
+		fixed_reserved_from_bot_rv = max_table_offset;
+
+	/* We reserve at least 1 extra for bitmaps that we don't have enough space for*/
+	bottom_reserved_for_rss_bitmap =
+		roundup_pow_of_two(fixed_reserved_from_bot_rv + 1);
+	dev->phys_caps.base_sqpn = ALIGN(bottom_reserved_for_rss_bitmap, 8);
 
 	{
 		int sort[MLX4_NUM_QP_REGION];
@@ -505,8 +746,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 		for (i = 1; i < MLX4_NUM_QP_REGION; ++i)
 			sort[i] = i;
 
-		for (i = MLX4_NUM_QP_REGION; i > 0; --i) {
-			for (j = 2; j < i; ++j) {
+		for (i = MLX4_NUM_QP_REGION; i > MLX4_QP_REGION_BOTTOM; --i) {
+			for (j = MLX4_QP_REGION_BOTTOM + 2; j < i; ++j) {
 				if (dev->caps.reserved_qps_cnt[sort[j]] >
 				    dev->caps.reserved_qps_cnt[sort[j - 1]]) {
 					tmp             = sort[j];
@@ -516,13 +757,12 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 			}
 		}
 
-		for (i = 1; i < MLX4_NUM_QP_REGION; ++i) {
+		for (i = MLX4_QP_REGION_BOTTOM + 1; i < MLX4_NUM_QP_REGION; ++i) {
 			last_base -= dev->caps.reserved_qps_cnt[sort[i]];
 			dev->caps.reserved_qps_base[sort[i]] = last_base;
 			reserved_from_top +=
 				dev->caps.reserved_qps_cnt[sort[i]];
 		}
-
 	}
 
        /* Reserve 8 real SQPs in both native and SRIOV modes.
@@ -541,9 +781,11 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 		return -EINVAL;
 	}
 
-	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
-			       (1 << 23) - 1, reserved_from_bot,
-			       reserved_from_top);
+	err = mlx4_create_zones(dev, reserved_from_bot, reserved_from_bot,
+				bottom_reserved_for_rss_bitmap,
+				fixed_reserved_from_bot_rv,
+				max_table_offset);
+
 	if (err)
 		return err;
 
@@ -579,7 +821,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn);
 	if (err)
 		goto err_mem;
-	return 0;
+
+	return err;
 
 err_mem:
 	kfree(dev->caps.qp0_tunnel);
@@ -588,6 +831,7 @@ err_mem:
 	kfree(dev->caps.qp1_proxy);
 	dev->caps.qp0_tunnel = dev->caps.qp0_proxy =
 		dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL;
+	mlx4_cleanup_qp_zones(dev);
 	return err;
 }
 
@@ -597,7 +841,8 @@ void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
 		return;
 
 	mlx4_CONF_SPECIAL_QP(dev, 0);
-	mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap);
+
+	mlx4_cleanup_qp_zones(dev);
 }
 
 int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 272aa258c036..39890cddc5fa 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -195,7 +195,8 @@ enum {
 };
 
 enum {
-	MLX4_QUERY_FUNC_FLAGS_BF_RES_QP		= 1LL << 0
+	MLX4_QUERY_FUNC_FLAGS_BF_RES_QP		= 1LL << 0,
+	MLX4_QUERY_FUNC_FLAGS_A0_RES_QP		= 1LL << 1
 };
 
 /* bit enums for an 8-bit flags field indicating special use
@@ -207,6 +208,7 @@ enum {
  * This enum may use only bits 0..7.
  */
 enum {
+	MLX4_RESERVE_A0_QP	= 1 << 6,
 	MLX4_RESERVE_ETH_BF_QP	= 1 << 7,
 };
 
@@ -349,6 +351,8 @@ enum {
 
 enum mlx4_qp_region {
 	MLX4_QP_REGION_FW = 0,
+	MLX4_QP_REGION_RSS_RAW_ETH,
+	MLX4_QP_REGION_BOTTOM = MLX4_QP_REGION_RSS_RAW_ETH,
 	MLX4_QP_REGION_ETH_ADDR,
 	MLX4_QP_REGION_FC_ADDR,
 	MLX4_QP_REGION_FC_EXCH,
@@ -891,7 +895,9 @@ static inline int mlx4_num_reserved_sqps(struct mlx4_dev *dev)
 static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn)
 {
 	return (qpn < dev->phys_caps.base_sqpn + 8 +
-		16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev));
+		16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev) &&
+		qpn >= dev->phys_caps.base_sqpn) ||
+	       (qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]);
 }
 
 static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn)
-- 
cgit v1.2.3-58-ga151


From 7d077cd34eabb2ffd05abe0f2cad01da1ef11712 Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 11 Dec 2014 10:58:00 +0200
Subject: net/mlx4: Add support for A0 steering

Add the required firmware commands for A0 steering and a way to enable
that. The firmware support focuses on INIT_HCA, QUERY_HCA, QUERY_PORT,
QUERY_DEV_CAP and QUERY_FUNC_CAP commands. Those commands are used
to configure and query the device.

The different A0 DMFS (steering) modes are:

Static - optimized performance, but flow steering rules are
limited. This mode should be choosed explicitly by the user
in order to be used.

Dynamic - this mode should be explicitly choosed by the user.
In this mode, the FW works in optimized steering mode as long as
it can and afterwards automatically drops to classic (full) DMFS.

Disable - this mode should be explicitly choosed by the user.
The user instructs the system not to use optimized steering, even if
the FW supports Dynamic A0 DMFS (and thus will be able to use optimized
steering in Default A0 DMFS mode).

Default - this mode is implicitly choosed. In this mode, if the FW
supports Dynamic A0 DMFS, it'll work in this mode. Otherwise, it'll
work at Disable A0 DMFS mode.

Under SRIOV configuration, when the A0 steering mode is enabled,
older guest VF drivers who aren't using the RX QP allocation flag
(MLX4_RESERVE_A0_QP) will get a QP from the general range and
fail when attempting to register a steering rule. To avoid that,
the PF context behaviour is changed once on A0 static mode, to
require support for the allocation flag in VF drivers too.

In order to enable A0 steering, we use log_num_mgm_entry_size param.
If the value of the parameter is not positive, we treat the absolute
value of log_num_mgm_entry_size as a bit field. Setting bit 2 of this
bit field enables static A0 steering.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |   3 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c        |  48 ++++++++-
 drivers/net/ethernet/mellanox/mlx4/fw.h        |   4 +
 drivers/net/ethernet/mellanox/mlx4/main.c      | 132 +++++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |   2 -
 drivers/net/ethernet/mellanox/mlx4/qp.c        |   4 +-
 include/linux/mlx4/device.h                    |  17 +++-
 7 files changed, 191 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 568e1f41fdd4..6ff214de1111 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2594,7 +2594,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
 
 	if (mdev->dev->caps.steering_mode ==
-	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+	    MLX4_STEERING_MODE_DEVICE_MANAGED &&
+	    mdev->dev->caps.dmfs_high_steer_mode != MLX4_STEERING_DMFS_A0_STATIC)
 		dev->hw_features |= NETIF_F_NTUPLE;
 
 	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 073b3d1c8b91..ef3b95bac2ad 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -144,7 +144,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[15] = "Ethernet Backplane autoneg support",
 		[16] = "CONFIG DEV support",
 		[17] = "Asymmetric EQs support",
-		[18] = "More than 80 VFs support"
+		[18] = "More than 80 VFs support",
+		[19] = "Performance optimized for limited rule configuration flow steering support"
 	};
 	int i;
 
@@ -680,6 +681,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_FW_REASSIGN_MAC		0x9d
 #define QUERY_DEV_CAP_VXLAN			0x9e
 #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET		0xb0
+#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET	0xa8
+#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET	0xac
 
 	dev_cap->flags2 = 0;
 	mailbox = mlx4_alloc_cmd_mailbox(dev);
@@ -876,6 +879,13 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	if (field32 & (1 << 0))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX;
 
+	MLX4_GET(dev_cap->dmfs_high_rate_qpn_base, outbox,
+		 QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET);
+	dev_cap->dmfs_high_rate_qpn_base &= MGM_QPN_MASK;
+	MLX4_GET(dev_cap->dmfs_high_rate_qpn_range, outbox,
+		 QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET);
+	dev_cap->dmfs_high_rate_qpn_range &= MGM_QPN_MASK;
+
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
 	if (field32 & (1 << 16))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP;
@@ -935,6 +945,10 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
 	mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters);
 	mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz);
+	mlx4_dbg(dev, "DMFS high rate steer QPn base: %d\n",
+		 dev_cap->dmfs_high_rate_qpn_base);
+	mlx4_dbg(dev, "DMFS high rate steer QPn range: %d\n",
+		 dev_cap->dmfs_high_rate_qpn_range);
 
 	dump_dev_cap_flags(dev, dev_cap->flags);
 	dump_dev_cap_flags2(dev, dev_cap->flags2);
@@ -996,6 +1010,7 @@ int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_c
 		port_cap->supported_port_types = field & 3;
 		port_cap->suggested_type = (field >> 3) & 1;
 		port_cap->default_sense = (field >> 4) & 1;
+		port_cap->dmfs_optimized_state = (field >> 5) & 1;
 		MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
 		port_cap->ib_mtu	   = field & 0xf;
 		MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
@@ -1530,6 +1545,12 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	struct mlx4_cmd_mailbox *mailbox;
 	__be32 *inbox;
 	int err;
+	static const u8 a0_dmfs_hw_steering[] =  {
+		[MLX4_STEERING_DMFS_A0_DEFAULT]		= 0,
+		[MLX4_STEERING_DMFS_A0_DYNAMIC]		= 1,
+		[MLX4_STEERING_DMFS_A0_STATIC]		= 2,
+		[MLX4_STEERING_DMFS_A0_DISABLE]		= 3
+	};
 
 #define INIT_HCA_IN_SIZE		 0x200
 #define INIT_HCA_VERSION_OFFSET		 0x000
@@ -1563,6 +1584,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 #define  INIT_HCA_FS_PARAM_OFFSET         0x1d0
 #define  INIT_HCA_FS_BASE_OFFSET          (INIT_HCA_FS_PARAM_OFFSET + 0x00)
 #define  INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x12)
+#define  INIT_HCA_FS_A0_OFFSET		  (INIT_HCA_FS_PARAM_OFFSET + 0x18)
 #define  INIT_HCA_FS_LOG_TABLE_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x1b)
 #define  INIT_HCA_FS_ETH_BITS_OFFSET      (INIT_HCA_FS_PARAM_OFFSET + 0x21)
 #define  INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22)
@@ -1673,8 +1695,11 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 		/* Enable Ethernet flow steering
 		 * with udp unicast and tcp unicast
 		 */
-		MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
-			 INIT_HCA_FS_ETH_BITS_OFFSET);
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_STATIC)
+			MLX4_PUT(inbox,
+				 (u8)(MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+				 INIT_HCA_FS_ETH_BITS_OFFSET);
 		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
 			 INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET);
 		/* Enable IPoIB flow steering
@@ -1684,6 +1709,13 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 			 INIT_HCA_FS_IB_BITS_OFFSET);
 		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
 			 INIT_HCA_FS_IB_NUM_ADDRS_OFFSET);
+
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+			MLX4_PUT(inbox,
+				 ((u8)(a0_dmfs_hw_steering[dev->caps.dmfs_high_steer_mode]
+				       << 6)),
+				 INIT_HCA_FS_A0_OFFSET);
 	} else {
 		MLX4_PUT(inbox, param->mc_base,	INIT_HCA_MC_BASE_OFFSET);
 		MLX4_PUT(inbox, param->log_mc_entry_sz,
@@ -1734,6 +1766,12 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 	u32 dword_field;
 	int err;
 	u8 byte_field;
+	static const u8 a0_dmfs_query_hw_steering[] =  {
+		[0] = MLX4_STEERING_DMFS_A0_DEFAULT,
+		[1] = MLX4_STEERING_DMFS_A0_DYNAMIC,
+		[2] = MLX4_STEERING_DMFS_A0_STATIC,
+		[3] = MLX4_STEERING_DMFS_A0_DISABLE
+	};
 
 #define QUERY_HCA_GLOBAL_CAPS_OFFSET	0x04
 #define QUERY_HCA_CORE_CLOCK_OFFSET	0x0c
@@ -1786,6 +1824,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
 		MLX4_GET(param->log_mc_table_sz, outbox,
 			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+		MLX4_GET(byte_field, outbox,
+			 INIT_HCA_FS_A0_OFFSET);
+		param->dmfs_high_steer_mode =
+			a0_dmfs_query_hw_steering[(byte_field >> 6) & 3];
 	} else {
 		MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET);
 		MLX4_GET(param->log_mc_entry_sz, outbox,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 744398b7ab5e..794e2826609a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -60,6 +60,7 @@ struct mlx4_port_cap {
 	int vendor_oui;
 	u16 wavelength;
 	u64 trans_code;
+	u8 dmfs_optimized_state;
 };
 
 struct mlx4_dev_cap {
@@ -124,6 +125,8 @@ struct mlx4_dev_cap {
 	int max_gso_sz;
 	int max_rss_tbl_sz;
 	u32 max_counters;
+	u32 dmfs_high_rate_qpn_base;
+	u32 dmfs_high_rate_qpn_range;
 	struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1];
 };
 
@@ -194,6 +197,7 @@ struct mlx4_init_hca_param {
 	u8  mw_enabled;  /* Enable memory windows */
 	u8  uar_page_sz; /* log pg sz in 4k chunks */
 	u8  steering_mode; /* for QUERY_HCA */
+	u8  dmfs_high_steer_mode; /* for QUERY_HCA */
 	u64 dev_cap_enabled;
 	u16 cqe_size; /* For use only when CQE stride feature enabled */
 	u16 eqe_size; /* For use only when EQE stride feature enabled */
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 6173b8072988..e25436b24ce7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -105,7 +105,8 @@ MODULE_PARM_DESC(enable_64b_cqe_eqe,
 		 "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)");
 
 #define PF_CONTEXT_BEHAVIOUR_MASK	(MLX4_FUNC_CAP_64B_EQE_CQE | \
-					 MLX4_FUNC_CAP_EQE_CQE_STRIDE)
+					 MLX4_FUNC_CAP_EQE_CQE_STRIDE | \
+					 MLX4_FUNC_CAP_DMFS_A0_STATIC)
 
 static char mlx4_version[] =
 	DRV_NAME ": Mellanox ConnectX core driver v"
@@ -463,8 +464,28 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		(1 << dev->caps.log_num_vlans) *
 		dev->caps.num_ports;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+
+	if (dev_cap->dmfs_high_rate_qpn_base > 0 &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)
+		dev->caps.dmfs_high_rate_qpn_base = dev_cap->dmfs_high_rate_qpn_base;
+	else
+		dev->caps.dmfs_high_rate_qpn_base =
+			dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+
+	if (dev_cap->dmfs_high_rate_qpn_range > 0 &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) {
+		dev->caps.dmfs_high_rate_qpn_range = dev_cap->dmfs_high_rate_qpn_range;
+		dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DEFAULT;
+		dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_FS_A0;
+	} else {
+		dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_NOT_SUPPORTED;
+		dev->caps.dmfs_high_rate_qpn_base =
+			dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+		dev->caps.dmfs_high_rate_qpn_range = MLX4_A0_STEERING_TABLE_SIZE;
+	}
+
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] =
-		MLX4_A0_STEERING_TABLE_SIZE;
+		dev->caps.dmfs_high_rate_qpn_range;
 
 	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
 		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
@@ -753,7 +774,8 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 
 	if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) !=
 	    PF_CONTEXT_BEHAVIOUR_MASK) {
-		mlx4_err(dev, "Unknown pf context behaviour\n");
+		mlx4_err(dev, "Unknown pf context behaviour %x known flags %x\n",
+			 func_cap.pf_context_behaviour, PF_CONTEXT_BEHAVIOUR_MASK);
 		return -ENOSYS;
 	}
 
@@ -1640,10 +1662,46 @@ static int choose_log_fs_mgm_entry_size(int qp_per_entry)
 	return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1;
 }
 
+static const char *dmfs_high_rate_steering_mode_str(int dmfs_high_steer_mode)
+{
+	switch (dmfs_high_steer_mode) {
+	case MLX4_STEERING_DMFS_A0_DEFAULT:
+		return "default performance";
+
+	case MLX4_STEERING_DMFS_A0_DYNAMIC:
+		return "dynamic hybrid mode";
+
+	case MLX4_STEERING_DMFS_A0_STATIC:
+		return "performance optimized for limited rule configuration (static)";
+
+	case MLX4_STEERING_DMFS_A0_DISABLE:
+		return "disabled performance optimized steering";
+
+	case MLX4_STEERING_DMFS_A0_NOT_SUPPORTED:
+		return "performance optimized steering not supported";
+
+	default:
+		return "Unrecognized mode";
+	}
+}
+
+#define MLX4_DMFS_A0_STEERING			(1UL << 2)
+
 static void choose_steering_mode(struct mlx4_dev *dev,
 				 struct mlx4_dev_cap *dev_cap)
 {
-	if (mlx4_log_num_mgm_entry_size == -1 &&
+	if (mlx4_log_num_mgm_entry_size <= 0) {
+		if ((-mlx4_log_num_mgm_entry_size) & MLX4_DMFS_A0_STEERING) {
+			if (dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+				mlx4_err(dev, "DMFS high rate mode not supported\n");
+			else
+				dev->caps.dmfs_high_steer_mode =
+					MLX4_STEERING_DMFS_A0_STATIC;
+		}
+	}
+
+	if (mlx4_log_num_mgm_entry_size <= 0 &&
 	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN &&
 	    (!mlx4_is_mfunc(dev) ||
 	     (dev_cap->fs_max_num_qp_per_entry >= (dev->num_vfs + 1))) &&
@@ -1656,6 +1714,9 @@ static void choose_steering_mode(struct mlx4_dev *dev,
 		dev->caps.fs_log_max_ucast_qp_range_size =
 			dev_cap->fs_log_max_ucast_qp_range_size;
 	} else {
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+			dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DISABLE;
 		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
 		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
 			dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
@@ -1682,7 +1743,8 @@ static void choose_tunnel_offload_mode(struct mlx4_dev *dev,
 				       struct mlx4_dev_cap *dev_cap)
 {
 	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED &&
-	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS)
+	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS &&
+	    dev->caps.dmfs_high_steer_mode != MLX4_STEERING_DMFS_A0_STATIC)
 		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_VXLAN;
 	else
 		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_NONE;
@@ -1691,6 +1753,35 @@ static void choose_tunnel_offload_mode(struct mlx4_dev *dev,
 		 == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) ? "vxlan" : "none");
 }
 
+static int mlx4_validate_optimized_steering(struct mlx4_dev *dev)
+{
+	int i;
+	struct mlx4_port_cap port_cap;
+
+	if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+		return -EINVAL;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (mlx4_dev_port(dev, i, &port_cap)) {
+			mlx4_err(dev,
+				 "QUERY_DEV_CAP command failed, can't veify DMFS high rate steering.\n");
+		} else if ((dev->caps.dmfs_high_steer_mode !=
+			    MLX4_STEERING_DMFS_A0_DEFAULT) &&
+			   (port_cap.dmfs_optimized_state ==
+			    !!(dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_DISABLE))) {
+			mlx4_err(dev,
+				 "DMFS high rate steer mode differ, driver requested %s but %s in FW.\n",
+				 dmfs_high_rate_steering_mode_str(
+					dev->caps.dmfs_high_steer_mode),
+				 (port_cap.dmfs_optimized_state ?
+					"enabled" : "disabled"));
+		}
+	}
+
+	return 0;
+}
+
 static int mlx4_init_fw(struct mlx4_dev *dev)
 {
 	struct mlx4_mod_stat_cfg   mlx4_cfg;
@@ -1743,6 +1834,10 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 		choose_steering_mode(dev, &dev_cap);
 		choose_tunnel_offload_mode(dev, &dev_cap);
 
+		if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC &&
+		    mlx4_is_master(dev))
+			dev->caps.function_caps |= MLX4_FUNC_CAP_DMFS_A0_STATIC;
+
 		err = mlx4_get_phys_port_id(dev);
 		if (err)
 			mlx4_err(dev, "Fail to get physical port id\n");
@@ -1829,6 +1924,24 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 				mlx4_err(dev, "Failed to map internal clock. Timestamping is not supported\n");
 			}
 		}
+
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) {
+			if (mlx4_validate_optimized_steering(dev))
+				mlx4_warn(dev, "Optimized steering validation failed\n");
+
+			if (dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_DISABLE) {
+				dev->caps.dmfs_high_rate_qpn_base =
+					dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+				dev->caps.dmfs_high_rate_qpn_range =
+					MLX4_A0_STEERING_TABLE_SIZE;
+			}
+
+			mlx4_dbg(dev, "DMFS high rate steer mode is: %s\n",
+				 dmfs_high_rate_steering_mode_str(
+					dev->caps.dmfs_high_steer_mode));
+		}
 	} else {
 		err = mlx4_init_slave(dev);
 		if (err) {
@@ -3201,10 +3314,11 @@ static int __init mlx4_verify_params(void)
 		port_type_array[0] = true;
 	}
 
-	if (mlx4_log_num_mgm_entry_size != -1 &&
-	    (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
-	     mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) {
-		pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-1 or %d..%d)\n",
+	if (mlx4_log_num_mgm_entry_size < -7 ||
+	    (mlx4_log_num_mgm_entry_size > 0 &&
+	     (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
+	      mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE))) {
+		pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-7..0 or %d..%d)\n",
 			mlx4_log_num_mgm_entry_size,
 			MLX4_MIN_MGM_LOG_ENTRY_SIZE,
 			MLX4_MAX_MGM_LOG_ENTRY_SIZE);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index cebd1180702b..bdd4eea2247c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -689,8 +689,6 @@ enum mlx4_qp_table_zones {
 	MLX4_QP_TABLE_ZONE_NUM
 };
 
-#define MLX4_A0_STEERING_TABLE_SIZE    256
-
 struct mlx4_qp_table {
 	struct mlx4_bitmap	*bitmap_gen;
 	struct mlx4_zone_allocator *zones;
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index d8d040c366f4..1586ecce13c7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -712,8 +712,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 	int k;
 	int fixed_reserved_from_bot_rv = 0;
 	int bottom_reserved_for_rss_bitmap;
-	u32 max_table_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
-		MLX4_A0_STEERING_TABLE_SIZE;
+	u32 max_table_offset = dev->caps.dmfs_high_rate_qpn_base +
+			dev->caps.dmfs_high_rate_qpn_range;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 39890cddc5fa..25c791e295fd 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -117,6 +117,14 @@ enum {
 	MLX4_STEERING_MODE_DEVICE_MANAGED
 };
 
+enum {
+	MLX4_STEERING_DMFS_A0_DEFAULT,
+	MLX4_STEERING_DMFS_A0_DYNAMIC,
+	MLX4_STEERING_DMFS_A0_STATIC,
+	MLX4_STEERING_DMFS_A0_DISABLE,
+	MLX4_STEERING_DMFS_A0_NOT_SUPPORTED
+};
+
 static inline const char *mlx4_steering_mode_str(int steering_mode)
 {
 	switch (steering_mode) {
@@ -191,7 +199,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP	= 1LL <<  15,
 	MLX4_DEV_CAP_FLAG2_CONFIG_DEV		= 1LL <<  16,
 	MLX4_DEV_CAP_FLAG2_SYS_EQS		= 1LL <<  17,
-	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18
+	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18,
+	MLX4_DEV_CAP_FLAG2_FS_A0		= 1LL <<  19
 };
 
 enum {
@@ -225,7 +234,8 @@ enum {
 
 enum {
 	MLX4_FUNC_CAP_64B_EQE_CQE	= 1L << 0,
-	MLX4_FUNC_CAP_EQE_CQE_STRIDE	= 1L << 1
+	MLX4_FUNC_CAP_EQE_CQE_STRIDE	= 1L << 1,
+	MLX4_FUNC_CAP_DMFS_A0_STATIC	= 1L << 2
 };
 
 
@@ -482,6 +492,7 @@ struct mlx4_caps {
 	int			reserved_mcgs;
 	int			num_qp_per_mgm;
 	int			steering_mode;
+	int			dmfs_high_steer_mode;
 	int			fs_log_max_ucast_qp_range_size;
 	int			num_pds;
 	int			reserved_pds;
@@ -522,6 +533,8 @@ struct mlx4_caps {
 	int			tunnel_offload_mode;
 	u8			rx_checksum_flags_port[MLX4_MAX_PORTS + 1];
 	u8			alloc_res_qp_mask;
+	u32			dmfs_high_rate_qpn_base;
+	u32			dmfs_high_rate_qpn_range;
 };
 
 struct mlx4_buf_list {
-- 
cgit v1.2.3-58-ga151