diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 20 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 40 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 3 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 155 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 1 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 7 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag.c | 73 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag.h | 65 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c | 315 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h | 26 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 |
14 files changed, 604 insertions, 109 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 17f1a8b28c0a..1a16f6d73cbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -30,7 +30,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o -mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o +mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o # # Core extra diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 75dad8797720..9d38e62cdf24 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -179,7 +179,7 @@ int mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq) /* state lock cannot be grabbed within this function. * It can cause a dead lock or a read-after-free. */ -int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx) +static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx) { return err_ctx->recover(err_ctx->sq); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index bdcc5e79328d..fa2a3c444cdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -54,12 +54,24 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, struct neighbour *n = NULL; #if IS_ENABLED(CONFIG_INET) + struct mlx5_core_dev *mdev = priv->mdev; + struct net_device *uplink_dev; int ret; + if (mlx5_lag_is_multipath(mdev)) { + struct mlx5_eswitch *esw = mdev->priv.eswitch; + + uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH); + fl4->flowi4_oif = uplink_dev->ifindex; + } + rt = ip_route_output_key(dev_net(mirred_dev), fl4); ret = PTR_ERR_OR_ZERO(rt); if (ret) return ret; + + if (mlx5_lag_is_multipath(mdev) && !rt->rt_gateway) + return -ENETUNREACH; #else return -EOPNOTSUPP; #endif @@ -295,7 +307,9 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, if (!(nud_state & NUD_VALID)) { neigh_event_send(n, NULL); - err = -EAGAIN; + /* the encap entry will be made valid on neigh update event + * and not used before that. + */ goto out; } @@ -408,7 +422,9 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, if (!(nud_state & NUD_VALID)) { neigh_event_send(n, NULL); - err = -EAGAIN; + /* the encap entry will be made valid on neigh update event + * and not used before that. + */ goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 4d033e01f6ab..a1a3e2774989 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1573,6 +1573,8 @@ static int mlx5e_init_rep_tx(struct mlx5e_priv *priv) if (rpriv->rep->vport == MLX5_VPORT_UPLINK) { uplink_priv = &rpriv->uplink_priv; + INIT_LIST_HEAD(&uplink_priv->unready_flows); + /* init shared tc flow table */ err = mlx5e_tc_esw_init(&uplink_priv->tc_ht); if (err) @@ -1632,27 +1634,38 @@ static void mlx5e_vf_rep_enable(struct mlx5e_priv *priv) static int uplink_rep_async_event(struct notifier_block *nb, unsigned long event, void *data) { struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, events_nb); - struct mlx5_eqe *eqe = data; - if (event != MLX5_EVENT_TYPE_PORT_CHANGE) - return NOTIFY_DONE; + if (event == MLX5_EVENT_TYPE_PORT_CHANGE) { + struct mlx5_eqe *eqe = data; - switch (eqe->sub_type) { - case MLX5_PORT_CHANGE_SUBTYPE_DOWN: - case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: - queue_work(priv->wq, &priv->update_carrier_work); - break; - default: - return NOTIFY_DONE; + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + queue_work(priv->wq, &priv->update_carrier_work); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; } - return NOTIFY_OK; + if (event == MLX5_DEV_EVENT_PORT_AFFINITY) { + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work); + + return NOTIFY_OK; + } + + return NOTIFY_DONE; } static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) { struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_rep_priv *rpriv = priv->ppriv; u16 max_mtu; netdev->min_mtu = ETH_MIN_MTU; @@ -1660,6 +1673,9 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu); mlx5e_set_dev_port_mtu(priv); + INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work, + mlx5e_tc_reoffload_flows_work); + mlx5_lag_add(mdev, netdev); priv->events_nb.notifier_call = uplink_rep_async_event; mlx5_notifier_register(mdev, &priv->events_nb); @@ -1672,11 +1688,13 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_rep_priv *rpriv = priv->ppriv; #ifdef CONFIG_MLX5_CORE_EN_DCB mlx5e_dcbnl_delete_app(priv); #endif mlx5_notifier_unregister(mdev, &priv->events_nb); + cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work); mlx5_lag_remove(mdev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 1aa3e110bb97..83b573b1abac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -74,6 +74,9 @@ struct mlx5_rep_uplink_priv { struct notifier_block netdevice_nb; struct mlx5_tun_entropy tun_entropy; + + struct list_head unready_flows; + struct work_struct reoffload_flows_work; }; struct mlx5e_rep_priv { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index b38986e18dd7..b4967a0ff8c7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -75,6 +75,7 @@ enum { MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 2), MLX5E_TC_FLOW_SLOW = BIT(MLX5E_TC_FLOW_BASE + 3), MLX5E_TC_FLOW_DUP = BIT(MLX5E_TC_FLOW_BASE + 4), + MLX5E_TC_FLOW_NOT_READY = BIT(MLX5E_TC_FLOW_BASE + 5), }; #define MLX5E_TC_MAX_SPLITS 1 @@ -116,6 +117,7 @@ struct mlx5e_tc_flow { struct list_head mod_hdr; /* flows sharing the same mod hdr ID */ struct list_head hairpin; /* flows sharing the same hairpin */ struct list_head peer; /* flows with peer flow */ + struct list_head unready; /* flows not ready to be offloaded (e.g due to missing route) */ union { struct mlx5_esw_flow_attr esw_attr[0]; struct mlx5_nic_flow_attr nic_attr[0]; @@ -850,12 +852,12 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, int out_index); static int mlx5e_attach_encap(struct mlx5e_priv *priv, - struct ip_tunnel_info *tun_info, - struct net_device *mirred_dev, - struct net_device **encap_dev, struct mlx5e_tc_flow *flow, + struct net_device *mirred_dev, + int out_index, struct netlink_ext_ack *extack, - int out_index); + struct net_device **encap_dev, + bool *encap_valid); static struct mlx5_flow_handle * mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw, @@ -927,6 +929,26 @@ mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw, flow->flags &= ~MLX5E_TC_FLOW_SLOW; } +static void add_unready_flow(struct mlx5e_tc_flow *flow) +{ + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *rpriv; + struct mlx5_eswitch *esw; + + esw = flow->priv->mdev->priv.eswitch; + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &rpriv->uplink_priv; + + flow->flags |= MLX5E_TC_FLOW_NOT_READY; + list_add_tail(&flow->unready, &uplink_priv->unready_flows); +} + +static void remove_unready_flow(struct mlx5e_tc_flow *flow) +{ + list_del(&flow->unready); + flow->flags &= ~MLX5E_TC_FLOW_NOT_READY; +} + static int mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, @@ -941,7 +963,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5_fc *counter = NULL; struct mlx5e_rep_priv *rpriv; struct mlx5e_priv *out_priv; - int err = 0, encap_err = 0; + bool encap_valid = true; + int err = 0; int out_index; if (!mlx5_eswitch_prios_supported(esw) && attr->prio != 1) { @@ -970,14 +993,11 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, mirred_ifindex = parse_attr->mirred_ifindex[out_index]; out_dev = __dev_get_by_index(dev_net(priv->netdev), mirred_ifindex); - err = mlx5e_attach_encap(priv, - &parse_attr->tun_info[out_index], - out_dev, &encap_dev, flow, - extack, out_index); - if (err && err != -EAGAIN) + err = mlx5e_attach_encap(priv, flow, out_dev, out_index, + extack, &encap_dev, &encap_valid); + if (err) goto err_attach_encap; - if (err == -EAGAIN) - encap_err = err; + out_priv = netdev_priv(encap_dev); rpriv = out_priv->ppriv; attr->dests[out_index].rep = rpriv->rep; @@ -1005,10 +1025,11 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, attr->counter = counter; } - /* we get here if (1) there's no error or when - * (2) there's an encap action and we're on -EAGAIN (no valid neigh) + /* we get here if one of the following takes place: + * (1) there's no error + * (2) there's an encap action and we don't have valid neigh */ - if (encap_err == -EAGAIN) { + if (!encap_valid) { /* continue with goto slow path rule instead */ struct mlx5_esw_flow_attr slow_attr; @@ -1048,6 +1069,12 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, struct mlx5_esw_flow_attr slow_attr; int out_index; + if (flow->flags & MLX5E_TC_FLOW_NOT_READY) { + remove_unready_flow(flow); + kvfree(attr->parse_attr); + return; + } + if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) { if (flow->flags & MLX5E_TC_FLOW_SLOW) mlx5e_tc_unoffload_from_slow_path(esw, flow, &slow_attr); @@ -2333,31 +2360,37 @@ static bool is_merged_eswitch_dev(struct mlx5e_priv *priv, peer_priv = netdev_priv(peer_netdev); return (MLX5_CAP_ESW(priv->mdev, merged_eswitch) && - (priv->netdev->netdev_ops == peer_netdev->netdev_ops) && - same_hw_devs(priv, peer_priv) && - MLX5_VPORT_MANAGER(peer_priv->mdev) && - (peer_priv->mdev->priv.eswitch->mode == SRIOV_OFFLOADS)); + mlx5e_eswitch_rep(priv->netdev) && + mlx5e_eswitch_rep(peer_netdev) && + same_hw_devs(priv, peer_priv)); } static int mlx5e_attach_encap(struct mlx5e_priv *priv, - struct ip_tunnel_info *tun_info, - struct net_device *mirred_dev, - struct net_device **encap_dev, struct mlx5e_tc_flow *flow, + struct net_device *mirred_dev, + int out_index, struct netlink_ext_ack *extack, - int out_index) + struct net_device **encap_dev, + bool *encap_valid) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - unsigned short family = ip_tunnel_info_af(tun_info); struct mlx5_esw_flow_attr *attr = flow->esw_attr; - struct ip_tunnel_key *key = &tun_info->key; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct ip_tunnel_info *tun_info; + struct ip_tunnel_key *key; struct mlx5e_encap_entry *e; + unsigned short family; uintptr_t hash_key; bool found = false; int err = 0; + parse_attr = attr->parse_attr; + tun_info = &parse_attr->tun_info[out_index]; + family = ip_tunnel_info_af(tun_info); + key = &tun_info->key; + hash_key = hash_encap_info(key); hash_for_each_possible_rcu(esw->offloads.encap_tbl, e, @@ -2388,7 +2421,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, else if (family == AF_INET6) err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e); - if (err && err != -EAGAIN) + if (err) goto out_err; hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key); @@ -2400,8 +2433,9 @@ attach_flow: if (e->flags & MLX5_ENCAP_ENTRY_VALID) { attr->dests[out_index].encap_id = e->encap_id; attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; + *encap_valid = true; } else { - err = -EAGAIN; + *encap_valid = false; } return err; @@ -2697,8 +2731,15 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow) bool esw_paired = mlx5_devcom_is_paired(attr->in_mdev->priv.devcom, MLX5_DEVCOM_ESW_OFFLOADS); - return esw_paired && mlx5_lag_is_sriov(attr->in_mdev) && - (is_rep_ingress || act_is_encap); + if (!esw_paired) + return false; + + if ((mlx5_lag_is_sriov(attr->in_mdev) || + mlx5_lag_is_multipath(attr->in_mdev)) && + (is_rep_ingress || act_is_encap)) + return true; + + return false; } static int @@ -2793,8 +2834,12 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv, goto err_free; err = mlx5e_tc_add_fdb_flow(priv, flow, extack); - if (err) - goto err_free; + if (err) { + if (!(err == -ENETUNREACH && mlx5_lag_is_multipath(in_mdev))) + goto err_free; + + add_unready_flow(flow); + } return flow; @@ -2806,7 +2851,8 @@ out: } static int mlx5e_tc_add_fdb_peer_flow(struct tc_cls_flower_offload *f, - struct mlx5e_tc_flow *flow) + struct mlx5e_tc_flow *flow, + u16 flow_flags) { struct mlx5e_priv *priv = flow->priv, *peer_priv; struct mlx5_eswitch *esw = priv->mdev->priv.eswitch, *peer_esw; @@ -2835,7 +2881,7 @@ static int mlx5e_tc_add_fdb_peer_flow(struct tc_cls_flower_offload *f, in_mdev = priv->mdev; parse_attr = flow->esw_attr->parse_attr; - peer_flow = __mlx5e_add_fdb_flow(peer_priv, f, flow->flags, + peer_flow = __mlx5e_add_fdb_flow(peer_priv, f, flow_flags, parse_attr->filter_dev, flow->esw_attr->in_rep, in_mdev); if (IS_ERR(peer_flow)) { @@ -2873,7 +2919,7 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv, return PTR_ERR(flow); if (is_peer_flow_needed(flow)) { - err = mlx5e_tc_add_fdb_peer_flow(f, flow); + err = mlx5e_tc_add_fdb_peer_flow(f, flow, flow_flags); if (err) { mlx5e_tc_del_fdb_flow(priv, flow); goto out; @@ -3038,23 +3084,25 @@ int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv, struct mlx5_eswitch *peer_esw; struct mlx5e_tc_flow *flow; struct mlx5_fc *counter; - u64 bytes; - u64 packets; - u64 lastuse; + u64 lastuse = 0; + u64 packets = 0; + u64 bytes = 0; flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params); if (!flow || !same_flow_direction(flow, flags)) return -EINVAL; - if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED)) - return 0; - - counter = mlx5e_tc_get_counter(flow); - if (!counter) - return 0; + if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) { + counter = mlx5e_tc_get_counter(flow); + if (!counter) + return 0; - mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + } + /* Under multipath it's possible for one rule to be currently + * un-offloaded while the other rule is offloaded. + */ peer_esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); if (!peer_esw) goto out; @@ -3066,6 +3114,8 @@ int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv, u64 lastuse2; counter = mlx5e_tc_get_counter(flow->peer_flow); + if (!counter) + goto no_peer_counter; mlx5_fc_query_cached(counter, &bytes2, &packets2, &lastuse2); bytes += bytes2; @@ -3073,8 +3123,8 @@ int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv, lastuse = max_t(u64, lastuse, lastuse2); } +no_peer_counter: mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); - out: flow_stats_update(&f->stats, bytes, packets, lastuse); @@ -3196,3 +3246,18 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw) list_for_each_entry_safe(flow, tmp, &esw->offloads.peer_flows, peer) __mlx5e_tc_del_fdb_peer_flow(flow); } + +void mlx5e_tc_reoffload_flows_work(struct work_struct *work) +{ + struct mlx5_rep_uplink_priv *rpriv = + container_of(work, struct mlx5_rep_uplink_priv, + reoffload_flows_work); + struct mlx5e_tc_flow *flow, *tmp; + + rtnl_lock(); + list_for_each_entry_safe(flow, tmp, &rpriv->unready_flows, unready) { + if (!mlx5e_tc_add_fdb_flow(flow->priv, flow, NULL)) + remove_unready_flow(flow); + } + rtnl_unlock(); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index d2d87f978c06..f62e81902d27 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -72,6 +72,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe); int mlx5e_tc_num_filters(struct mlx5e_priv *priv, int flags); +void mlx5e_tc_reoffload_flows_work(struct work_struct *work); #else /* CONFIG_MLX5_ESWITCH */ static inline int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 84200d93cf9d..d0b28251abf2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -2476,3 +2476,10 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1) return false; } + +bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0, + struct mlx5_core_dev *dev1) +{ + return (dev0->priv.eswitch->mode == SRIOV_OFFLOADS && + dev1->priv.eswitch->mode == SRIOV_OFFLOADS); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 5f82e637410b..3f3cd32ae60a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -371,6 +371,8 @@ static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1); +bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0, + struct mlx5_core_dev *dev1); #define MLX5_DEBUG_ESWITCH_MASK BIT(3) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index 04c5aca7f8c5..48aa6e030bcf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -35,37 +35,8 @@ #include <linux/mlx5/vport.h> #include "mlx5_core.h" #include "eswitch.h" - -enum { - MLX5_LAG_FLAG_ROCE = 1 << 0, - MLX5_LAG_FLAG_SRIOV = 1 << 1, -}; - -#define MLX5_LAG_MODE_FLAGS (MLX5_LAG_FLAG_ROCE | MLX5_LAG_FLAG_SRIOV) - -struct lag_func { - struct mlx5_core_dev *dev; - struct net_device *netdev; -}; - -/* Used for collection of netdev event info. */ -struct lag_tracker { - enum netdev_lag_tx_type tx_type; - struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS]; - bool is_bonded; -}; - -/* LAG data of a ConnectX card. - * It serves both its phys functions. - */ -struct mlx5_lag { - u8 flags; - u8 v2p_map[MLX5_MAX_PORTS]; - struct lag_func pf[MLX5_MAX_PORTS]; - struct lag_tracker tracker; - struct delayed_work bond_work; - struct notifier_block nb; -}; +#include "lag.h" +#include "lag_mp.h" /* General purpose, use for short periods of time. * Beware of lock dependencies (preferably, no locks should be acquired @@ -147,13 +118,8 @@ static int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); } -static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev) -{ - return dev->priv.lag; -} - -static int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, - struct net_device *ndev) +int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, + struct net_device *ndev) { int i; @@ -174,11 +140,6 @@ static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev) return !!(ldev->flags & MLX5_LAG_FLAG_SRIOV); } -static bool __mlx5_lag_is_active(struct mlx5_lag *ldev) -{ - return !!(ldev->flags & MLX5_LAG_MODE_FLAGS); -} - static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, u8 *port1, u8 *port2) { @@ -195,8 +156,8 @@ static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, *port2 = 1; } -static void mlx5_modify_lag(struct mlx5_lag *ldev, - struct lag_tracker *tracker) +void mlx5_modify_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker) { struct mlx5_core_dev *dev0 = ldev->pf[0].dev; u8 v2p_port1, v2p_port2; @@ -241,9 +202,9 @@ static int mlx5_create_lag(struct mlx5_lag *ldev, return err; } -static int mlx5_activate_lag(struct mlx5_lag *ldev, - struct lag_tracker *tracker, - u8 flags) +int mlx5_activate_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + u8 flags) { bool roce_lag = !!(flags & MLX5_LAG_FLAG_ROCE); struct mlx5_core_dev *dev0 = ldev->pf[0].dev; @@ -386,7 +347,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay) { - schedule_delayed_work(&ldev->bond_work, delay); + queue_delayed_work(ldev->wq, &ldev->bond_work, delay); } static void mlx5_do_bond_work(struct work_struct *work) @@ -538,6 +499,12 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(void) if (!ldev) return NULL; + ldev->wq = create_singlethread_workqueue("mlx5_lag"); + if (!ldev->wq) { + kfree(ldev); + return NULL; + } + INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); return ldev; @@ -545,6 +512,7 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(void) static void mlx5_lag_dev_free(struct mlx5_lag *ldev) { + destroy_workqueue(ldev->wq); kfree(ldev); } @@ -592,6 +560,7 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev) { struct mlx5_lag *ldev = NULL; struct mlx5_core_dev *tmp_dev; + int err; if (!MLX5_CAP_GEN(dev, vport_group_manager) || !MLX5_CAP_GEN(dev, lag_master) || @@ -619,6 +588,11 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev) mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); } } + + err = mlx5_lag_mp_init(ldev); + if (err) + mlx5_core_err(dev, "Failed to init multipath lag err=%d\n", + err); } int mlx5_lag_get_pf_num(struct mlx5_core_dev *dev, int *pf_num) @@ -664,6 +638,7 @@ void mlx5_lag_remove(struct mlx5_core_dev *dev) if (i == MLX5_MAX_PORTS) { if (ldev->nb.notifier_call) unregister_netdevice_notifier(&ldev->nb); + mlx5_lag_mp_cleanup(ldev); cancel_delayed_work_sync(&ldev->bond_work); mlx5_lag_dev_free(ldev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag.h new file mode 100644 index 000000000000..1dea0b1c9826 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_LAG_H__ +#define __MLX5_LAG_H__ + +#include "mlx5_core.h" +#include "lag_mp.h" + +enum { + MLX5_LAG_FLAG_ROCE = 1 << 0, + MLX5_LAG_FLAG_SRIOV = 1 << 1, + MLX5_LAG_FLAG_MULTIPATH = 1 << 2, +}; + +#define MLX5_LAG_MODE_FLAGS (MLX5_LAG_FLAG_ROCE | MLX5_LAG_FLAG_SRIOV |\ + MLX5_LAG_FLAG_MULTIPATH) + +struct lag_func { + struct mlx5_core_dev *dev; + struct net_device *netdev; +}; + +/* Used for collection of netdev event info. */ +struct lag_tracker { + enum netdev_lag_tx_type tx_type; + struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS]; + unsigned int is_bonded:1; +}; + +/* LAG data of a ConnectX card. + * It serves both its phys functions. + */ +struct mlx5_lag { + u8 flags; + u8 v2p_map[MLX5_MAX_PORTS]; + struct lag_func pf[MLX5_MAX_PORTS]; + struct lag_tracker tracker; + struct workqueue_struct *wq; + struct delayed_work bond_work; + struct notifier_block nb; + struct lag_mp lag_mp; +}; + +static inline struct mlx5_lag * +mlx5_lag_dev_get(struct mlx5_core_dev *dev) +{ + return dev->priv.lag; +} + +static inline bool +__mlx5_lag_is_active(struct mlx5_lag *ldev) +{ + return !!(ldev->flags & MLX5_LAG_MODE_FLAGS); +} + +void mlx5_modify_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker); +int mlx5_activate_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + u8 flags); +int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, + struct net_device *ndev); + +#endif /* __MLX5_LAG_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c new file mode 100644 index 000000000000..5633f8572800 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include <linux/netdevice.h> +#include "lag.h" +#include "lag_mp.h" +#include "mlx5_core.h" +#include "eswitch.h" +#include "lib/mlx5.h" + +static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev) +{ + if (!ldev->pf[0].dev || !ldev->pf[1].dev) + return false; + + return mlx5_esw_multipath_prereq(ldev->pf[0].dev, ldev->pf[1].dev); +} + +static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev) +{ + return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH); +} + +bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + bool res; + + ldev = mlx5_lag_dev_get(dev); + res = ldev && __mlx5_lag_is_multipath(ldev); + + return res; +} + +/** + * Set lag port affinity + * + * @ldev: lag device + * @port: + * 0 - set normal affinity. + * 1 - set affinity to port 1. + * 2 - set affinity to port 2. + * + **/ +static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, int port) +{ + struct lag_tracker tracker; + + if (!__mlx5_lag_is_multipath(ldev)) + return; + + switch (port) { + case 0: + tracker.netdev_state[0].tx_enabled = true; + tracker.netdev_state[1].tx_enabled = true; + tracker.netdev_state[0].link_up = true; + tracker.netdev_state[1].link_up = true; + break; + case 1: + tracker.netdev_state[0].tx_enabled = true; + tracker.netdev_state[0].link_up = true; + tracker.netdev_state[1].tx_enabled = false; + tracker.netdev_state[1].link_up = false; + break; + case 2: + tracker.netdev_state[0].tx_enabled = false; + tracker.netdev_state[0].link_up = false; + tracker.netdev_state[1].tx_enabled = true; + tracker.netdev_state[1].link_up = true; + break; + default: + mlx5_core_warn(ldev->pf[0].dev, "Invalid affinity port %d", + port); + return; + } + + if (tracker.netdev_state[0].tx_enabled) + mlx5_notifier_call_chain(ldev->pf[0].dev->priv.events, + MLX5_DEV_EVENT_PORT_AFFINITY, + (void *)0); + + if (tracker.netdev_state[1].tx_enabled) + mlx5_notifier_call_chain(ldev->pf[1].dev->priv.events, + MLX5_DEV_EVENT_PORT_AFFINITY, + (void *)0); + + mlx5_modify_lag(ldev, &tracker); +} + +static void mlx5_lag_fib_event_flush(struct notifier_block *nb) +{ + struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb); + struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp); + + flush_workqueue(ldev->wq); +} + +struct mlx5_fib_event_work { + struct work_struct work; + struct mlx5_lag *ldev; + unsigned long event; + union { + struct fib_entry_notifier_info fen_info; + struct fib_nh_notifier_info fnh_info; + }; +}; + +static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, + unsigned long event, + struct fib_info *fi) +{ + struct lag_mp *mp = &ldev->lag_mp; + + /* Handle delete event */ + if (event == FIB_EVENT_ENTRY_DEL) { + /* stop track */ + if (mp->mfi == fi) + mp->mfi = NULL; + return; + } + + /* Handle add/replace event */ + if (fi->fib_nhs == 1) { + if (__mlx5_lag_is_active(ldev)) { + struct net_device *nh_dev = fi->fib_nh[0].nh_dev; + int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev); + + mlx5_lag_set_port_affinity(ldev, ++i); + } + return; + } + + if (fi->fib_nhs != 2) + return; + + /* Verify next hops are ports of the same hca */ + if (!(fi->fib_nh[0].nh_dev == ldev->pf[0].netdev && + fi->fib_nh[1].nh_dev == ldev->pf[1].netdev) && + !(fi->fib_nh[0].nh_dev == ldev->pf[1].netdev && + fi->fib_nh[1].nh_dev == ldev->pf[0].netdev)) { + mlx5_core_warn(ldev->pf[0].dev, "Multipath offload require two ports of the same HCA\n"); + return; + } + + /* First time we see multipath route */ + if (!mp->mfi && !__mlx5_lag_is_active(ldev)) { + struct lag_tracker tracker; + + tracker = ldev->tracker; + mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH); + } + + mlx5_lag_set_port_affinity(ldev, 0); + mp->mfi = fi; +} + +static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, + unsigned long event, + struct fib_nh *fib_nh, + struct fib_info *fi) +{ + struct lag_mp *mp = &ldev->lag_mp; + + /* Check the nh event is related to the route */ + if (!mp->mfi || mp->mfi != fi) + return; + + /* nh added/removed */ + if (event == FIB_EVENT_NH_DEL) { + int i = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->nh_dev); + + if (i >= 0) { + i = (i + 1) % 2 + 1; /* peer port */ + mlx5_lag_set_port_affinity(ldev, i); + } + } else if (event == FIB_EVENT_NH_ADD && + fi->fib_nhs == 2) { + mlx5_lag_set_port_affinity(ldev, 0); + } +} + +static void mlx5_lag_fib_update(struct work_struct *work) +{ + struct mlx5_fib_event_work *fib_work = + container_of(work, struct mlx5_fib_event_work, work); + struct mlx5_lag *ldev = fib_work->ldev; + struct fib_nh *fib_nh; + + /* Protect internal structures from changes */ + rtnl_lock(); + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_APPEND: /* fall through */ + case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + mlx5_lag_fib_route_event(ldev, fib_work->event, + fib_work->fen_info.fi); + fib_info_put(fib_work->fen_info.fi); + break; + case FIB_EVENT_NH_ADD: /* fall through */ + case FIB_EVENT_NH_DEL: + fib_nh = fib_work->fnh_info.fib_nh; + mlx5_lag_fib_nexthop_event(ldev, + fib_work->event, + fib_work->fnh_info.fib_nh, + fib_nh->nh_parent); + fib_info_put(fib_work->fnh_info.fib_nh->nh_parent); + break; + } + + rtnl_unlock(); + kfree(fib_work); +} + +static struct mlx5_fib_event_work * +mlx5_lag_init_fib_work(struct mlx5_lag *ldev, unsigned long event) +{ + struct mlx5_fib_event_work *fib_work; + + fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC); + if (WARN_ON(!fib_work)) + return NULL; + + INIT_WORK(&fib_work->work, mlx5_lag_fib_update); + fib_work->ldev = ldev; + fib_work->event = event; + + return fib_work; +} + +static int mlx5_lag_fib_event(struct notifier_block *nb, + unsigned long event, + void *ptr) +{ + struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb); + struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp); + struct fib_notifier_info *info = ptr; + struct mlx5_fib_event_work *fib_work; + struct fib_entry_notifier_info *fen_info; + struct fib_nh_notifier_info *fnh_info; + struct fib_info *fi; + + if (info->family != AF_INET) + return NOTIFY_DONE; + + if (!mlx5_lag_multipath_check_prereq(ldev)) + return NOTIFY_DONE; + + switch (event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_APPEND: /* fall through */ + case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + fen_info = container_of(info, struct fib_entry_notifier_info, + info); + fi = fen_info->fi; + if (fi->fib_dev != ldev->pf[0].netdev && + fi->fib_dev != ldev->pf[1].netdev) { + return NOTIFY_DONE; + } + fib_work = mlx5_lag_init_fib_work(ldev, event); + if (!fib_work) + return NOTIFY_DONE; + fib_work->fen_info = *fen_info; + /* Take reference on fib_info to prevent it from being + * freed while work is queued. Release it afterwards. + */ + fib_info_hold(fib_work->fen_info.fi); + break; + case FIB_EVENT_NH_ADD: /* fall through */ + case FIB_EVENT_NH_DEL: + fnh_info = container_of(info, struct fib_nh_notifier_info, + info); + fib_work = mlx5_lag_init_fib_work(ldev, event); + if (!fib_work) + return NOTIFY_DONE; + fib_work->fnh_info = *fnh_info; + fib_info_hold(fib_work->fnh_info.fib_nh->nh_parent); + break; + default: + return NOTIFY_DONE; + } + + queue_work(ldev->wq, &fib_work->work); + + return NOTIFY_DONE; +} + +int mlx5_lag_mp_init(struct mlx5_lag *ldev) +{ + struct lag_mp *mp = &ldev->lag_mp; + int err; + + if (mp->fib_nb.notifier_call) + return 0; + + mp->fib_nb.notifier_call = mlx5_lag_fib_event; + err = register_fib_notifier(&mp->fib_nb, + mlx5_lag_fib_event_flush); + if (err) + mp->fib_nb.notifier_call = NULL; + + return err; +} + +void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) +{ + struct lag_mp *mp = &ldev->lag_mp; + + if (!mp->fib_nb.notifier_call) + return; + + unregister_fib_notifier(&mp->fib_nb); + mp->fib_nb.notifier_call = NULL; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h new file mode 100644 index 000000000000..6d14b1100be9 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_LAG_MP_H__ +#define __MLX5_LAG_MP_H__ + +#include "lag.h" +#include "mlx5_core.h" + +struct lag_mp { + struct notifier_block fib_nb; + struct fib_info *mfi; /* used in tracking fib events */ +}; + +#ifdef CONFIG_MLX5_ESWITCH + +int mlx5_lag_mp_init(struct mlx5_lag *ldev); +void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev); + +#else /* CONFIG_MLX5_ESWITCH */ + +static inline int mlx5_lag_mp_init(struct mlx5_lag *ldev) { return 0; } +static inline void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) {} + +#endif /* CONFIG_MLX5_ESWITCH */ +#endif /* __MLX5_LAG_MP_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 40d591c8e76c..8391dde869a7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1486,6 +1486,8 @@ static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, 0x101a), MLX5_PCI_DEV_IS_VF}, /* ConnectX-5 Ex VF */ { PCI_VDEVICE(MELLANOX, 0x101b) }, /* ConnectX-6 */ { PCI_VDEVICE(MELLANOX, 0x101c), MLX5_PCI_DEV_IS_VF}, /* ConnectX-6 VF */ + { PCI_VDEVICE(MELLANOX, 0x101d) }, /* ConnectX-6 Dx */ + { PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF}, /* ConnectX Family mlx5Gen Virtual Function */ { PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */ { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */ { 0, } |