diff options
72 files changed, 649 insertions, 429 deletions
diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 129f413ea349..3abc576ff797 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -61,22 +61,6 @@ attribute-sets: nested-attributes: bitset-bits - - name: u64-array - attributes: - - - name: u64 - type: nest - multi-attr: true - nested-attributes: u64 - - - name: s32-array - attributes: - - - name: s32 - type: nest - multi-attr: true - nested-attributes: s32 - - name: string attributes: - @@ -705,16 +689,16 @@ attribute-sets: type: u8 - name: corrected - type: nest - nested-attributes: u64-array + type: binary + sub-type: u64 - name: uncorr - type: nest - nested-attributes: u64-array + type: binary + sub-type: u64 - name: corr-bits - type: nest - nested-attributes: u64-array + type: binary + sub-type: u64 - name: fec attributes: @@ -827,8 +811,8 @@ attribute-sets: type: u32 - name: index - type: nest - nested-attributes: s32-array + type: binary + sub-type: s32 - name: module attributes: diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst index 3a7a714cc08f..3354ca3608ee 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst @@ -40,6 +40,7 @@ flow_steering_mode: Device flow steering mode --------------------------------------------- The flow steering mode parameter controls the flow steering mode of the driver. Two modes are supported: + 1. 'dmfs' - Device managed flow steering. 2. 'smfs' - Software/Driver managed flow steering. @@ -99,6 +100,7 @@ between representors and stacked devices. By default metadata is enabled on the supported devices in E-switch. Metadata is applicable only for E-switch in switchdev mode and users may disable it when NONE of the below use cases will be in use: + 1. HCA is in Dual/multi-port RoCE mode. 2. VF/SF representor bonding (Usually used for Live migration) 3. Stacked devices @@ -180,7 +182,8 @@ User commands examples: $ devlink health diagnose pci/0000:82:00.0 reporter tx -NOTE: This command has valid output only when interface is up, otherwise the command has empty output. +.. note:: + This command has valid output only when interface is up, otherwise the command has empty output. - Show number of tx errors indicated, number of recover flows ended successfully, is autorecover enabled and graceful period from last recover:: @@ -232,8 +235,9 @@ User commands examples: $ devlink health dump show pci/0000:82:00.0 reporter fw -NOTE: This command can run only on the PF which has fw tracer ownership, -running it on other PF or any VF will return "Operation not permitted". +.. note:: + This command can run only on the PF which has fw tracer ownership, + running it on other PF or any VF will return "Operation not permitted". fw fatal reporter ----------------- @@ -256,7 +260,8 @@ User commands examples: $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal -NOTE: This command can run only on PF. +.. note:: + This command can run only on PF. vnic reporter ------------- @@ -265,28 +270,37 @@ It is responsible for querying the vnic diagnostic counters from fw and displayi them in realtime. Description of the vnic counters: -total_q_under_processor_handle: number of queues in an error state due to -an async error or errored command. -send_queue_priority_update_flow: number of QP/SQ priority/SL update -events. -cq_overrun: number of times CQ entered an error state due to an -overflow. -async_eq_overrun: number of times an EQ mapped to async events was -overrun. -comp_eq_overrun: number of times an EQ mapped to completion events was -overrun. -quota_exceeded_command: number of commands issued and failed due to quota -exceeded. -invalid_command: number of commands issued and failed dues to any reason -other than quota exceeded. -nic_receive_steering_discard: number of packets that completed RX flow -steering but were discarded due to a mismatch in flow table. + +- total_q_under_processor_handle + number of queues in an error state due to + an async error or errored command. +- send_queue_priority_update_flow + number of QP/SQ priority/SL update events. +- cq_overrun + number of times CQ entered an error state due to an overflow. +- async_eq_overrun + number of times an EQ mapped to async events was overrun. + comp_eq_overrun number of times an EQ mapped to completion events was + overrun. +- quota_exceeded_command + number of commands issued and failed due to quota exceeded. +- invalid_command + number of commands issued and failed dues to any reason other than quota + exceeded. +- nic_receive_steering_discard + number of packets that completed RX flow + steering but were discarded due to a mismatch in flow table. User commands examples: -- Diagnose PF/VF vnic counters + +- Diagnose PF/VF vnic counters:: + $ devlink health diagnose pci/0000:82:00.1 reporter vnic + - Diagnose representor vnic counters (performed by supplying devlink port of the - representor, which can be obtained via devlink port command) + representor, which can be obtained via devlink port command):: + $ devlink health diagnose pci/0000:82:00.1/65537 reporter vnic -NOTE: This command can run over all interfaces such as PF/VF and representor ports. +.. note:: + This command can run over all interfaces such as PF/VF and representor ports. diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 64a2f2f83735..08a46ffd53af 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -7170,7 +7170,7 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev) goto out; } if (chip->reset) - usleep_range(1000, 2000); + usleep_range(10000, 20000); /* Detect if the device is configured in single chip addressing mode, * otherwise continue with address specific smi init/detection. diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 33a9574e9e04..32d2c6fac652 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -1329,7 +1329,7 @@ static enum xgbe_mode xgbe_phy_status_aneg(struct xgbe_prv_data *pdata) return pdata->phy_if.phy_impl.an_outcome(pdata); } -static void xgbe_phy_status_result(struct xgbe_prv_data *pdata) +static bool xgbe_phy_status_result(struct xgbe_prv_data *pdata) { struct ethtool_link_ksettings *lks = &pdata->phy.lks; enum xgbe_mode mode; @@ -1367,8 +1367,13 @@ static void xgbe_phy_status_result(struct xgbe_prv_data *pdata) pdata->phy.duplex = DUPLEX_FULL; - if (xgbe_set_mode(pdata, mode) && pdata->an_again) + if (!xgbe_set_mode(pdata, mode)) + return false; + + if (pdata->an_again) xgbe_phy_reconfig_aneg(pdata); + + return true; } static void xgbe_phy_status(struct xgbe_prv_data *pdata) @@ -1398,7 +1403,8 @@ static void xgbe_phy_status(struct xgbe_prv_data *pdata) return; } - xgbe_phy_status_result(pdata); + if (xgbe_phy_status_result(pdata)) + return; if (test_bit(XGBE_LINK_INIT, &pdata->dev_state)) clear_bit(XGBE_LINK_INIT, &pdata->dev_state); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 059bd911c51d..52d0a126eb61 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1152,11 +1152,11 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_pkts = 0; unsigned int offset = rx_ring->rx_offset; struct xdp_buff *xdp = &rx_ring->xdp; + u32 cached_ntc = rx_ring->first_desc; struct ice_tx_ring *xdp_ring = NULL; struct bpf_prog *xdp_prog = NULL; u32 ntc = rx_ring->next_to_clean; u32 cnt = rx_ring->count; - u32 cached_ntc = ntc; u32 xdp_xmit = 0; u32 cached_ntu; bool failure; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index f40497823e65..7c0f2adbea00 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -490,7 +490,7 @@ static void poll_trace(struct mlx5_fw_tracer *tracer, (u64)timestamp_low; break; default: - if (tracer_event->event_id >= tracer->str_db.first_string_trace || + if (tracer_event->event_id >= tracer->str_db.first_string_trace && tracer_event->event_id <= tracer->str_db.first_string_trace + tracer->str_db.num_string_trace) { tracer_event->type = TRACER_EVENT_TYPE_STRING; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index b8987a404d75..8e999f238194 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -327,6 +327,7 @@ struct mlx5e_params { unsigned int sw_mtu; int hard_mtu; bool ptp_rx; + __be32 terminate_lkey_be; }; static inline u8 mlx5e_get_dcb_num_tc(struct mlx5e_params *params) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index 7ac1ad9c46de..7e8e96cc5cd0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -51,7 +51,7 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, if (err) goto out; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { buffer = MLX5_ADDR_OF(pbmc_reg, out, buffer[i]); port_buffer->buffer[i].lossy = MLX5_GET(bufferx_reg, buffer, lossy); @@ -73,14 +73,24 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, port_buffer->buffer[i].lossy); } - port_buffer->headroom_size = total_used; + port_buffer->internal_buffers_size = 0; + for (i = MLX5E_MAX_NETWORK_BUFFER; i < MLX5E_TOTAL_BUFFERS; i++) { + buffer = MLX5_ADDR_OF(pbmc_reg, out, buffer[i]); + port_buffer->internal_buffers_size += + MLX5_GET(bufferx_reg, buffer, size) * port_buff_cell_sz; + } + port_buffer->port_buffer_size = MLX5_GET(pbmc_reg, out, port_buffer_size) * port_buff_cell_sz; - port_buffer->spare_buffer_size = - port_buffer->port_buffer_size - total_used; - - mlx5e_dbg(HW, priv, "total buffer size=%d, spare buffer size=%d\n", - port_buffer->port_buffer_size, + port_buffer->headroom_size = total_used; + port_buffer->spare_buffer_size = port_buffer->port_buffer_size - + port_buffer->internal_buffers_size - + port_buffer->headroom_size; + + mlx5e_dbg(HW, priv, + "total buffer size=%u, headroom buffer size=%u, internal buffers size=%u, spare buffer size=%u\n", + port_buffer->port_buffer_size, port_buffer->headroom_size, + port_buffer->internal_buffers_size, port_buffer->spare_buffer_size); out: kfree(out); @@ -206,11 +216,11 @@ static int port_update_pool_cfg(struct mlx5_core_dev *mdev, if (!MLX5_CAP_GEN(mdev, sbcam_reg)) return 0; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) lossless_buff_count += ((port_buffer->buffer[i].size) && (!(port_buffer->buffer[i].lossy))); - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { p = select_sbcm_params(&port_buffer->buffer[i], lossless_buff_count); err = mlx5e_port_set_sbcm(mdev, 0, i, MLX5_INGRESS_DIR, @@ -293,7 +303,7 @@ static int port_set_buffer(struct mlx5e_priv *priv, if (err) goto out; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); u64 size = port_buffer->buffer[i].size; u64 xoff = port_buffer->buffer[i].xoff; @@ -351,7 +361,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, { int i; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { if (port_buffer->buffer[i].lossy) { port_buffer->buffer[i].xoff = 0; port_buffer->buffer[i].xon = 0; @@ -408,7 +418,7 @@ static int update_buffer_lossy(struct mlx5_core_dev *mdev, int err; int i; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { prio_count = 0; lossy_count = 0; @@ -432,11 +442,11 @@ static int update_buffer_lossy(struct mlx5_core_dev *mdev, } if (changed) { - err = port_update_pool_cfg(mdev, port_buffer); + err = update_xoff_threshold(port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; - err = update_xoff_threshold(port_buffer, xoff, max_mtu, port_buff_cell_sz); + err = port_update_pool_cfg(mdev, port_buffer); if (err) return err; @@ -515,7 +525,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (change & MLX5E_PORT_BUFFER_PRIO2BUFFER) { update_prio2buffer = true; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) mlx5e_dbg(HW, priv, "%s: requested to map prio[%d] to buffer %d\n", __func__, i, prio2buffer[i]); @@ -530,7 +540,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, } if (change & MLX5E_PORT_BUFFER_SIZE) { - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { mlx5e_dbg(HW, priv, "%s: buffer[%d]=%d\n", __func__, i, buffer_size[i]); if (!port_buffer.buffer[i].lossy && !buffer_size[i]) { mlx5e_dbg(HW, priv, "%s: lossless buffer[%d] size cannot be zero\n", @@ -544,7 +554,9 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, mlx5e_dbg(HW, priv, "%s: total buffer requested=%d\n", __func__, total_used); - if (total_used > port_buffer.port_buffer_size) + if (total_used > port_buffer.headroom_size && + (total_used - port_buffer.headroom_size) > + port_buffer.spare_buffer_size) return -EINVAL; update_buffer = true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h index a6ef118de758..f4a19ffbb641 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h @@ -35,7 +35,8 @@ #include "en.h" #include "port.h" -#define MLX5E_MAX_BUFFER 8 +#define MLX5E_MAX_NETWORK_BUFFER 8 +#define MLX5E_TOTAL_BUFFERS 10 #define MLX5E_DEFAULT_CABLE_LEN 7 /* 7 meters */ #define MLX5_BUFFER_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, pcam_reg) && \ @@ -60,8 +61,9 @@ struct mlx5e_bufferx_reg { struct mlx5e_port_buffer { u32 port_buffer_size; u32 spare_buffer_size; - u32 headroom_size; - struct mlx5e_bufferx_reg buffer[MLX5E_MAX_BUFFER]; + u32 headroom_size; /* Buffers 0-7 */ + u32 internal_buffers_size; /* Buffers 8-9 */ + struct mlx5e_bufferx_reg buffer[MLX5E_MAX_NETWORK_BUFFER]; }; int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c index fc923a99b6a4..0380a04c3691 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c @@ -84,7 +84,7 @@ mlx5e_tc_act_init_parse_state(struct mlx5e_tc_act_parse_state *parse_state, int mlx5e_tc_act_post_parse(struct mlx5e_tc_act_parse_state *parse_state, - struct flow_action *flow_action, + struct flow_action *flow_action, int from, int to, struct mlx5_flow_attr *attr, enum mlx5_flow_namespace_type ns_type) { @@ -96,6 +96,11 @@ mlx5e_tc_act_post_parse(struct mlx5e_tc_act_parse_state *parse_state, priv = parse_state->flow->priv; flow_action_for_each(i, act, flow_action) { + if (i < from) + continue; + else if (i > to) + break; + tc_act = mlx5e_tc_act_get(act->id, ns_type); if (!tc_act || !tc_act->post_parse) continue; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h index 0e6e1872ac62..d6c12d0ea55b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h @@ -112,7 +112,7 @@ mlx5e_tc_act_init_parse_state(struct mlx5e_tc_act_parse_state *parse_state, int mlx5e_tc_act_post_parse(struct mlx5e_tc_act_parse_state *parse_state, - struct flow_action *flow_action, + struct flow_action *flow_action, int from, int to, struct mlx5_flow_attr *attr, enum mlx5_flow_namespace_type ns_type); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 6a052c6cfc15..f0c3464f037f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -492,6 +492,19 @@ void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) mlx5e_encap_dealloc(priv, e); } +static void mlx5e_encap_put_locked(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + lockdep_assert_held(&esw->offloads.encap_tbl_lock); + + if (!refcount_dec_and_test(&e->refcnt)) + return; + list_del(&e->route_list); + hash_del_rcu(&e->encap_hlist); + mlx5e_encap_dealloc(priv, e); +} + static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; @@ -816,6 +829,8 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, uintptr_t hash_key; int err = 0; + lockdep_assert_held(&esw->offloads.encap_tbl_lock); + parse_attr = attr->parse_attr; tun_info = parse_attr->tun_info[out_index]; mpls_info = &parse_attr->mpls_info[out_index]; @@ -829,7 +844,6 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, hash_key = hash_encap_info(&key); - mutex_lock(&esw->offloads.encap_tbl_lock); e = mlx5e_encap_get(priv, &key, hash_key); /* must verify if encap is valid or not */ @@ -840,15 +854,6 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, goto out_err; } - mutex_unlock(&esw->offloads.encap_tbl_lock); - wait_for_completion(&e->res_ready); - - /* Protect against concurrent neigh update. */ - mutex_lock(&esw->offloads.encap_tbl_lock); - if (e->compl_result < 0) { - err = -EREMOTEIO; - goto out_err; - } goto attach_flow; } @@ -877,15 +882,12 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, INIT_LIST_HEAD(&e->flows); hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key); tbl_time_before = mlx5e_route_tbl_get_last_update(priv); - mutex_unlock(&esw->offloads.encap_tbl_lock); if (family == AF_INET) err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e); else if (family == AF_INET6) err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e); - /* Protect against concurrent neigh update. */ - mutex_lock(&esw->offloads.encap_tbl_lock); complete_all(&e->res_ready); if (err) { e->compl_result = err; @@ -920,18 +922,15 @@ attach_flow: } else { flow_flag_set(flow, SLOW); } - mutex_unlock(&esw->offloads.encap_tbl_lock); return err; out_err: - mutex_unlock(&esw->offloads.encap_tbl_lock); if (e) - mlx5e_encap_put(priv, e); + mlx5e_encap_put_locked(priv, e); return err; out_err_init: - mutex_unlock(&esw->offloads.encap_tbl_lock); kfree(tun_info); kfree(e); return err; @@ -1016,6 +1015,93 @@ out_err: return err; } +int mlx5e_tc_tun_encap_dests_set(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct netlink_ext_ack *extack, + bool *vf_tun) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_esw_flow_attr *esw_attr; + struct net_device *encap_dev = NULL; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *out_priv; + struct mlx5_eswitch *esw; + int out_index; + int err = 0; + + if (!mlx5e_is_eswitch_flow(flow)) + return 0; + + parse_attr = attr->parse_attr; + esw_attr = attr->esw_attr; + *vf_tun = false; + + esw = priv->mdev->priv.eswitch; + mutex_lock(&esw->offloads.encap_tbl_lock); + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { + struct net_device *out_dev; + int mirred_ifindex; + + if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) + continue; + + mirred_ifindex = parse_attr->mirred_ifindex[out_index]; + out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex); + if (!out_dev) { + NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found"); + err = -ENODEV; + goto out; + } + err = mlx5e_attach_encap(priv, flow, attr, out_dev, out_index, + extack, &encap_dev); + dev_put(out_dev); + if (err) + goto out; + + if (esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE && + !esw_attr->dest_int_port) + *vf_tun = true; + + out_priv = netdev_priv(encap_dev); + rpriv = out_priv->ppriv; + esw_attr->dests[out_index].rep = rpriv->rep; + esw_attr->dests[out_index].mdev = out_priv->mdev; + } + + if (*vf_tun && esw_attr->out_count > 1) { + NL_SET_ERR_MSG_MOD(extack, "VF tunnel encap with mirroring is not supported"); + err = -EOPNOTSUPP; + goto out; + } + +out: + mutex_unlock(&esw->offloads.encap_tbl_lock); + return err; +} + +void mlx5e_tc_tun_encap_dests_unset(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr; + int out_index; + + if (!mlx5e_is_eswitch_flow(flow)) + return; + + esw_attr = attr->esw_attr; + + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { + if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) + continue; + + mlx5e_detach_encap(flow->priv, flow, attr, out_index); + kfree(attr->parse_attr->tun_info[out_index]); + } +} + static int cmp_route_info(struct mlx5e_route_key *a, struct mlx5e_route_key *b) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h index 8ad273dde40e..5d7d67687cbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h @@ -30,6 +30,15 @@ int mlx5e_attach_decap_route(struct mlx5e_priv *priv, void mlx5e_detach_decap_route(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow); +int mlx5e_tc_tun_encap_dests_set(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct netlink_ext_ack *extack, + bool *vf_tun); +void mlx5e_tc_tun_encap_dests_unset(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr); + struct ip_tunnel_info *mlx5e_dup_tun_info(const struct ip_tunnel_info *tun_info); int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 1f90594499c6..41c396e76457 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -150,10 +150,8 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, inlen = MLX5_ST_SZ_BYTES(modify_tir_in); in = kvzalloc(inlen, GFP_KERNEL); - if (!in) { - err = -ENOMEM; - goto out; - } + if (!in) + return -ENOMEM; if (enable_uc_lb) lb_flags = MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; @@ -171,14 +169,13 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, tirn = tir->tirn; err = mlx5_core_modify_tir(mdev, tirn, in); if (err) - goto out; + break; } + mutex_unlock(&mdev->mlx5e_res.hw_objs.td.list_lock); -out: kvfree(in); if (err) netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err); - mutex_unlock(&mdev->mlx5e_res.hw_objs.td.list_lock); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 89de92d06483..ebee52a8361a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -926,9 +926,10 @@ static int mlx5e_dcbnl_getbuffer(struct net_device *dev, if (err) return err; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) dcb_buffer->buffer_size[i] = port_buffer.buffer[i].size; - dcb_buffer->total_size = port_buffer.port_buffer_size; + dcb_buffer->total_size = port_buffer.port_buffer_size - + port_buffer.internal_buffers_size; return 0; } @@ -970,7 +971,7 @@ static int mlx5e_dcbnl_setbuffer(struct net_device *dev, if (err) return err; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { if (port_buffer.buffer[i].size != dcb_buffer->buffer_size[i]) { changed |= MLX5E_PORT_BUFFER_SIZE; buffer_size = dcb_buffer->buffer_size; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 2944691f06ad..a7c526ee5024 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -727,26 +727,6 @@ static void mlx5e_rq_free_shampo(struct mlx5e_rq *rq) mlx5e_rq_shampo_hd_free(rq); } -static __be32 mlx5e_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev) -{ - u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; - u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; - int res; - - if (!MLX5_CAP_GEN(dev, terminate_scatter_list_mkey)) - return MLX5_TERMINATE_SCATTER_LIST_LKEY; - - MLX5_SET(query_special_contexts_in, in, opcode, - MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - res = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); - if (res) - return MLX5_TERMINATE_SCATTER_LIST_LKEY; - - res = MLX5_GET(query_special_contexts_out, out, - terminate_scatter_list_mkey); - return cpu_to_be32(res); -} - static int mlx5e_alloc_rq(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, struct mlx5e_rq_param *rqp, @@ -908,7 +888,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params, /* check if num_frags is not a pow of two */ if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) { wqe->data[f].byte_count = 0; - wqe->data[f].lkey = mlx5e_get_terminate_scatter_list_mkey(mdev); + wqe->data[f].lkey = params->terminate_lkey_be; wqe->data[f].addr = 0; } } @@ -5007,6 +4987,8 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 /* RQ */ mlx5e_build_rq_params(mdev, params); + params->terminate_lkey_be = mlx5_core_get_terminate_scatter_list_mkey(mdev); + params->packet_merge.timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT); /* CQ moderation params */ @@ -5279,12 +5261,16 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, mlx5e_timestamp_init(priv); + priv->dfs_root = debugfs_create_dir("nic", + mlx5_debugfs_get_dev_root(mdev)); + fs = mlx5e_fs_init(priv->profile, mdev, !test_bit(MLX5E_STATE_DESTROYING, &priv->state), priv->dfs_root); if (!fs) { err = -ENOMEM; mlx5_core_err(mdev, "FS initialization failed, %d\n", err); + debugfs_remove_recursive(priv->dfs_root); return err; } priv->fs = fs; @@ -5305,6 +5291,7 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) mlx5e_health_destroy_reporters(priv); mlx5e_ktls_cleanup(priv); mlx5e_fs_cleanup(priv->fs); + debugfs_remove_recursive(priv->dfs_root); priv->fs = NULL; } @@ -5851,8 +5838,8 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv) } static int -mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, - const struct mlx5e_profile *new_profile, void *new_ppriv) +mlx5e_netdev_init_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, + const struct mlx5e_profile *new_profile, void *new_ppriv) { struct mlx5e_priv *priv = netdev_priv(netdev); int err; @@ -5868,6 +5855,25 @@ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mde err = new_profile->init(priv->mdev, priv->netdev); if (err) goto priv_cleanup; + + return 0; + +priv_cleanup: + mlx5e_priv_cleanup(priv); + return err; +} + +static int +mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + err = mlx5e_netdev_init_profile(netdev, mdev, new_profile, new_ppriv); + if (err) + return err; + err = mlx5e_attach_netdev(priv); if (err) goto profile_cleanup; @@ -5875,7 +5881,6 @@ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mde profile_cleanup: new_profile->cleanup(priv); -priv_cleanup: mlx5e_priv_cleanup(priv); return err; } @@ -5894,6 +5899,12 @@ int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, priv->profile->cleanup(priv); mlx5e_priv_cleanup(priv); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5e_netdev_init_profile(netdev, mdev, new_profile, new_ppriv); + set_bit(MLX5E_STATE_DESTROYING, &priv->state); + return -EIO; + } + err = mlx5e_netdev_attach_profile(netdev, mdev, new_profile, new_ppriv); if (err) { /* roll back to original profile */ netdev_warn(netdev, "%s: new profile init failed, %d\n", __func__, err); @@ -5955,8 +5966,11 @@ static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state) struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; - if (!netif_device_present(netdev)) + if (!netif_device_present(netdev)) { + if (test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5e_destroy_mdev_resources(mdev); return -ENODEV; + } mlx5e_detach_netdev(priv); mlx5e_destroy_mdev_resources(mdev); @@ -6002,9 +6016,6 @@ static int mlx5e_probe(struct auxiliary_device *adev, priv->profile = profile; priv->ppriv = NULL; - priv->dfs_root = debugfs_create_dir("nic", - mlx5_debugfs_get_dev_root(priv->mdev)); - err = profile->init(mdev, netdev); if (err) { mlx5_core_err(mdev, "mlx5e_nic_profile init failed, %d\n", err); @@ -6033,7 +6044,6 @@ err_resume: err_profile_cleanup: profile->cleanup(priv); err_destroy_netdev: - debugfs_remove_recursive(priv->dfs_root); mlx5e_destroy_netdev(priv); err_devlink_port_unregister: mlx5e_devlink_port_unregister(mlx5e_dev); @@ -6053,7 +6063,6 @@ static void mlx5e_remove(struct auxiliary_device *adev) unregister_netdev(priv->netdev); mlx5e_suspend(adev, state); priv->profile->cleanup(priv); - debugfs_remove_recursive(priv->dfs_root); mlx5e_destroy_netdev(priv); mlx5e_devlink_port_unregister(mlx5e_dev); mlx5e_destroy_devlink(mlx5e_dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 1fc386eccaf8..3e7041bd5705 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include <linux/debugfs.h> #include <linux/mlx5/fs.h> #include <net/switchdev.h> #include <net/pkt_cls.h> @@ -812,11 +813,15 @@ static int mlx5e_init_ul_rep(struct mlx5_core_dev *mdev, { struct mlx5e_priv *priv = netdev_priv(netdev); + priv->dfs_root = debugfs_create_dir("nic", + mlx5_debugfs_get_dev_root(mdev)); + priv->fs = mlx5e_fs_init(priv->profile, mdev, !test_bit(MLX5E_STATE_DESTROYING, &priv->state), priv->dfs_root); if (!priv->fs) { netdev_err(priv->netdev, "FS allocation failed\n"); + debugfs_remove_recursive(priv->dfs_root); return -ENOMEM; } @@ -829,6 +834,7 @@ static int mlx5e_init_ul_rep(struct mlx5_core_dev *mdev, static void mlx5e_cleanup_rep(struct mlx5e_priv *priv) { mlx5e_fs_cleanup(priv->fs); + debugfs_remove_recursive(priv->dfs_root); priv->fs = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index e95414ef1f04..8a5a8703f0a3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1700,91 +1700,6 @@ int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *ro } static int -set_encap_dests(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct mlx5_flow_attr *attr, - struct netlink_ext_ack *extack, - bool *vf_tun) -{ - struct mlx5e_tc_flow_parse_attr *parse_attr; - struct mlx5_esw_flow_attr *esw_attr; - struct net_device *encap_dev = NULL; - struct mlx5e_rep_priv *rpriv; - struct mlx5e_priv *out_priv; - int out_index; - int err = 0; - - if (!mlx5e_is_eswitch_flow(flow)) - return 0; - - parse_attr = attr->parse_attr; - esw_attr = attr->esw_attr; - *vf_tun = false; - - for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { - struct net_device *out_dev; - int mirred_ifindex; - - if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) - continue; - - mirred_ifindex = parse_attr->mirred_ifindex[out_index]; - out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex); - if (!out_dev) { - NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found"); - err = -ENODEV; - goto out; - } - err = mlx5e_attach_encap(priv, flow, attr, out_dev, out_index, - extack, &encap_dev); - dev_put(out_dev); - if (err) - goto out; - - if (esw_attr->dests[out_index].flags & - MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE && - !esw_attr->dest_int_port) - *vf_tun = true; - - out_priv = netdev_priv(encap_dev); - rpriv = out_priv->ppriv; - esw_attr->dests[out_index].rep = rpriv->rep; - esw_attr->dests[out_index].mdev = out_priv->mdev; - } - - if (*vf_tun && esw_attr->out_count > 1) { - NL_SET_ERR_MSG_MOD(extack, "VF tunnel encap with mirroring is not supported"); - err = -EOPNOTSUPP; - goto out; - } - -out: - return err; -} - -static void -clean_encap_dests(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct mlx5_flow_attr *attr) -{ - struct mlx5_esw_flow_attr *esw_attr; - int out_index; - - if (!mlx5e_is_eswitch_flow(flow)) - return; - - esw_attr = attr->esw_attr; - - for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { - if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) - continue; - - mlx5e_detach_encap(priv, flow, attr, out_index); - kfree(attr->parse_attr->tun_info[out_index]); - } -} - -static int verify_attr_actions(u32 actions, struct netlink_ext_ack *extack) { if (!(actions & @@ -1820,7 +1735,7 @@ post_process_attr(struct mlx5e_tc_flow *flow, if (err) goto err_out; - err = set_encap_dests(flow->priv, flow, attr, extack, &vf_tun); + err = mlx5e_tc_tun_encap_dests_set(flow->priv, flow, attr, extack, &vf_tun); if (err) goto err_out; @@ -3944,8 +3859,8 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, struct mlx5_flow_attr *prev_attr; struct flow_action_entry *act; struct mlx5e_tc_act *tc_act; + int err, i, i_split = 0; bool is_missable; - int err, i; ns_type = mlx5e_get_flow_namespace(flow); list_add(&attr->list, &flow->attrs); @@ -3986,7 +3901,8 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, i < flow_action->num_entries - 1)) { is_missable = tc_act->is_missable ? tc_act->is_missable(act) : false; - err = mlx5e_tc_act_post_parse(parse_state, flow_action, attr, ns_type); + err = mlx5e_tc_act_post_parse(parse_state, flow_action, i_split, i, attr, + ns_type); if (err) goto out_free_post_acts; @@ -3996,6 +3912,7 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, goto out_free_post_acts; } + i_split = i + 1; list_add(&attr->list, &flow->attrs); } @@ -4010,7 +3927,7 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, } } - err = mlx5e_tc_act_post_parse(parse_state, flow_action, attr, ns_type); + err = mlx5e_tc_act_post_parse(parse_state, flow_action, i_split, i, attr, ns_type); if (err) goto out_free_post_acts; @@ -4324,7 +4241,7 @@ mlx5_free_flow_attr_actions(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *a if (attr->post_act_handle) mlx5e_tc_post_act_del(get_post_action(flow->priv), attr->post_act_handle); - clean_encap_dests(flow->priv, flow, attr); + mlx5e_tc_tun_encap_dests_unset(flow->priv, flow, attr); if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) mlx5_fc_destroy(counter_dev, attr->counter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index fe698c79616c..3db4866d7880 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -824,7 +824,7 @@ static int comp_irqs_request_pci(struct mlx5_core_dev *dev) ncomp_eqs = table->num_comp_eqs; cpus = kcalloc(ncomp_eqs, sizeof(*cpus), GFP_KERNEL); if (!cpus) - ret = -ENOMEM; + return -ENOMEM; i = 0; rcu_read_lock(); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a7eb65cd0bdd..d6ee016deae1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -923,7 +923,6 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, } mlx5_pci_vsc_init(dev); - dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev); return 0; err_clr_master: @@ -1155,6 +1154,7 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot, u64 timeout goto err_cmd_cleanup; } + dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev); mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP); mlx5_start_health_poll(dev); @@ -1802,15 +1802,16 @@ static void remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(dev); set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); - /* mlx5_drain_fw_reset() is using devlink APIs. Hence, we must drain - * fw_reset before unregistering the devlink. + /* mlx5_drain_fw_reset() and mlx5_drain_health_wq() are using + * devlink notify APIs. + * Hence, we must drain them before unregistering the devlink. */ mlx5_drain_fw_reset(dev); + mlx5_drain_health_wq(dev); devlink_unregister(devlink); mlx5_sriov_disable(pdev); mlx5_thermal_uninit(dev); mlx5_crdump_disable(dev); - mlx5_drain_health_wq(dev); mlx5_uninit_one(dev); mlx5_pci_close(dev); mlx5_mdev_uninit(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 9d735c343a3b..678f0be81375 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -32,6 +32,7 @@ #include <linux/kernel.h> #include <linux/mlx5/driver.h> +#include <linux/mlx5/qp.h> #include "mlx5_core.h" int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in, @@ -122,3 +123,23 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num) return mlx5_cmd_exec_in(dev, destroy_psv, in); } EXPORT_SYMBOL(mlx5_core_destroy_psv); + +__be32 mlx5_core_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + u32 mkey; + + if (!MLX5_CAP_GEN(dev, terminate_scatter_list_mkey)) + return MLX5_TERMINATE_SCATTER_LIST_LKEY; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + if (mlx5_cmd_exec_inout(dev, query_special_contexts, in, out)) + return MLX5_TERMINATE_SCATTER_LIST_LKEY; + + mkey = MLX5_GET(query_special_contexts_out, out, + terminate_scatter_list_mkey); + return cpu_to_be32(mkey); +} +EXPORT_SYMBOL(mlx5_core_get_terminate_scatter_list_mkey); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index db5687d9fec9..843da89a9035 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -141,7 +141,7 @@ static void irq_release(struct mlx5_irq *irq) irq_update_affinity_hint(irq->map.virq, NULL); #ifdef CONFIG_RFS_ACCEL rmap = mlx5_eq_table_get_rmap(pool->dev); - if (rmap && irq->map.index) + if (rmap) irq_cpu_rmap_remove(rmap, irq->map.virq); #endif @@ -232,12 +232,13 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, if (!irq) return ERR_PTR(-ENOMEM); if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) { - /* The vector at index 0 was already allocated. - * Just get the irq number. If dynamic irq is not supported - * vectors have also been allocated. + /* The vector at index 0 is always statically allocated. If + * dynamic irq is not supported all vectors are statically + * allocated. In both cases just get the irq number and set + * the index. */ irq->map.virq = pci_irq_vector(dev->pdev, i); - irq->map.index = 0; + irq->map.index = i; } else { irq->map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, af_desc); if (!irq->map.virq) { @@ -570,11 +571,11 @@ int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs, af_desc.is_managed = false; for (i = 0; i < nirqs; i++) { + cpumask_clear(&af_desc.mask); cpumask_set_cpu(cpus[i], &af_desc.mask); irq = mlx5_irq_request(dev, i + 1, &af_desc, rmap); if (IS_ERR(irq)) break; - cpumask_clear(&af_desc.mask); irqs[i] = irq; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index e2f26d0bc615..0692363cf80e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -63,6 +63,7 @@ static void mlx5_sf_dev_remove(struct auxiliary_device *adev) struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); struct devlink *devlink = priv_to_devlink(sf_dev->mdev); + mlx5_drain_health_wq(sf_dev->mdev); devlink_unregister(devlink); mlx5_uninit_one(sf_dev->mdev); iounmap(sf_dev->mdev->iseg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c index 13e06a6a6b22..d6947fe13d56 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c @@ -213,6 +213,8 @@ struct mlx5dr_ptrn_mgr *mlx5dr_ptrn_mgr_create(struct mlx5dr_domain *dmn) } INIT_LIST_HEAD(&mgr->ptrn_list); + mutex_init(&mgr->modify_hdr_mutex); + return mgr; free_mgr: @@ -237,5 +239,6 @@ void mlx5dr_ptrn_mgr_destroy(struct mlx5dr_ptrn_mgr *mgr) } mlx5dr_icm_pool_destroy(mgr->ptrn_icm_pool); + mutex_destroy(&mgr->modify_hdr_mutex); kfree(mgr); } diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c index afa3b92a6905..0d5a41a2ae01 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c @@ -245,12 +245,6 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) skb = priv->rx_skb[rx_pi_rem]; - skb_put(skb, datalen); - - skb->ip_summed = CHECKSUM_NONE; /* device did not checksum packet */ - - skb->protocol = eth_type_trans(skb, netdev); - /* Alloc another RX SKB for this same index */ rx_skb = mlxbf_gige_alloc_skb(priv, MLXBF_GIGE_DEFAULT_BUF_SZ, &rx_buf_dma, DMA_FROM_DEVICE); @@ -259,6 +253,13 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) priv->rx_skb[rx_pi_rem] = rx_skb; dma_unmap_single(priv->dev, *rx_wqe_addr, MLXBF_GIGE_DEFAULT_BUF_SZ, DMA_FROM_DEVICE); + + skb_put(skb, datalen); + + skb->ip_summed = CHECKSUM_NONE; /* device did not checksum packet */ + + skb->protocol = eth_type_trans(skb, netdev); + *rx_wqe_addr = rx_buf_dma; } else if (rx_cqe & MLXBF_GIGE_RX_CQE_PKT_STATUS_MAC_ERR) { priv->stats.rx_mac_errors++; diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 06d6292e09b3..d907727c7b7a 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1279,8 +1279,6 @@ static void mana_poll_tx_cq(struct mana_cq *cq) if (comp_read < 1) return; - apc->eth_stats.tx_cqes = comp_read; - for (i = 0; i < comp_read; i++) { struct mana_tx_comp_oob *cqe_oob; @@ -1363,8 +1361,6 @@ static void mana_poll_tx_cq(struct mana_cq *cq) WARN_ON_ONCE(1); cq->work_done = pkt_transmitted; - - apc->eth_stats.tx_cqes -= pkt_transmitted; } static void mana_post_pkt_rxq(struct mana_rxq *rxq) @@ -1626,15 +1622,11 @@ static void mana_poll_rx_cq(struct mana_cq *cq) { struct gdma_comp *comp = cq->gdma_comp_buf; struct mana_rxq *rxq = cq->rxq; - struct mana_port_context *apc; int comp_read, i; - apc = netdev_priv(rxq->ndev); - comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER); WARN_ON_ONCE(comp_read > CQE_POLLING_BUFFER); - apc->eth_stats.rx_cqes = comp_read; rxq->xdp_flush = false; for (i = 0; i < comp_read; i++) { @@ -1646,8 +1638,6 @@ static void mana_poll_rx_cq(struct mana_cq *cq) return; mana_process_rx_cqe(rxq, cq, &comp[i]); - - apc->eth_stats.rx_cqes--; } if (rxq->xdp_flush) diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index a64c81410dc1..0dc78679f620 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -13,11 +13,9 @@ static const struct { } mana_eth_stats[] = { {"stop_queue", offsetof(struct mana_ethtool_stats, stop_queue)}, {"wake_queue", offsetof(struct mana_ethtool_stats, wake_queue)}, - {"tx_cqes", offsetof(struct mana_ethtool_stats, tx_cqes)}, {"tx_cq_err", offsetof(struct mana_ethtool_stats, tx_cqe_err)}, {"tx_cqe_unknown_type", offsetof(struct mana_ethtool_stats, tx_cqe_unknown_type)}, - {"rx_cqes", offsetof(struct mana_ethtool_stats, rx_cqes)}, {"rx_coalesced_err", offsetof(struct mana_ethtool_stats, rx_coalesced_err)}, {"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats, diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 29afaddb598d..aace87139cea 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1485,7 +1485,7 @@ static netdev_tx_t rswitch_start_xmit(struct sk_buff *skb, struct net_device *nd if (rswitch_get_num_cur_queues(gq) >= gq->ring_size - 1) { netif_stop_subqueue(ndev, 0); - return ret; + return NETDEV_TX_BUSY; } if (skb_put_padto(skb, ETH_ZLEN)) diff --git a/drivers/net/ethernet/sfc/tc.c b/drivers/net/ethernet/sfc/tc.c index 0327639a628a..c004443c1d58 100644 --- a/drivers/net/ethernet/sfc/tc.c +++ b/drivers/net/ethernet/sfc/tc.c @@ -624,13 +624,12 @@ static int efx_tc_flower_replace_foreign(struct efx_nic *efx, if (!found) { /* We don't care. */ netif_dbg(efx, drv, efx->net_dev, "Ignoring foreign filter that doesn't egdev us\n"); - rc = -EOPNOTSUPP; - goto release; + return -EOPNOTSUPP; } rc = efx_mae_match_check_caps(efx, &match.mask, NULL); if (rc) - goto release; + return rc; if (efx_tc_match_is_encap(&match.mask)) { enum efx_encap_type type; @@ -639,8 +638,7 @@ static int efx_tc_flower_replace_foreign(struct efx_nic *efx, if (type == EFX_ENCAP_TYPE_NONE) { NL_SET_ERR_MSG_MOD(extack, "Egress encap match on unsupported tunnel device"); - rc = -EOPNOTSUPP; - goto release; + return -EOPNOTSUPP; } rc = efx_mae_check_encap_type_supported(efx, type); @@ -648,25 +646,24 @@ static int efx_tc_flower_replace_foreign(struct efx_nic *efx, NL_SET_ERR_MSG_FMT_MOD(extack, "Firmware reports no support for %s encap match", efx_tc_encap_type_name(type)); - goto release; + return rc; } rc = efx_tc_flower_record_encap_match(efx, &match, type, extack); if (rc) - goto release; + return rc; } else { /* This is not a tunnel decap rule, ignore it */ netif_dbg(efx, drv, efx->net_dev, "Ignoring foreign filter without encap match\n"); - rc = -EOPNOTSUPP; - goto release; + return -EOPNOTSUPP; } rule = kzalloc(sizeof(*rule), GFP_USER); if (!rule) { rc = -ENOMEM; - goto release; + goto out_free; } INIT_LIST_HEAD(&rule->acts.list); rule->cookie = tc->cookie; @@ -678,7 +675,7 @@ static int efx_tc_flower_replace_foreign(struct efx_nic *efx, "Ignoring already-offloaded rule (cookie %lx)\n", tc->cookie); rc = -EEXIST; - goto release; + goto out_free; } act = kzalloc(sizeof(*act), GFP_USER); @@ -843,6 +840,7 @@ release: efx_tc_match_action_ht_params); efx_tc_free_action_set_list(efx, &rule->acts, false); } +out_free: kfree(rule); if (match.encap) efx_tc_flower_release_encap_match(efx, match.encap); @@ -899,8 +897,7 @@ static int efx_tc_flower_replace(struct efx_nic *efx, return rc; if (efx_tc_match_is_encap(&match.mask)) { NL_SET_ERR_MSG_MOD(extack, "Ingress enc_key matches not supported"); - rc = -EOPNOTSUPP; - goto release; + return -EOPNOTSUPP; } if (tc->common.chain_index) { @@ -924,9 +921,9 @@ static int efx_tc_flower_replace(struct efx_nic *efx, if (old) { netif_dbg(efx, drv, efx->net_dev, "Already offloaded rule (cookie %lx)\n", tc->cookie); - rc = -EEXIST; NL_SET_ERR_MSG_MOD(extack, "Rule already offloaded"); - goto release; + kfree(rule); + return -EEXIST; } /* Parse actions */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 0fca81507a77..52cab9de05f2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7233,8 +7233,7 @@ int stmmac_dvr_probe(struct device *device, ndev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM; ndev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | - NETDEV_XDP_ACT_XSK_ZEROCOPY | - NETDEV_XDP_ACT_NDO_XMIT; + NETDEV_XDP_ACT_XSK_ZEROCOPY; ret = stmmac_tc_init(priv, priv); if (!ret) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c index 9d4d8c3dad0a..aa6f16d3df64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c @@ -117,6 +117,9 @@ int stmmac_xdp_set_prog(struct stmmac_priv *priv, struct bpf_prog *prog, return -EOPNOTSUPP; } + if (!prog) + xdp_features_clear_redirect_target(dev); + need_update = !!priv->xdp_prog != !!prog; if (if_running && need_update) stmmac_xdp_release(dev); @@ -131,5 +134,8 @@ int stmmac_xdp_set_prog(struct stmmac_priv *priv, struct bpf_prog *prog, if (if_running && need_update) stmmac_xdp_open(dev); + if (prog) + xdp_features_set_redirect_target(dev, false); + return 0; } diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 2ee80ed140b7..afa1d56d9095 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -119,7 +119,7 @@ enum ipa_status_field_id { }; /* Size in bytes of an IPA packet status structure */ -#define IPA_STATUS_SIZE sizeof(__le32[4]) +#define IPA_STATUS_SIZE sizeof(__le32[8]) /* IPA status structure decoder; looks up field values for a structure */ static u32 ipa_status_extract(struct ipa *ipa, const void *data, diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index 6301a9abfb95..ea1073adc5a1 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -274,13 +274,6 @@ static int gpy_config_init(struct phy_device *phydev) return ret < 0 ? ret : 0; } -static bool gpy_has_broken_mdint(struct phy_device *phydev) -{ - /* At least these PHYs are known to have broken interrupt handling */ - return phydev->drv->phy_id == PHY_ID_GPY215B || - phydev->drv->phy_id == PHY_ID_GPY215C; -} - static int gpy_probe(struct phy_device *phydev) { struct device *dev = &phydev->mdio.dev; @@ -300,8 +293,7 @@ static int gpy_probe(struct phy_device *phydev) phydev->priv = priv; mutex_init(&priv->mbox_lock); - if (gpy_has_broken_mdint(phydev) && - !device_property_present(dev, "maxlinear,use-broken-interrupts")) + if (!device_property_present(dev, "maxlinear,use-broken-interrupts")) phydev->dev_flags |= PHY_F_NO_IRQ; fw_version = phy_read(phydev, PHY_FWV); @@ -659,11 +651,9 @@ static irqreturn_t gpy_handle_interrupt(struct phy_device *phydev) * frame. Therefore, polling is the best we can do and won't do any more * harm. * It was observed that this bug happens on link state and link speed - * changes on a GPY215B and GYP215C independent of the firmware version - * (which doesn't mean that this list is exhaustive). + * changes independent of the firmware version. */ - if (gpy_has_broken_mdint(phydev) && - (reg & (PHY_IMASK_LSTC | PHY_IMASK_LSPC))) { + if (reg & (PHY_IMASK_LSTC | PHY_IMASK_LSPC)) { reg = gpy_mbox_read(phydev, REG_GPIO0_OUT); if (reg < 0) { phy_error(phydev); diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 571e37e67f9c..f1865d047971 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1325,7 +1325,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2001, 0x7e3d, 4)}, /* D-Link DWM-222 A2 */ {QMI_FIXED_INTF(0x2020, 0x2031, 4)}, /* Olicard 600 */ {QMI_FIXED_INTF(0x2020, 0x2033, 4)}, /* BroadMobi BM806U */ - {QMI_FIXED_INTF(0x2020, 0x2060, 4)}, /* BroadMobi BM818 */ + {QMI_QUIRK_SET_DTR(0x2020, 0x2060, 4)}, /* BroadMobi BM818 */ {QMI_FIXED_INTF(0x0f3d, 0x68a2, 8)}, /* Sierra Wireless MC7700 */ {QMI_FIXED_INTF(0x114f, 0x68a2, 8)}, /* Sierra Wireless MC7750 */ {QMI_FIXED_INTF(0x1199, 0x68a2, 8)}, /* Sierra Wireless MC7710 in QMI mode */ diff --git a/drivers/nfc/nfcsim.c b/drivers/nfc/nfcsim.c index 44eeb17ae48d..a55381f80cd6 100644 --- a/drivers/nfc/nfcsim.c +++ b/drivers/nfc/nfcsim.c @@ -336,10 +336,6 @@ static struct dentry *nfcsim_debugfs_root; static void nfcsim_debugfs_init(void) { nfcsim_debugfs_root = debugfs_create_dir("nfcsim", NULL); - - if (!nfcsim_debugfs_root) - pr_err("Could not create debugfs entry\n"); - } static void nfcsim_debugfs_remove(void) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a4c4f737f9c1..94d2be5848ae 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1093,6 +1093,7 @@ void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); +__be32 mlx5_core_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index cd386aa7c7cc..9eef19972845 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -347,10 +347,8 @@ struct mana_tx_qp { struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; - u64 tx_cqes; u64 tx_cqe_err; u64 tx_cqe_unknown_type; - u64 rx_cqes; u64 rx_coalesced_err; u64 rx_cqe_unknown_type; }; diff --git a/include/net/sock.h b/include/net/sock.h index 656ea89f60ff..b418425d7230 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -336,6 +336,7 @@ struct sk_filter; * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start + * @sk_wait_pending: number of threads blocked on this socket * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available @@ -428,6 +429,7 @@ struct sock { unsigned int sk_napi_id; #endif int sk_rcvbuf; + int sk_wait_pending; struct sk_filter __rcu *sk_filter; union { @@ -1174,6 +1176,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc; \ + __sk->sk_wait_pending++; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ @@ -1183,6 +1186,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) } \ sched_annotate_sleep(); \ lock_sock(__sk); \ + __sk->sk_wait_pending--; \ __rc = __condition; \ __rc; \ }) diff --git a/include/net/tcp.h b/include/net/tcp.h index 18a038d16434..5066e4586cf0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -632,6 +632,7 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); +void tcp_sack_compress_send_ack(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 653901a1bf75..41de3a2f29e1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2385,6 +2385,37 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; + + if (tb[IFLA_GSO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GSO_MAX_SEGS] && + (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || + nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { + NL_SET_ERR_MSG(extack, "too big gso_max_segs"); + return -EINVAL; + } + + if (tb[IFLA_GRO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GSO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); + return -EINVAL; + } } if (tb[IFLA_AF_SPEC]) { @@ -2858,11 +2889,6 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > dev->tso_max_size) { - err = -EINVAL; - goto errout; - } - if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; @@ -2872,11 +2898,6 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); - if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { - err = -EINVAL; - goto errout; - } - if (dev->gso_max_segs ^ max_segs) { netif_set_gso_max_segs(dev, max_segs); status |= DO_SETLINK_MODIFIED; @@ -2895,11 +2916,6 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); - if (max_size > dev->tso_max_size) { - err = -EINVAL; - goto errout; - } - if (dev->gso_ipv4_max_size ^ max_size) { netif_set_gso_ipv4_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; @@ -3285,6 +3301,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; + int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); @@ -3320,13 +3337,18 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (!dev) return ERR_PTR(-ENOMEM); + err = validate_linkmsg(dev, tb, extack); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } + dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); - int err; err = dev_validate_mtu(dev, mtu, extack); if (err) { diff --git a/net/core/sock.c b/net/core/sock.c index 5440e67bcfe3..24f2761bdb1d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2381,7 +2381,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk_is_tcp(sk)) sk->sk_route_caps |= NETIF_F_GSO; @@ -2400,6 +2399,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } } sk->sk_gso_max_segs = max_segs; + sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c4aab3aacbd8..4a76ebf793b8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -586,6 +586,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; + sk->sk_wait_pending++; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -601,6 +602,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; + sk->sk_wait_pending--; return timeo; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 65ad4251f6fd..1386787eaf1a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1142,6 +1142,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); + newsk->sk_wait_pending = 0; inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a60f6f4e7cd9..8d20d9221238 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3081,6 +3081,12 @@ int tcp_disconnect(struct sock *sk, int flags) int old_state = sk->sk_state; u32 seq; + /* Deny disconnect if other threads are blocked in sk_wait_event() + * or inet_wait_for_connect(). + */ + if (sk->sk_wait_pending) + return -EBUSY; + if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); @@ -4072,7 +4078,8 @@ int do_tcp_getsockopt(struct sock *sk, int level, switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; - if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + if (tp->rx_opt.user_mss && + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 61b6710f337a..bf8b22218dd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4530,7 +4530,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static void tcp_sack_compress_send_ack(struct sock *sk) +void tcp_sack_compress_send_ack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b839c2f91292..39eb947fe392 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -290,9 +290,19 @@ static int tcp_write_timeout(struct sock *sk) void tcp_delack_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); - if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) + return; + + /* Handling the sack compression case */ + if (tp->compressed_ack) { + tcp_mstamp_refresh(tp); + tcp_sack_compress_send_ack(sk); + return; + } + + if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) return; if (time_after(icsk->icsk_ack.timeout, jiffies)) { @@ -312,7 +322,7 @@ void tcp_delack_timer_handler(struct sock *sk) inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } - tcp_mstamp_refresh(tcp_sk(sk)); + tcp_mstamp_refresh(tp); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 08dc53f56bc2..67311e7d5b21 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -90,8 +90,8 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) if (err) return err; - msk->first = ssock->sk; - msk->subflow = ssock; + WRITE_ONCE(msk->first, ssock->sk); + WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); @@ -603,7 +603,7 @@ static bool mptcp_check_data_fin(struct sock *sk) WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); - sk->sk_shutdown |= RCV_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ switch (sk->sk_state) { @@ -825,6 +825,13 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) mptcp_data_unlock(sk); } +static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) +{ + mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); + WRITE_ONCE(msk->allow_infinite_fallback, false); + mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); +} + static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; @@ -839,6 +846,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) mptcp_sock_graft(ssk, sk->sk_socket); mptcp_sockopt_sync_locked(msk, ssk); + mptcp_subflow_joined(msk, ssk); return true; } @@ -910,7 +918,7 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) /* hopefully temporary hack: propagate shutdown status * to msk, when all subflows agree on it */ - sk->sk_shutdown |= RCV_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ sk->sk_data_ready(sk); @@ -1702,7 +1710,6 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; - msk->connect_flags = O_NONBLOCK; msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); msk->fastopening = 0; @@ -2283,7 +2290,7 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) { if (msk->subflow) { iput(SOCK_INODE(msk->subflow)); - msk->subflow = NULL; + WRITE_ONCE(msk->subflow, NULL); } } @@ -2420,7 +2427,7 @@ out_release: sock_put(ssk); if (ssk == msk->first) - msk->first = NULL; + WRITE_ONCE(msk->first, NULL); out: if (ssk == msk->last_snd) @@ -2527,7 +2534,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) } inet_sk_state_store(sk, TCP_CLOSE); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); @@ -2721,7 +2728,7 @@ static int __mptcp_init_sock(struct sock *sk) WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; - msk->first = NULL; + WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); @@ -2959,7 +2966,7 @@ bool __mptcp_close(struct sock *sk, long timeout) bool do_cancel_work = false; int subflows_alive = 0; - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { mptcp_listen_inuse_dec(sk); @@ -3039,7 +3046,7 @@ static void mptcp_close(struct sock *sk, long timeout) sock_put(sk); } -void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); @@ -3102,7 +3109,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); - sk->sk_shutdown = 0; + WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; } @@ -3116,9 +3123,10 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) } #endif -struct sock *mptcp_sk_clone(const struct sock *sk, - const struct mptcp_options_received *mp_opt, - struct request_sock *req) +struct sock *mptcp_sk_clone_init(const struct sock *sk, + const struct mptcp_options_received *mp_opt, + struct sock *ssk, + struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); @@ -3137,7 +3145,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk = mptcp_sk(nsk); msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; - msk->subflow = NULL; + WRITE_ONCE(msk->subflow, NULL); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) @@ -3150,10 +3158,30 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; sock_reset_flag(nsk, SOCK_RCU_FREE); - /* will be fully established after successful MPC subflow creation */ - inet_sk_state_store(nsk, TCP_SYN_RECV); - security_inet_csk_clone(nsk, req); + + /* this can't race with mptcp_close(), as the msk is + * not yet exposted to user-space + */ + inet_sk_state_store(nsk, TCP_ESTABLISHED); + + /* The msk maintain a ref to each subflow in the connections list */ + WRITE_ONCE(msk->first, ssk); + list_add(&mptcp_subflow_ctx(ssk)->node, &msk->conn_list); + sock_hold(ssk); + + /* new mpc subflow takes ownership of the newly + * created mptcp socket + */ + mptcp_token_accept(subflow_req, msk); + + /* set msk addresses early to ensure mptcp_pm_get_local_id() + * uses the correct data + */ + mptcp_copy_inaddrs(nsk, ssk); + mptcp_propagate_sndbuf(nsk, ssk); + + mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(nsk); /* note: the newly allocated socket refcount is 2 now */ @@ -3185,7 +3213,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, struct socket *listener; struct sock *newsk; - listener = msk->subflow; + listener = READ_ONCE(msk->subflow); if (WARN_ON_ONCE(!listener)) { *err = -EINVAL; return NULL; @@ -3465,14 +3493,16 @@ bool mptcp_finish_join(struct sock *ssk) return false; } - if (!list_empty(&subflow->node)) - goto out; + /* active subflow, already present inside the conn_list */ + if (!list_empty(&subflow->node)) { + mptcp_subflow_joined(msk, ssk); + return true; + } if (!mptcp_pm_allow_new_subflow(msk)) goto err_prohibited; - /* active connections are already on conn_list. - * If we can't acquire msk socket lock here, let the release callback + /* If we can't acquire msk socket lock here, let the release callback * handle it */ mptcp_data_lock(parent); @@ -3495,11 +3525,6 @@ err_prohibited: return false; } - subflow->map_seq = READ_ONCE(msk->ack_seq); - WRITE_ONCE(msk->allow_infinite_fallback, false); - -out: - mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; } @@ -3617,9 +3642,9 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * acquired the subflow socket lock, too. */ if (msk->fastopening) - err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); + err = __inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK, 1); else - err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); + err = inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK); inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; /* on successful connect, the msk state will be moved to established by @@ -3632,12 +3657,10 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) mptcp_copy_inaddrs(sk, ssock->sk); - /* unblocking connect, mptcp-level inet_stream_connect will error out - * without changing the socket state, update it here. + /* silence EINPROGRESS and let the caller inet_stream_connect + * handle the connection in progress */ - if (err == -EINPROGRESS) - sk->sk_socket->state = ssock->state; - return err; + return 0; } static struct proto mptcp_prot = { @@ -3696,18 +3719,6 @@ unlock: return err; } -static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) -{ - int ret; - - lock_sock(sock->sk); - mptcp_sk(sock->sk)->connect_flags = flags; - ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); - release_sock(sock->sk); - return ret; -} - static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); @@ -3751,10 +3762,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, pr_debug("msk=%p", msk); - /* buggy applications can call accept on socket states other then LISTEN + /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. */ - ssock = msk->subflow; + ssock = READ_ONCE(msk->subflow); if (!ssock) return -EINVAL; @@ -3800,9 +3811,6 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; - if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) - return EPOLLOUT | EPOLLWRNORM; - if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM; @@ -3820,6 +3828,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; + u8 shutdown; int state; msk = mptcp_sk(sk); @@ -3828,23 +3837,30 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) { - if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk)) + struct socket *ssock = READ_ONCE(msk->subflow); + + if (WARN_ON_ONCE(!ssock || !ssock->sk)) return 0; - return inet_csk_listen_poll(msk->subflow->sk); + return inet_csk_listen_poll(ssock->sk); } + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; + if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - mask |= mptcp_check_writeable(msk); + if (shutdown & SEND_SHUTDOWN) + mask |= EPOLLOUT | EPOLLWRNORM; + else + mask |= mptcp_check_writeable(msk); } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } - if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) - mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); @@ -3859,7 +3875,7 @@ static const struct proto_ops mptcp_stream_ops = { .owner = THIS_MODULE, .release = inet_release, .bind = mptcp_bind, - .connect = mptcp_stream_connect, + .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet_getname, @@ -3954,7 +3970,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .owner = THIS_MODULE, .release = inet6_release, .bind = mptcp_bind, - .connect = mptcp_stream_connect, + .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet6_getname, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 2d7b2c80a164..c5255258bfb3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -297,7 +297,6 @@ struct mptcp_sock { nodelay:1, fastopening:1, in_accept_queue:1; - int connect_flags; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -306,7 +305,11 @@ struct mptcp_sock { struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; - struct socket *subflow; /* outgoing connect/listener/!mp_capable */ + struct socket *subflow; /* outgoing connect/listener/!mp_capable + * The mptcp ops can safely dereference, using suitable + * ONCE annotation, the subflow outside the socket + * lock as such sock is freed after close(). + */ struct sock *first; struct mptcp_pm_data pm; struct { @@ -613,7 +616,6 @@ int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); -void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); @@ -683,9 +685,10 @@ void __init mptcp_proto_init(void); int __init mptcp_proto_v6_init(void); #endif -struct sock *mptcp_sk_clone(const struct sock *sk, - const struct mptcp_options_received *mp_opt, - struct request_sock *req); +struct sock *mptcp_sk_clone_init(const struct sock *sk, + const struct mptcp_options_received *mp_opt, + struct sock *ssk, + struct request_sock *req); void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ba065b66551a..4688daa6b38b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -815,38 +815,12 @@ create_child: ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { - ctx->conn = mptcp_sk_clone(listener->conn, &mp_opt, req); + ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); if (!ctx->conn) goto fallback; owner = mptcp_sk(ctx->conn); - - /* this can't race with mptcp_close(), as the msk is - * not yet exposted to user-space - */ - inet_sk_state_store(ctx->conn, TCP_ESTABLISHED); - - /* record the newly created socket as the first msk - * subflow, but don't link it yet into conn_list - */ - WRITE_ONCE(owner->first, child); - - /* new mpc subflow takes ownership of the newly - * created mptcp socket - */ - owner->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(owner, child, 1); - mptcp_token_accept(subflow_req, owner); - - /* set msk addresses early to ensure mptcp_pm_get_local_id() - * uses the correct data - */ - mptcp_copy_inaddrs(ctx->conn, child); - mptcp_propagate_sndbuf(ctx->conn, child); - - mptcp_rcv_space_init(owner, child); - list_add(&ctx->node, &owner->conn_list); - sock_hold(child); /* with OoO packets we can reach here without ingress * mpc option diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c87804112d0c..3a1e0fd5bf14 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1779,7 +1779,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, break; } } - if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) + if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); return err; diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 3f99b432ea70..e2d2af924cff 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -123,7 +123,7 @@ void nr_write_internal(struct sock *sk, int frametype) unsigned char *dptr; int len, timeout; - len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; + len = NR_TRANSPORT_LEN; switch (frametype & 0x0F) { case NR_CONNREQ: @@ -141,7 +141,8 @@ void nr_write_internal(struct sock *sk, int frametype) return; } - if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + skb = alloc_skb(NR_NETWORK_LEN + len, GFP_ATOMIC); + if (!skb) return; /* @@ -149,7 +150,7 @@ void nr_write_internal(struct sock *sk, int frametype) */ skb_reserve(skb, NR_NETWORK_LEN); - dptr = skb_put(skb, skb_tailroom(skb)); + dptr = skb_put(skb, len); switch (frametype & 0x0F) { case NR_CONNREQ: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 94c6a1ffa459..a2dbeb264f26 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3201,6 +3201,9 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, lock_sock(sk); spin_lock(&po->bind_lock); + if (!proto) + proto = po->num; + rcu_read_lock(); if (po->fanout) { @@ -3299,7 +3302,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; - return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); + return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -3316,8 +3319,7 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len if (sll->sll_family != AF_PACKET) return -EINVAL; - return packet_do_bind(sk, NULL, sll->sll_ifindex, - sll->sll_protocol ? : pkt_sk(sk)->num); + return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { diff --git a/net/packet/diag.c b/net/packet/diag.c index d0c4eda4cdc6..f6b200cb3c06 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -143,7 +143,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, rp = nlmsg_data(nlh); rp->pdiag_family = AF_PACKET; rp->pdiag_type = sk->sk_type; - rp->pdiag_num = ntohs(po->num); + rp->pdiag_num = ntohs(READ_ONCE(po->num)); rp->pdiag_ino = sk_ino; sock_diag_save_cookie(sk, rp->pdiag_cookie); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 31f738d65f1c..da0b3b5157d5 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -980,6 +980,7 @@ static int __init af_rxrpc_init(void) BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof_field(struct sk_buff, cb)); ret = -ENOMEM; + rxrpc_gen_version_string(); rxrpc_call_jar = kmem_cache_create( "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, SLAB_HWCACHE_ALIGN, NULL); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5d44dc08f66d..e8e14c6f904d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1068,6 +1068,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, /* * local_event.c */ +void rxrpc_gen_version_string(void); void rxrpc_send_version_request(struct rxrpc_local *local, struct rxrpc_host_header *hdr, struct sk_buff *skb); diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 5e69ea6b233d..993c69f97488 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -16,7 +16,16 @@ #include <generated/utsrelease.h> #include "ar-internal.h" -static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; +static char rxrpc_version_string[65]; // "linux-" UTS_RELEASE " AF_RXRPC"; + +/* + * Generate the VERSION packet string. + */ +void rxrpc_gen_version_string(void) +{ + snprintf(rxrpc_version_string, sizeof(rxrpc_version_string), + "linux-%.49s AF_RXRPC", UTS_RELEASE); +} /* * Reply to a version request diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9dbc43388e57..815c3e416bc5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1153,6 +1153,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, if (option_len > sizeof(struct geneve_opt)) data_len = option_len - sizeof(struct geneve_opt); + if (key->enc_opts.len > FLOW_DIS_TUN_OPTS_MAX - 4) + return -ERANGE; + opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len]; memset(opt, 0xff, option_len); opt->length = data_len / 4; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index fdb8f429333d..014209b1dd58 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1252,7 +1252,12 @@ static struct Qdisc *qdisc_create(struct net_device *dev, sch->parent = parent; if (handle == TC_H_INGRESS) { - sch->flags |= TCQ_F_INGRESS; + if (!(sch->flags & TCQ_F_INGRESS)) { + NL_SET_ERR_MSG(extack, + "Specified parent ID is reserved for ingress and clsact Qdiscs"); + err = -EINVAL; + goto err_out3; + } handle = TC_H_MAKE(TC_H_INGRESS, 0); } else { if (handle == 0) { @@ -1591,11 +1596,20 @@ replay: NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } + if (q->flags & TCQ_F_INGRESS) { + NL_SET_ERR_MSG(extack, + "Cannot regraft ingress or clsact Qdiscs"); + return -EINVAL; + } if (q == p || (p && check_loop(q, p, 0))) { NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); return -ELOOP; } + if (clid == TC_H_INGRESS) { + NL_SET_ERR_MSG(extack, "Ingress cannot graft directly"); + return -EINVAL; + } qdisc_refcount_inc(q); goto graft; } else { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 84838128b9c5..e43a45499372 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -80,6 +80,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, struct net_device *dev = qdisc_dev(sch); int err; + if (sch->parent != TC_H_INGRESS) + return -EOPNOTSUPP; + net_inc_ingress_queue(); mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); @@ -101,6 +104,9 @@ static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + if (sch->parent != TC_H_INGRESS) + return; + tcf_block_put_ext(q->block, sch, &q->block_info); net_dec_ingress_queue(); } @@ -134,7 +140,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = sizeof(struct ingress_sched_data), - .static_flags = TCQ_F_CPUSTATS, + .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -219,6 +225,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, struct net_device *dev = qdisc_dev(sch); int err; + if (sch->parent != TC_H_CLSACT) + return -EOPNOTSUPP; + net_inc_ingress_queue(); net_inc_egress_queue(); @@ -248,6 +257,9 @@ static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + if (sch->parent != TC_H_CLSACT) + return; + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); @@ -269,7 +281,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .cl_ops = &clsact_class_ops, .id = "clsact", .priv_size = sizeof(struct clsact_sched_data), - .static_flags = TCQ_F_CPUSTATS, + .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS, .init = clsact_init, .destroy = clsact_destroy, .dump = ingress_dump, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a0840b8c935b..7a8d9163d186 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -578,7 +578,10 @@ static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, { struct smc_buf_desc *buf_next; - if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + if (!buf_pos) + return _smc_llc_get_next_rmb(lgr, buf_lst); + + if (list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { (*buf_lst)++; return _smc_llc_get_next_rmb(lgr, buf_lst); } @@ -614,6 +617,8 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, goto out; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); for (i = 0; i < ext->num_rkeys; i++) { + while (buf_pos && !(buf_pos)->used) + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); if (!buf_pos) break; rmb = buf_pos; @@ -623,8 +628,6 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); - while (buf_pos && !(buf_pos)->used) - buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); } len += i * sizeof(ext->rt[0]); out: diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index da95abbb7ea3..f37f4a0fcd3c 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -20,7 +20,9 @@ static void tls_strp_abort_strp(struct tls_strparser *strp, int err) strp->stopped = 1; /* Report an error on the lower socket */ - strp->sk->sk_err = -err; + WRITE_ONCE(strp->sk->sk_err, -err); + /* Paired with smp_rmb() in tcp_poll() */ + smp_wmb(); sk_error_report(strp->sk); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6e6a7c37d685..1a53c8f481e9 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -70,7 +70,9 @@ noinline void tls_err_abort(struct sock *sk, int err) { WARN_ON_ONCE(err >= 0); /* sk->sk_err should contain a positive error code. */ - sk->sk_err = -err; + WRITE_ONCE(sk->sk_err, -err); + /* Paired with smp_rmb() in tcp_poll() */ + smp_wmb(); sk_error_report(sk); } diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index aa77bcae4807..3144f33196be 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -591,8 +591,9 @@ class YnlFamily(SpecFamily): print('Unexpected message: ' + repr(gm)) continue - rsp.append(self._decode(gm.raw_attrs, op.attr_set.name) - | gm.fixed_header_attrs) + rsp_msg = self._decode(gm.raw_attrs, op.attr_set.name) + rsp_msg.update(gm.fixed_header_attrs) + rsp.append(rsp_msg) if not rsp: return None diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 43a723626126..7b936a926859 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -9,7 +9,7 @@ TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq -TEST_FILES := settings +TEST_FILES := mptcp_lib.sh settings EXTRA_CLEAN := *.pcap diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index ef628b16fe9b..4eacdb1ab962 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + sec=$(date +%s) rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) ns="ns1-$rndh" @@ -31,6 +33,8 @@ cleanup() ip netns del $ns } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index a43d3e2f59bb..c1f7bac19942 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + time_start=$(date +%s) optstring="S:R:d:e:l:r:h4cm:f:tC" @@ -141,6 +143,8 @@ cleanup() done } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 26310c17b4c6..96f63172b8fe 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -10,6 +10,8 @@ # because it's invoked by variable name, see how the "tests" array is used #shellcheck disable=SC2317 +. "$(dirname "${0}")/mptcp_lib.sh" + ret=0 sin="" sinfail="" @@ -17,6 +19,7 @@ sout="" cin="" cinfail="" cinsent="" +tmpfile="" cout="" capout="" ns1="" @@ -136,6 +139,8 @@ cleanup_partial() check_tools() { + mptcp_lib_check_mptcp + if ! ip -Version &> /dev/null; then echo "SKIP: Could not run test without ip tool" exit $ksft_skip @@ -175,6 +180,7 @@ cleanup() { rm -f "$cin" "$cout" "$sinfail" rm -f "$sin" "$sout" "$cinsent" "$cinfail" + rm -f "$tmpfile" rm -rf $evts_ns1 $evts_ns2 cleanup_partial } @@ -383,9 +389,16 @@ check_transfer() fail_test return 1 fi - bytes="--bytes=${bytes}" + + # note: BusyBox's "cmp" command doesn't support --bytes + tmpfile=$(mktemp) + head --bytes="$bytes" "$in" > "$tmpfile" + mv "$tmpfile" "$in" + head --bytes="$bytes" "$out" > "$tmpfile" + mv "$tmpfile" "$out" + tmpfile="" fi - cmp -l "$in" "$out" ${bytes} | while read -r i a b; do + cmp -l "$in" "$out" | while read -r i a b; do local sum=$((0${a} + 0${b})) if [ $check_invert -eq 0 ] || [ $sum -ne $((0xff)) ]; then echo "[ FAIL ] $what does not match (in, out):" diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh new file mode 100644 index 000000000000..3286536b79d5 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -0,0 +1,40 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 + +readonly KSFT_FAIL=1 +readonly KSFT_SKIP=4 + +# SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES env var can be set when validating all +# features using the last version of the kernel and the selftests to make sure +# a test is not being skipped by mistake. +mptcp_lib_expect_all_features() { + [ "${SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES:-}" = "1" ] +} + +# $1: msg +mptcp_lib_fail_if_expected_feature() { + if mptcp_lib_expect_all_features; then + echo "ERROR: missing feature: ${*}" + exit ${KSFT_FAIL} + fi + + return 1 +} + +# $1: file +mptcp_lib_has_file() { + local f="${1}" + + if [ -f "${f}" ]; then + return 0 + fi + + mptcp_lib_fail_if_expected_feature "${f} file not found" +} + +mptcp_lib_check_mptcp() { + if ! mptcp_lib_has_file "/proc/sys/net/mptcp/enabled"; then + echo "SKIP: MPTCP support is not available" + exit ${KSFT_SKIP} + fi +} diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 1b70c0a304ce..ff5adbb9c7f2 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + ret=0 sin="" sout="" @@ -84,6 +86,8 @@ cleanup() rm -f "$sin" "$sout" } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 89839d1ff9d8..32f7533e0919 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + ksft_skip=4 ret=0 @@ -34,6 +36,8 @@ cleanup() ip netns del $ns1 } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 9f22f7e5027d..36a3c9d92e20 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + sec=$(date +%s) rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) ns1="ns1-$rndh" @@ -34,6 +36,8 @@ cleanup() done } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index b1eb7bce599d..8092399d911f 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -1,6 +1,10 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Cannot not run test without ip tool" |