diff options
author | David S. Miller <davem@davemloft.net> | 2022-01-02 12:07:39 +0000 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2022-01-02 12:07:39 +0000 |
commit | ab6dd952b2d044d04a9906a997d16f1e6e4db48d (patch) | |
tree | 75e91032fac8767eeb60d9e822681587588662a6 | |
parent | e63a02348958cd7cc8c8401c94de57ad97b5d06c (diff) | |
parent | a838f5084828ac987af0903c523d6a2f7882705c (diff) |
Merge branch 'smc-RDMA-net-namespace'
Tony Lu says:
====================
RDMA device net namespace support for SMC
This patch set introduces net namespace support for linkgroups.
Path 1 is the main approach to implement net ns support.
Path 2 - 4 are the additional modifications to let us know the netns.
Also, I will submit changes of smc-tools to github later.
Currently, smc doesn't support net namespace isolation. The ibdevs
registered to smc are shared for all linkgroups and connections. When
running applications in different net namespaces, such as container
environment, applications should only use the ibdevs that belongs to the
same net namespace.
This adds a new field, net, in smc linkgroup struct. During first
contact, it checks and find the linkgroup has same net namespace, if
not, it is going to create and initialized the net field with first
link's ibdev net namespace. When finding the rdma devices, it also checks
the sk net device's and ibdev's net namespaces. After net namespace
destroyed, the net device and ibdev move to root net namespace,
linkgroups won't be matched, and wait for lgr free.
If rdma net namespace exclusive mode is not enabled, it behaves as
before.
Steps to enable and test net namespaces:
1. enable RDMA device net namespace exclusive support
rdma system set netns exclusive # default is shared
2. create new net namespace, move and initialize them
ip netns add test1
rdma dev set mlx5_1 netns test1
ip link set dev eth2 netns test1
ip netns exec test1 ip link set eth2 up
ip netns exec test1 ip addr add ${HOST_IP}/26 dev eth2
3. setup server and client, connect N <-> M
ip netns exec test1 smc_run sockperf server --tcp # server
ip netns exec test1 smc_run sockperf pp --tcp -i ${SERVER_IP} # client
4. netns isolated linkgroups (2 * 2 mesh) with their own linkgroups
- server
LG-ID LG-Role LG-Type VLAN #Conns PNET-ID
00000100 SERV SINGLE 0 0
00000200 SERV SINGLE 0 0
00000300 SERV SINGLE 0 0
00000400 SERV SINGLE 0 0
- client
LG-ID LG-Role LG-Type VLAN #Conns PNET-ID
00000100 CLNT SINGLE 0 0
00000200 CLNT SINGLE 0 0
00000300 CLNT SINGLE 0 0
00000400 CLNT SINGLE 0 0
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/uapi/linux/smc.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/smc_diag.h | 11 | ||||
-rw-r--r-- | net/smc/smc_core.c | 31 | ||||
-rw-r--r-- | net/smc/smc_core.h | 2 | ||||
-rw-r--r-- | net/smc/smc_diag.c | 16 | ||||
-rw-r--r-- | net/smc/smc_ib.h | 7 | ||||
-rw-r--r-- | net/smc/smc_llc.c | 19 | ||||
-rw-r--r-- | net/smc/smc_pnet.c | 21 | ||||
-rw-r--r-- | net/smc/smc_tracepoint.h | 23 |
9 files changed, 92 insertions, 40 deletions
diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 20f33b27787f..6c2874fd2c00 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -119,6 +119,8 @@ enum { SMC_NLA_LGR_R_CONNS_NUM, /* u32 */ SMC_NLA_LGR_R_V2_COMMON, /* nest */ SMC_NLA_LGR_R_V2, /* nest */ + SMC_NLA_LGR_R_NET_COOKIE, /* u64 */ + SMC_NLA_LGR_R_PAD, /* flag */ __SMC_NLA_LGR_R_MAX, SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1 }; diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 8cb3a6fef553..c7008d87f1a4 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -84,11 +84,12 @@ struct smc_diag_conninfo { /* SMC_DIAG_LINKINFO */ struct smc_diag_linkinfo { - __u8 link_id; /* link identifier */ - __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ - __u8 ibport; /* RDMA device port number */ - __u8 gid[40]; /* local GID */ - __u8 peer_gid[40]; /* peer GID */ + __u8 link_id; /* link identifier */ + __u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */ + __u8 ibport; /* RDMA device port number */ + __u8 gid[40]; /* local GID */ + __u8 peer_gid[40]; /* peer GID */ + __aligned_u64 net_cookie; /* RDMA device net namespace */ }; struct smc_diag_lgrinfo { diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 20d1417f0a68..4ecf1c702b57 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -348,6 +348,9 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE, + lgr->net->net_cookie, SMC_NLA_LGR_R_PAD)) + goto errattr; memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) @@ -897,6 +900,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) smc_wr_free_lgr_mem(lgr); goto free_wq; } + lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); @@ -1548,9 +1552,9 @@ void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) lgr_type = "ASYMMETRIC_LOCAL"; break; } - pr_warn_ratelimited("smc: SMC-R lg %*phN state changed: " + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: " "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, - lgr_type, lgr->pnet_id); + lgr->net->net_cookie, lgr_type, lgr->pnet_id); } /* set new lgr type and tag a link as asymmetric */ @@ -1585,7 +1589,8 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, SMC_MAX_PNETID_LEN) || lgr->type == SMC_LGR_SYMMETRIC || - lgr->type == SMC_LGR_ASYMMETRIC_PEER) + lgr->type == SMC_LGR_ASYMMETRIC_PEER || + !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; /* trigger local add link processing */ @@ -1743,8 +1748,10 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, u8 peer_systemid[], u8 peer_gid[], u8 peer_mac_v1[], - enum smc_lgr_role role, u32 clcqpn) + enum smc_lgr_role role, u32 clcqpn, + struct net *net) { + struct smc_link *lnk; int i; if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || @@ -1752,12 +1759,17 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, return false; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (!smc_link_active(&lgr->lnk[i])) + lnk = &lgr->lnk[i]; + + if (!smc_link_active(lnk)) continue; - if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && - !memcmp(lgr->lnk[i].peer_gid, peer_gid, SMC_GID_SIZE) && + /* use verbs API to check netns, instead of lgr->net */ + if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net)) + return false; + if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) && + !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) && (smcr_version == SMC_V2 || - !memcmp(lgr->lnk[i].peer_mac, peer_mac_v1, ETH_ALEN))) + !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN))) return true; } return false; @@ -1773,6 +1785,7 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct net *net = sock_net(&smc->sk); struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; @@ -1799,7 +1812,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) smcr_lgr_match(lgr, ini->smcr_version, ini->peer_systemid, ini->peer_gid, ini->peer_mac, role, - ini->ib_clcqpn)) && + ini->ib_clcqpn, net)) && !lgr->sync_err && (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index d63b08274197..1e4e90faf657 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -306,6 +306,8 @@ struct smc_link_group { u8 nexthop_mac[ETH_ALEN]; u8 uses_gateway; __be32 saddr; + /* net namespace */ + struct net *net; }; struct { /* SMC-D */ u64 peer_gid; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index c952986a6aca..7c8dad28c18d 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -145,19 +145,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (smc->conn.lgr && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { + struct smc_link *link = smc->conn.lnk; + struct net *net = read_pnet(&link->smcibdev->ibdev->coredev.rdma_net); + struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, - .lnk[0].ibport = smc->conn.lnk->ibport, - .lnk[0].link_id = smc->conn.lnk->link_id, + .lnk[0].ibport = link->ibport, + .lnk[0].link_id = link->link_id, + .lnk[0].net_cookie = net->net_cookie, }; memcpy(linfo.lnk[0].ibname, smc->conn.lgr->lnk[0].smcibdev->ibdev->name, - sizeof(smc->conn.lnk->smcibdev->ibdev->name)); - smc_gid_be16_convert(linfo.lnk[0].gid, - smc->conn.lnk->gid); - smc_gid_be16_convert(linfo.lnk[0].peer_gid, - smc->conn.lnk->peer_gid); + sizeof(link->smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index bfa1c6bf6313..5d8b49c57f50 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -69,6 +69,13 @@ static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE]) return cpu_to_be32(INADDR_NONE); } +static inline struct net *smc_ib_net(struct smc_ib_device *smcibdev) +{ + if (smcibdev && smcibdev->ibdev) + return read_pnet(&smcibdev->ibdev->coredev.rdma_net); + return NULL; +} + struct smc_init_info_smcrv2; struct smc_buf_desc; struct smc_link; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 3e9fd8a3124c..c4d057b2941d 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -242,9 +242,10 @@ static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type, } /* drop parallel or already-in-progress llc requests */ if (flow_type != msg_type) - pr_warn_once("smc: SMC-R lg %*phN dropped parallel " + pr_warn_once("smc: SMC-R lg %*phN net %llu dropped parallel " "LLC msg: msg %d flow %d role %d\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr->net->net_cookie, qentry->msg.raw.hdr.common.type, flow_type, lgr->role); kfree(qentry); @@ -359,9 +360,10 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, smc_llc_flow_qentry_clr(flow)); return NULL; } - pr_warn_once("smc: SMC-R lg %*phN dropped unexpected LLC msg: " + pr_warn_once("smc: SMC-R lg %*phN net %llu dropped unexpected LLC msg: " "msg %d exp %d flow %d role %d flags %x\n", - SMC_LGR_ID_SIZE, &lgr->id, rcv_msg, exp_msg, + SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie, + rcv_msg, exp_msg, flow->type, lgr->role, flow->qentry->msg.raw.hdr.flags); smc_llc_flow_qentry_del(flow); @@ -1816,8 +1818,9 @@ finish: static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type) { - pr_warn_ratelimited("smc: SMC-R lg %*phN LLC protocol violation: " - "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, type); + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu LLC protocol violation: " + "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr->net->net_cookie, type); smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL); smc_lgr_terminate_sched(lgr); } @@ -2146,9 +2149,10 @@ int smc_llc_link_init(struct smc_link *link) void smc_llc_link_active(struct smc_link *link) { - pr_warn_ratelimited("smc: SMC-R lg %*phN link added: id %*phN, " + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link added: id %*phN, " "peerid %*phN, ibdev %s, ibport %d\n", SMC_LGR_ID_SIZE, &link->lgr->id, + link->lgr->net->net_cookie, SMC_LGR_ID_SIZE, &link->link_uid, SMC_LGR_ID_SIZE, &link->peer_link_uid, link->smcibdev->ibdev->name, link->ibport); @@ -2164,9 +2168,10 @@ void smc_llc_link_active(struct smc_link *link) void smc_llc_link_clear(struct smc_link *link, bool log) { if (log) - pr_warn_ratelimited("smc: SMC-R lg %*phN link removed: id %*phN" + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link removed: id %*phN" ", peerid %*phN, ibdev %s, ibport %d\n", SMC_LGR_ID_SIZE, &link->lgr->id, + link->lgr->net->net_cookie, SMC_LGR_ID_SIZE, &link->link_uid, SMC_LGR_ID_SIZE, &link->peer_link_uid, link->smcibdev->ibdev->name, link->ibport); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index e171cc6483f8..db9825c01e0a 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -977,14 +977,16 @@ static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, - struct smc_ib_device *known_dev) + struct smc_ib_device *known_dev, + struct net *net) { struct smc_ib_device *ibdev; int i; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - if (ibdev == known_dev) + if (ibdev == known_dev || + !rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) @@ -1001,12 +1003,14 @@ out: mutex_unlock(&smc_ib_devices.mutex); } -/* find alternate roce device with same pnet_id and vlan_id */ +/* find alternate roce device with same pnet_id, vlan_id and net namespace */ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, struct smc_init_info *ini, struct smc_ib_device *known_dev) { - _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev); + struct net *net = lgr->net; + + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); } /* if handshake network device belongs to a roce device, return its @@ -1015,6 +1019,7 @@ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { + struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); @@ -1022,6 +1027,10 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct net_device *ndev; int i; + /* check rdma net namespace */ + if (!rdma_dev_access_netns(ibdev->ibdev, net)) + continue; + for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; @@ -1052,15 +1061,17 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net *net; ndev = pnet_find_base_ndev(ndev); + net = dev_net(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index ec17f29646f5..9fc5e586d24a 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -22,6 +22,7 @@ TRACE_EVENT(smc_switch_to_fallback, TP_STRUCT__entry( __field(const void *, sk) __field(const void *, clcsk) + __field(u64, net_cookie) __field(int, fallback_rsn) ), @@ -31,11 +32,13 @@ TRACE_EVENT(smc_switch_to_fallback, __entry->sk = sk; __entry->clcsk = clcsk; + __entry->net_cookie = sock_net(sk)->net_cookie; __entry->fallback_rsn = fallback_rsn; ), - TP_printk("sk=%p clcsk=%p fallback_rsn=%d", - __entry->sk, __entry->clcsk, __entry->fallback_rsn) + TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d", + __entry->sk, __entry->clcsk, + __entry->net_cookie, __entry->fallback_rsn) ); DECLARE_EVENT_CLASS(smc_msg_event, @@ -46,19 +49,23 @@ DECLARE_EVENT_CLASS(smc_msg_event, TP_STRUCT__entry( __field(const void *, smc) + __field(u64, net_cookie) __field(size_t, len) __string(name, smc->conn.lnk->ibname) ), TP_fast_assign( + const struct sock *sk = &smc->sk; + __entry->smc = smc; + __entry->net_cookie = sock_net(sk)->net_cookie; __entry->len = len; __assign_str(name, smc->conn.lnk->ibname); ), - TP_printk("smc=%p len=%zu dev=%s", - __entry->smc, __entry->len, - __get_str(name)) + TP_printk("smc=%p net=%llu len=%zu dev=%s", + __entry->smc, __entry->net_cookie, + __entry->len, __get_str(name)) ); DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg, @@ -84,6 +91,7 @@ TRACE_EVENT(smcr_link_down, TP_STRUCT__entry( __field(const void *, lnk) __field(const void *, lgr) + __field(u64, net_cookie) __field(int, state) __string(name, lnk->ibname) __field(void *, location) @@ -94,13 +102,14 @@ TRACE_EVENT(smcr_link_down, __entry->lnk = lnk; __entry->lgr = lgr; + __entry->net_cookie = lgr->net->net_cookie; __entry->state = lnk->state; __assign_str(name, lnk->ibname); __entry->location = location; ), - TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%pS", - __entry->lnk, __entry->lgr, + TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS", + __entry->lnk, __entry->lgr, __entry->net_cookie, __entry->state, __get_str(name), __entry->location) ); |