diff options
author | David S. Miller <davem@davemloft.net> | 2021-04-16 15:23:10 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2021-04-16 15:23:10 -0700 |
commit | c133acf38ca4ab498d0bfa25f3c218c263f6664a (patch) | |
tree | f5e5bc85146e250aebe7c242338e6e7252bf8620 | |
parent | a1150a04b7e8caee235e38996e042e1bcb1a6574 (diff) | |
parent | dc65fe82fb07e610e03a9b05bd445f46f93175f5 (diff) |
Merge branch 'mptcp-socket-options'
Mat Martineau says:
====================
mptcp: Improve socket option handling
MPTCP sockets have previously had limited socket option support. The
architecture of MPTCP sockets (one userspace-facing MPTCP socket that
manages one or more in-kernel TCP subflow sockets) adds complexity for
passing options through to lower levels. This patch set adds MPTCP
support for socket options commonly used with TCP.
Patch 1 reverts an interim socket option fix (a socket option blocklist)
that was merged in the net tree for v5.12.
Patch 2 moves the socket option code to a separate file, with no
functional changes.
Patch 3 adds an allowlist for socket options that are known to function
with MPTCP. Later patches in this set add more allowed options.
Patches 4 and 5 add infrastructure for syncing MPTCP-level options with
the TCP subflows.
Patches 6-12 add support for specific socket options.
Patch 13 adds a socket option self test.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/mptcp/Makefile | 2 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 219 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 16 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 756 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 5 | ||||
-rw-r--r-- | tools/testing/selftests/net/mptcp/Makefile | 2 | ||||
-rw-r--r-- | tools/testing/selftests/net/mptcp/mptcp_connect.c | 23 | ||||
-rwxr-xr-x | tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 276 |
8 files changed, 1122 insertions, 177 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index a611968be4d7..d2642c012a6a 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o + mib.o pm_netlink.o sockopt.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8009b3f8e4c1..073e20078ed0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -90,16 +90,6 @@ static bool mptcp_is_tcpsk(struct sock *sk) return false; } -static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) -{ - sock_owned_by_me((const struct sock *)msk); - - if (likely(!__mptcp_check_fallback(msk))) - return NULL; - - return msk->first; -} - static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -740,18 +730,47 @@ wake: sk->sk_data_ready(sk); } -void __mptcp_flush_join_list(struct mptcp_sock *msk) +static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; + bool ret = false; if (likely(list_empty(&msk->join_list))) - return; + return false; spin_lock_bh(&msk->join_list_lock); - list_for_each_entry(subflow, &msk->join_list, node) + list_for_each_entry(subflow, &msk->join_list, node) { + u32 sseq = READ_ONCE(subflow->setsockopt_seq); + mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); + if (READ_ONCE(msk->setsockopt_seq) != sseq) + ret = true; + } list_splice_tail_init(&msk->join_list, &msk->conn_list); spin_unlock_bh(&msk->join_list_lock); + + return ret; +} + +void __mptcp_flush_join_list(struct mptcp_sock *msk) +{ + if (likely(!mptcp_do_flush_join_list(msk))) + return; + + if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags)) + mptcp_schedule_work((struct sock *)msk); +} + +static void mptcp_flush_join_list(struct mptcp_sock *msk) +{ + bool 
sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags); + + might_sleep(); + + if (!mptcp_do_flush_join_list(msk) && !sync_needed) + return; + + mptcp_sockopt_sync_all(msk); } static bool mptcp_timer_pending(struct sock *sk) @@ -1467,7 +1486,7 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags) int ret = 0; prev_ssk = ssk; - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); /* try to keep the subflow socket lock across @@ -1893,7 +1912,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) unsigned int moved = 0; bool ret, done; - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; @@ -2317,7 +2336,7 @@ static void mptcp_worker(struct work_struct *work) goto unlock; mptcp_check_data_fin_ack(sk); - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_check_fastclose(msk); @@ -2380,6 +2399,9 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); + + tcp_assign_congestion_control(sk); + return 0; } @@ -2517,7 +2539,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk) } } - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2573,6 +2595,8 @@ static void __mptcp_destroy_sock(struct sock *sk) WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); + + tcp_cleanup_congestion_control(sk); sk_refcnt_debug_release(sk); mptcp_dispose_initial_subflow(msk); sock_put(sk); @@ -2654,7 +2678,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - __mptcp_flush_join_list(msk); + 
mptcp_do_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -2703,6 +2728,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->snd_nxt = msk->write_seq; msk->snd_una = msk->write_seq; msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; + msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; if (mp_opt->mp_capable) { msk->can_ack = true; @@ -2811,161 +2837,6 @@ static void mptcp_destroy(struct sock *sk) sk_sockets_allocated_dec(sk); } -static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct sock *sk = (struct sock *)msk; - struct socket *ssock; - int ret; - - switch (optname) { - case SO_REUSEPORT: - case SO_REUSEADDR: - lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - release_sock(sk); - return -EINVAL; - } - - ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); - if (ret == 0) { - if (optname == SO_REUSEPORT) - sk->sk_reuseport = ssock->sk->sk_reuseport; - else if (optname == SO_REUSEADDR) - sk->sk_reuse = ssock->sk->sk_reuse; - } - release_sock(sk); - return ret; - } - - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); -} - -static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct sock *sk = (struct sock *)msk; - int ret = -EOPNOTSUPP; - struct socket *ssock; - - switch (optname) { - case IPV6_V6ONLY: - lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - release_sock(sk); - return -EINVAL; - } - - ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); - if (ret == 0) - sk->sk_ipv6only = ssock->sk->sk_ipv6only; - - release_sock(sk); - break; - } - - return ret; -} - -static bool mptcp_unsupported(int level, int optname) -{ - if (level == SOL_IP) { - switch (optname) { - case IP_ADD_MEMBERSHIP: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case 
IP_DROP_SOURCE_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: - return true; - } - return false; - } - if (level == SOL_IPV6) { - switch (optname) { - case IPV6_ADDRFORM: - case IPV6_ADD_MEMBERSHIP: - case IPV6_DROP_MEMBERSHIP: - case IPV6_JOIN_ANYCAST: - case IPV6_LEAVE_ANYCAST: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: - return true; - } - return false; - } - return false; -} - -static int mptcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - pr_debug("msk=%p", msk); - - if (mptcp_unsupported(level, optname)) - return -ENOPROTOOPT; - - if (level == SOL_SOCKET) - return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); - - /* @@ the meaning of setsockopt() when the socket is connected and - * there are multiple subflows is not yet defined. It is up to the - * MPTCP-level socket to configure the subflows until the subflow - * is in TCP fallback, when TCP socket options are passed through - * to the one remaining subflow. 
- */ - lock_sock(sk); - ssk = __mptcp_tcp_fallback(msk); - release_sock(sk); - if (ssk) - return tcp_setsockopt(ssk, level, optname, optval, optlen); - - if (level == SOL_IPV6) - return mptcp_setsockopt_v6(msk, optname, optval, optlen); - - return -EOPNOTSUPP; -} - -static int mptcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - pr_debug("msk=%p", msk); - - /* @@ the meaning of setsockopt() when the socket is connected and - * there are multiple subflows is not yet defined. It is up to the - * MPTCP-level socket to configure the subflows until the subflow - * is in TCP fallback, when socket options are passed through - * to the one remaining subflow. - */ - lock_sock(sk); - ssk = __mptcp_tcp_fallback(msk); - release_sock(sk); - if (ssk) - return tcp_getsockopt(ssk, level, optname, optval, option); - - return -EOPNOTSUPP; -} - void __mptcp_data_acked(struct sock *sk) { if (!sock_owned_by_user(sk)) @@ -3375,7 +3246,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. 
*/ - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d8de1e961ab0..df269c26f145 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -108,6 +108,7 @@ #define MPTCP_CLEAN_UNA 7 #define MPTCP_ERROR_REPORT 8 #define MPTCP_RETRANSMIT 9 +#define MPTCP_WORK_SYNC_SETSOCKOPT 10 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -255,6 +256,8 @@ struct mptcp_sock { u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; + + u32 setsockopt_seq; }; #define mptcp_lock_sock(___sk, cb) do { \ @@ -413,6 +416,8 @@ struct mptcp_subflow_context { long delegated_status; struct list_head delegated_node; /* link into delegated_action, protected by local BH */ + u32 setsockopt_seq; + struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; @@ -571,6 +576,11 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); +int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option); + void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); @@ -730,6 +740,12 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); + +void 
mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); +void mptcp_sockopt_sync_all(struct mptcp_sock *msk); + static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c new file mode 100644 index 000000000000..00d941b66c1e --- /dev/null +++ b/net/mptcp/sockopt.c @@ -0,0 +1,756 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2021, Red Hat. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <net/sock.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/mptcp.h> +#include "protocol.h" + +static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); + + if (likely(!__mptcp_check_fallback(msk))) + return NULL; + + return msk->first; +} + +static u32 sockopt_seq_reset(const struct sock *sk) +{ + sock_owned_by_me(sk); + + /* Highbits contain state. Allows to distinguish sockopt_seq + * of listener and established: + * s0 = new_listener() + * sockopt(s0) - seq is 1 + * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0) + * sockopt(s0) - seq increments to 2 on s0 + * sockopt(s1) // seq increments to 2 on s1 (different option) + * new ssk completes join, inherits options from s0 // seq 2 + * Needs sync from mptcp join logic, but ssk->seq == msk->seq + * + * Set High order bits to sk_state so ssk->seq == msk->seq test + * will fail. 
+ */ + + return (u32)sk->sk_state << 24u; +} + +static void sockopt_seq_inc(struct mptcp_sock *msk) +{ + u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff; + + msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq; +} + +static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen, int *val) +{ + if (optlen < sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(val, optval, sizeof(*val))) + return -EFAULT; + + return 0; +} + +static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + + lock_sock(sk); + sockopt_seq_inc(msk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + switch (optname) { + case SO_DEBUG: + sock_valbool_flag(ssk, SOCK_DBG, !!val); + break; + case SO_KEEPALIVE: + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, !!val); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val); + break; + case SO_PRIORITY: + ssk->sk_priority = val; + break; + case SO_SNDBUF: + case SO_SNDBUFFORCE: + ssk->sk_userlocks |= SOCK_SNDBUF_LOCK; + WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + break; + case SO_RCVBUF: + case SO_RCVBUFFORCE: + ssk->sk_userlocks |= SOCK_RCVBUF_LOCK; + WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + break; + case SO_MARK: + if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { + ssk->sk_mark = sk->sk_mark; + sk_dst_reset(ssk); + } + break; + case SO_INCOMING_CPU: + WRITE_ONCE(ssk->sk_incoming_cpu, val); + break; + } + + subflow->setsockopt_seq = msk->setsockopt_seq; + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); +} + +static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val) +{ + sockptr_t optval = KERNEL_SOCKPTR(&val); + struct sock *sk = (struct sock *)msk; + int ret; + + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + optval, sizeof(val)); + if (ret) + return 
ret; + + mptcp_sol_socket_sync_intval(msk, optname, val); + return 0; +} + +static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) +{ + struct sock *sk = (struct sock *)msk; + + WRITE_ONCE(sk->sk_incoming_cpu, val); + + mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); +} + +static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + int val, ret; + + ret = mptcp_get_int_option(msk, optval, optlen, &val); + if (ret) + return ret; + + switch (optname) { + case SO_KEEPALIVE: + mptcp_sol_socket_sync_intval(msk, optname, val); + return 0; + case SO_DEBUG: + case SO_MARK: + case SO_PRIORITY: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + return mptcp_sol_socket_intval(msk, optname, val); + case SO_INCOMING_CPU: + mptcp_so_incoming_cpu(msk, val); + return 0; + } + + return -ENOPROTOOPT; +} + +static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct linger ling; + sockptr_t kopt; + int ret; + + if (optlen < sizeof(ling)) + return -EINVAL; + + if (copy_from_sockptr(&ling, optval, sizeof(ling))) + return -EFAULT; + + kopt = KERNEL_SOCKPTR(&ling); + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER, kopt, sizeof(ling)); + if (ret) + return ret; + + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + if (!ling.l_onoff) { + sock_reset_flag(ssk, SOCK_LINGER); + } else { + ssk->sk_lingertime = sk->sk_lingertime; + sock_set_flag(ssk, SOCK_LINGER); + } + + subflow->setsockopt_seq = msk->setsockopt_seq; + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); + return 0; +} + +static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int 
optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret; + + switch (optname) { + case SO_REUSEPORT: + case SO_REUSEADDR: + case SO_BINDTODEVICE: + case SO_BINDTOIFINDEX: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + if (ret == 0) { + if (optname == SO_REUSEPORT) + sk->sk_reuseport = ssock->sk->sk_reuseport; + else if (optname == SO_REUSEADDR) + sk->sk_reuse = ssock->sk->sk_reuse; + else if (optname == SO_BINDTODEVICE) + sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + else if (optname == SO_BINDTOIFINDEX) + sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + } + release_sock(sk); + return ret; + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + case SO_MARK: + case SO_INCOMING_CPU: + case SO_DEBUG: + return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); + case SO_LINGER: + return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); + case SO_NO_CHECK: + case SO_DONTROUTE: + case SO_BROADCAST: + case SO_BSDCOMPAT: + case SO_PASSCRED: + case SO_PASSSEC: + case SO_RXQ_OVFL: + case SO_WIFI_STATUS: + case SO_NOFCS: + case SO_SELECT_ERR_QUEUE: + return 0; + } + + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); +} + +static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + switch (optname) { + case IPV6_V6ONLY: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + if (ret == 0) + sk->sk_ipv6only = ssock->sk->sk_ipv6only; + + release_sock(sk); + break; + } + + return ret; +} + +static bool 
mptcp_supported_sockopt(int level, int optname) +{ + if (level == SOL_SOCKET) { + switch (optname) { + case SO_DEBUG: + case SO_REUSEPORT: + case SO_REUSEADDR: + + /* the following ones need a better implementation, + * but are quite common we want to preserve them + */ + case SO_BINDTODEVICE: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_LINGER: + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + case SO_RCVLOWAT: + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + case SO_MARK: + case SO_INCOMING_CPU: + case SO_BINDTOIFINDEX: + case SO_BUSY_POLL: + case SO_PREFER_BUSY_POLL: + case SO_BUSY_POLL_BUDGET: + + /* next ones are no-op for plain TCP */ + case SO_NO_CHECK: + case SO_DONTROUTE: + case SO_BROADCAST: + case SO_BSDCOMPAT: + case SO_PASSCRED: + case SO_PASSSEC: + case SO_RXQ_OVFL: + case SO_WIFI_STATUS: + case SO_NOFCS: + case SO_SELECT_ERR_QUEUE: + return true; + } + + /* SO_OOBINLINE is not supported, let's avoid the related mess */ + /* SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, + * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, + * we must be careful with subflows + */ + /* SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks + * explicitly the sk_protocol field + */ + /* SO_PEEK_OFF is unsupported, as it is for plain TCP */ + /* SO_MAX_PACING_RATE is unsupported, we must be careful with subflows */ + /* SO_CNX_ADVICE is currently unsupported, could possibly be relevant, + * but likely needs careful design + */ + /* SO_ZEROCOPY is currently unsupported, TODO in sndmsg */ + /* SO_TXTIME is currently unsupported */ + return false; + } + if (level == SOL_IP) { + switch (optname) { + /* should work fine */ + case IP_FREEBIND: + case IP_TRANSPARENT: + + /* the following are control cmsg 
related */ + case IP_PKTINFO: + case IP_RECVTTL: + case IP_RECVTOS: + case IP_RECVOPTS: + case IP_RETOPTS: + case IP_PASSSEC: + case IP_RECVORIGDSTADDR: + case IP_CHECKSUM: + case IP_RECVFRAGSIZE: + + /* common stuff that need some love */ + case IP_TOS: + case IP_TTL: + case IP_BIND_ADDRESS_NO_PORT: + case IP_MTU_DISCOVER: + case IP_RECVERR: + + /* possibly less common may deserve some love */ + case IP_MINTTL: + + /* the following is apparently a no-op for plain TCP */ + case IP_RECVERR_RFC4884: + return true; + } + + /* IP_OPTIONS is not supported, needs subflow care */ + /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */ + /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF, + * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP, + * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE, + * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP, + * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, + * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal + * with mcast stuff + */ + /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */ + return false; + } + if (level == SOL_IPV6) { + switch (optname) { + case IPV6_V6ONLY: + + /* the following are control cmsg related */ + case IPV6_RECVPKTINFO: + case IPV6_2292PKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_2292HOPLIMIT: + case IPV6_RECVRTHDR: + case IPV6_2292RTHDR: + case IPV6_RECVHOPOPTS: + case IPV6_2292HOPOPTS: + case IPV6_RECVDSTOPTS: + case IPV6_2292DSTOPTS: + case IPV6_RECVTCLASS: + case IPV6_FLOWINFO: + case IPV6_RECVPATHMTU: + case IPV6_RECVORIGDSTADDR: + case IPV6_RECVFRAGSIZE: + + /* the following ones need some love but are quite common */ + case IPV6_TCLASS: + case IPV6_TRANSPARENT: + case IPV6_FREEBIND: + case IPV6_PKTINFO: + case IPV6_2292PKTOPTIONS: + case IPV6_UNICAST_HOPS: + case IPV6_MTU_DISCOVER: + case IPV6_MTU: + case IPV6_RECVERR: + case IPV6_FLOWINFO_SEND: + case IPV6_FLOWLABEL_MGR: + case 
IPV6_MINHOPCOUNT: + case IPV6_DONTFRAG: + case IPV6_AUTOFLOWLABEL: + + /* the following one is a no-op for plain TCP */ + case IPV6_RECVERR_RFC4884: + return true; + } + + /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are + * not supported + */ + /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF, + * IPV6_MULTICAST_IF, IPV6_ADDRFORM, + * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST, + * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP, + * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP, + * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER + * are not supported better not deal with mcast + */ + /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */ + + /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */ + /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */ + return false; + } + if (level == SOL_TCP) { + switch (optname) { + /* the following are no-op or should work just fine */ + case TCP_THIN_DUPACK: + case TCP_DEFER_ACCEPT: + + /* the following need some love */ + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_CONGESTION: + case TCP_ULP: + case TCP_CORK: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_SAVE_SYN: + case TCP_LINGER2: + case TCP_WINDOW_CLAMP: + case TCP_QUICKACK: + case TCP_USER_TIMEOUT: + case TCP_TIMESTAMP: + case TCP_NOTSENT_LOWAT: + case TCP_TX_DELAY: + return true; + } + + /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */ + + /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, + * TCP_REPAIR_WINDOW are not supported, better avoid this mess + */ + /* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE, + * are not supported fastopen is currently unsupported + */ + /* TCP_INQ is currently unsupported, needs some recvmsg work */ + } + return false; +} + 
+static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + char name[TCP_CA_NAME_MAX]; + bool cap_net_admin; + int ret; + + if (optlen < 1) + return -EINVAL; + + ret = strncpy_from_sockptr(name, optval, + min_t(long, TCP_CA_NAME_MAX - 1, optlen)); + if (ret < 0) + return -EFAULT; + + name[ret] = 0; + + cap_net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN); + + ret = 0; + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err; + + lock_sock(ssk); + err = tcp_set_congestion_control(ssk, name, true, cap_net_admin); + if (err < 0 && ret == 0) + ret = err; + subflow->setsockopt_seq = msk->setsockopt_seq; + release_sock(ssk); + } + + if (ret == 0) + tcp_set_congestion_control(sk, name, false, cap_net_admin); + + release_sock(sk); + return ret; +} + +static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + switch (optname) { + case TCP_ULP: + return -EOPNOTSUPP; + case TCP_CONGESTION: + return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen); + } + + return -EOPNOTSUPP; +} + +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sock *ssk; + + pr_debug("msk=%p", msk); + + if (!mptcp_supported_sockopt(level, optname)) + return -ENOPROTOOPT; + + if (level == SOL_SOCKET) + return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when TCP socket options are passed through + * to the one remaining subflow. 
+ */ + lock_sock(sk); + ssk = __mptcp_tcp_fallback(msk); + release_sock(sk); + if (ssk) + return tcp_setsockopt(ssk, level, optname, optval, optlen); + + if (level == SOL_IPV6) + return mptcp_setsockopt_v6(msk, optname, optval, optlen); + + if (level == SOL_TCP) + return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen); + + return -EOPNOTSUPP; +} + +static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret = -EINVAL; + struct sock *ssk; + + lock_sock(sk); + ssk = msk->first; + if (ssk) { + ret = tcp_getsockopt(ssk, level, optname, optval, optlen); + goto out; + } + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + goto out; + + ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen); + +out: + release_sock(sk); + return ret; +} + +static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, + char __user *optval, int __user *optlen) +{ + switch (optname) { + case TCP_ULP: + case TCP_CONGESTION: + case TCP_INFO: + case TCP_CC_INFO: + return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, + optval, optlen); + } + return -EOPNOTSUPP; +} + +int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sock *ssk; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when socket options are passed through + * to the one remaining subflow. 
+ */ + lock_sock(sk); + ssk = __mptcp_tcp_fallback(msk); + release_sock(sk); + if (ssk) + return tcp_getsockopt(ssk, level, optname, optval, option); + + if (level == SOL_TCP) + return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); + return -EOPNOTSUPP; +} + +static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) +{ + static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; + struct sock *sk = (struct sock *)msk; + + if (ssk->sk_prot->keepalive) { + if (sock_flag(sk, SOCK_KEEPOPEN)) + ssk->sk_prot->keepalive(ssk, 1); + else + ssk->sk_prot->keepalive(ssk, 0); + } + + ssk->sk_priority = sk->sk_priority; + ssk->sk_bound_dev_if = sk->sk_bound_dev_if; + ssk->sk_incoming_cpu = sk->sk_incoming_cpu; + + if (sk->sk_userlocks & tx_rx_locks) { + ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks; + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + } + + if (sock_flag(sk, SOCK_LINGER)) { + ssk->sk_lingertime = sk->sk_lingertime; + sock_set_flag(ssk, SOCK_LINGER); + } else { + sock_reset_flag(ssk, SOCK_LINGER); + } + + if (sk->sk_mark != ssk->sk_mark) { + ssk->sk_mark = sk->sk_mark; + sk_dst_reset(ssk); + } + + sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG)); + + if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops) + tcp_set_congestion_control(ssk, inet_csk(sk)->icsk_ca_ops->name, false, true); +} + +static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) +{ + bool slow = lock_sock_fast(ssk); + + sync_socket_options(msk, ssk); + + unlock_sock_fast(ssk, slow); +} + +void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + msk_owned_by_me(msk); + + if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { + __mptcp_sockopt_sync(msk, ssk); + + subflow->setsockopt_seq = 
msk->setsockopt_seq; + } +} + +void mptcp_sockopt_sync_all(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + u32 seq; + + seq = sockopt_seq_reset(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u32 sseq = READ_ONCE(subflow->setsockopt_seq); + + if (sseq != msk->setsockopt_seq) { + __mptcp_sockopt_sync(msk, ssk); + WRITE_ONCE(subflow->setsockopt_seq, seq); + } else if (sseq != seq) { + WRITE_ONCE(subflow->setsockopt_seq, seq); + } + + cond_resched(); + } + + msk->setsockopt_seq = seq; +} diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3c19a5265a0f..c3da84576b3c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -679,6 +679,9 @@ create_child: goto out; } + /* ssk inherits options of listener sk */ + ctx->setsockopt_seq = listener->setsockopt_seq; + if (ctx->mp_capable) { /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space @@ -694,6 +697,7 @@ create_child: * created mptcp socket */ new_msk->sk_destruct = mptcp_sock_destruct; + mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; @@ -1317,6 +1321,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, mptcp_info2sockaddr(remote, &addr, ssk->sk_family); mptcp_add_pending_subflow(msk, subflow); + mptcp_sockopt_sync(msk, ssk); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) goto failed_unlink; diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 00bb158b4a5d..f1464f09b080 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -6,7 +6,7 @@ KSFT_KHDR_INSTALL := 1 CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include 
TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ - simult_flows.sh + simult_flows.sh mptcp_sockopt.sh TEST_GEN_FILES = mptcp_connect pm_nl_ctl diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 69d89b5d666f..2f207cf33661 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -57,6 +57,7 @@ static bool cfg_join; static bool cfg_remove; static unsigned int cfg_do_w; static int cfg_wait; +static uint32_t cfg_mark; /* value given with -M; 0 means no SO_MARK is applied */ static void die_usage(void) { @@ -69,6 +70,7 @@ static void die_usage(void) fprintf(stderr, "\t-p num -- use port num\n"); fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n"); fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); + fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-u -- check mptcp ulp\n"); fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n"); exit(1); @@ -140,6 +142,17 @@ static void set_sndbuf(int fd, unsigned int size) } } +static void set_mark(int fd, uint32_t mark) +{ + int err; +/* apply the mark through SO_MARK; any failure is fatal (perror, then exit(1)) */ + err = setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)); + if (err) { + perror("set SO_MARK"); + exit(1); + } +} + static int sock_listen_mptcp(const char * const listenaddr, const char * const port) { @@ -248,6 +261,9 @@ static int sock_connect_mptcp(const char * const remoteaddr, continue; } + if (cfg_mark) + set_mark(sock, cfg_mark); + if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) break; /* success */ @@ -830,7 +846,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:")) != -1) { + while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:M:")) != -1) { switch (c) { case 'j': cfg_join = true; @@ -880,6 +896,9 @@ static void parse_opts(int argc, char **argv) case 'w': cfg_wait = atoi(optarg)*1000000; break; + case 'M': + cfg_mark = 
strtoul(optarg, NULL, 0); + break; } } @@ -911,6 +930,8 @@ int main(int argc, char *argv[]) set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) set_sndbuf(fd, cfg_sndbuf); + if (cfg_mark) + set_mark(fd, cfg_mark); return main_loop_s(fd); } diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh new file mode 100755 index 000000000000..2fa13946ac04 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -0,0 +1,276 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ret=0 +sin="" +sout="" +cin="" +cout="" +ksft_skip=4 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) +mptcp_connect="" +do_all_tests=1 +# $1: netns, $2: mark; accept tcp output carrying mark $2 and DROP unmarked tcp output (unmarked RSTs excepted) +add_mark_rules() +{ + local ns=$1 + local m=$2 + + for t in iptables ip6tables; do + # just to debug: check we have multiple subflows connection requests + ip netns exec $ns $t -A OUTPUT -p tcp --syn -m mark --mark $m -j ACCEPT + + # RST packets might be handled by an internal dummy socket + ip netns exec $ns $t -A OUTPUT -p tcp --tcp-flags RST RST -m mark --mark 0 -j ACCEPT + + ip netns exec $ns $t -A OUTPUT -p tcp -m mark --mark $m -j ACCEPT + ip netns exec $ns $t -A OUTPUT -p tcp -m mark --mark 0 -j DROP + done +} +# create two netns joined by four veth pairs, register the addresses with pm_nl_ctl and install the mark rules +init() +{ + rndh=$(printf %x "$(date +%s)")-$(mktemp -u XXXXXX) + + ns1="ns1-$rndh" + ns2="ns2-$rndh" + + for netns in "$ns1" "$ns2";do + ip netns add $netns || exit $ksft_skip + ip -net $netns link set lo up + ip netns exec $netns sysctl -q net.mptcp.enabled=1 + ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 + ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 + done + + for i in $(seq 1 4); do + ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2" + ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i + ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad + ip -net "$ns1" link set ns1eth$i up + + ip -net "$ns2" addr add 10.0.$i.2/24 dev ns2eth$i + ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad + ip -net "$ns2" 
link set ns2eth$i up + + # let $ns2 reach any $ns1 address from any interface + ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i + + ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal + + ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal + ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal + done + + ip netns exec $ns1 ./pm_nl_ctl limits 8 8 + ip netns exec $ns2 ./pm_nl_ctl limits 8 8 + + add_mark_rules $ns1 1 + add_mark_rules $ns2 2 +} + +cleanup() +{ + for netns in "$ns1" "$ns2"; do + ip netns del $netns + done + rm -f "$cin" "$cout" + rm -f "$sin" "$sout" +} + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +iptables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without iptables tool" + exit $ksft_skip +fi + +ip6tables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without ip6tables tool" + exit $ksft_skip +fi +# $1: netns, $2: 4 or 6; any non-zero DROP counter in OUTPUT fails the check and flags the whole run as failed through ret, since callers do not test the return value +check_mark() +{ + local ns=$1 + local af=$2 + local tables=iptables + + if [ $af -eq 6 ];then + tables=ip6tables + fi + + counters=$(ip netns exec $ns $tables -v -L OUTPUT | grep DROP) + values=${counters%DROP*} + + for v in $values; do + if [ $v -ne 0 ]; then + echo "FAIL: got $tables $values in ns $ns , not 0 - not all expected packets marked" 1>&2 + ret=1 + return 1 + fi + done + + return 0 +} + +print_file_err() +{ + ls -l "$1" 1>&2 + echo "Trailing bytes are: " + tail -c 27 "$1" +} + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? 
-ne 0 ] ;then + echo "[ FAIL ] $what does not match (in, out):" + print_file_err "$in" + print_file_err "$out" + ret=1 + + return 1 + fi + + return 0 +} + +# $1: IP address +is_v6() +{ + [ -z "${1##*:*}" ] +} +# $1: listener netns, $2: connector netns, $3: client proto, $4: server proto, $5: address to connect to +do_transfer() +{ + listener_ns="$1" + connector_ns="$2" + cl_proto="$3" + srv_proto="$4" + connect_addr="$5" + + port=12001 + + :> "$cout" + :> "$sout" + + mptcp_connect="./mptcp_connect -r 20" + + local local_addr + if is_v6 "${connect_addr}"; then + local_addr="::" + else + local_addr="0.0.0.0" + fi + + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} \ + ${local_addr} < "$sin" > "$sout" & + spid=$! + + sleep 1 + + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} \ + $connect_addr < "$cin" > "$cout" & + + cpid=$! + + wait $cpid + retc=$? + wait $spid + rets=$? + + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " client exit code $retc, server $rets" 1>&2 + echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2 + ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port" + + echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2 + ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port" + + ret=1 + return 1 + fi + + if [ "$local_addr" = "::" ];then + check_mark $listener_ns 6 + check_mark $connector_ns 6 + else + check_mark $listener_ns 4 + check_mark $connector_ns 4 + fi + + check_transfer "$cin" "$sout" "file received by server" + + rets=$? 
+ + if [ $retc -eq 0 ] && [ $rets -eq 0 ];then + return 0 + fi + + return 1 +} + +make_file() +{ + name=$1 + who=$2 + size=$3 + + dd if=/dev/urandom of="$name" bs=1024 count=$size 2> /dev/null + echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" + + echo "Created $name (size $size KB) containing data sent by $who" +} + +run_tests() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + lret=0 + + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} + + lret=$? + + if [ $lret -ne 0 ]; then + ret=$lret + return + fi +} +# the EXIT trap is installed before init so that a failing setup still removes the netns and the temp files +sin=$(mktemp) +sout=$(mktemp) +cin=$(mktemp) +cout=$(mktemp) +trap cleanup EXIT +init +make_file "$cin" "client" 1 +make_file "$sin" "server" 1 + +run_tests $ns1 $ns2 10.0.1.1 +run_tests $ns1 $ns2 dead:beef:1::1 + + +if [ $ret -eq 0 ];then + echo "PASS: all packets had packet mark set" +fi + +exit $ret |