diff options
author | Jakub Kicinski <kuba@kernel.org> | 2020-12-14 17:30:08 -0800 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2020-12-14 17:30:09 -0800 |
commit | ebf322822cc93259316480266afeb889dd65522a (patch) | |
tree | 32ff1828e3dc23a2dadba291d8ad3ba41f131722 | |
parent | efc36d3c344a36fe73cda93c55763058629e0f2c (diff) | |
parent | 15e6ca974b14c2dc4221738ef81b23ef694c9160 (diff) |
Merge branch 'mptcp-another-set-of-miscellaneous-mptcp-fixes'
Mat Martineau says:
====================
mptcp: Another set of miscellaneous MPTCP fixes
This is another collection of MPTCP fixes and enhancements that we have
tested in the MPTCP tree:
Patch 1 cleans up cgroup attachment for in-kernel subflow sockets.
Patches 2 and 3 make sure that deletion of advertised addresses by an
MPTCP path manager when flushing all addresses behaves similarly to the
remove-single-address operation, and add related tests.
Patches 4 and 8 do some minor cleanup.
Patches 5-7 add MPTCP_FASTCLOSE functionality. Note that patch 6 adds MPTCP
option parsing to tcp_reset().
Patch 9 optimizes skb size for outgoing MPTCP packets.
====================
Link: https://lore.kernel.org/r/20201210222506.222251-1-mathew.j.martineau@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- | include/net/tcp.h | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 2 | ||||
-rw-r--r-- | net/mptcp/options.c | 17 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 21 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 47 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 6 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 34 | ||||
-rwxr-xr-x | tools/testing/selftests/net/mptcp/mptcp_join.sh | 50 |
9 files changed, 155 insertions, 37 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h index a62fb7f8a1e3..b1a05f8b35f0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -611,7 +611,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); -void tcp_reset(struct sock *sk); +void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6ad3b5c38e7..48ee476aa031 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4218,10 +4218,13 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) } /* When we get a reset we do this. */ -void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk, struct sk_buff *skb) { trace_tcp_receive_reset(sk); + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb); + /* We want the right error as BSD sees it (and indeed as we do). 
*/ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -5604,7 +5607,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, &tp->last_oow_ack_time)) tcp_send_dupack(sk, skb); } else if (tcp_reset_check(sk, skb)) { - tcp_reset(sk); + tcp_reset(sk, skb); } goto discard; } @@ -5640,7 +5643,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } if (rst_seq_match) - tcp_reset(sk); + tcp_reset(sk, skb); else { /* Disable TFO if RST is out-of-order * and no data has been received @@ -6077,7 +6080,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ if (th->rst) { - tcp_reset(sk); + tcp_reset(sk, skb); goto discard; } @@ -6519,7 +6522,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - tcp_reset(sk); + tcp_reset(sk, skb); return 1; } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 495dda2449fe..0055ae0a3bf8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -801,7 +801,7 @@ embryonic_reset: req->rsk_ops->send_reset(sk, skb); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); - tcp_reset(sk); + tcp_reset(sk, skb); } if (!fastopen) { inet_csk_reqsk_queue_drop(sk, req); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1ca60d9da3ef..5e7d7755d1a6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -282,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break; + case MPTCPOPT_MP_FASTCLOSE: + if (opsize != TCPOLEN_MPTCP_FASTCLOSE) + break; + + ptr += 2; + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->fastclose = 1; + break; + default: break; } @@ -299,6 +309,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_join = 0; 
mp_opt->add_addr = 0; mp_opt->ahmac = 0; + mp_opt->fastclose = 0; mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; @@ -942,6 +953,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; + if (mp_opt.fastclose && + msk->local_key == mp_opt.rcvr_key) { + WRITE_ONCE(msk->rcv_fastclose, true); + mptcp_schedule_work((struct sock *)msk); + } + if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { struct mptcp_addr_info addr; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5151cfcd6962..a6d983d80576 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -135,7 +135,7 @@ select_local_address(const struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, *ret = NULL; rcu_read_lock(); - spin_lock_bh(&msk->join_list_lock); + __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; @@ -144,13 +144,11 @@ select_local_address(const struct pm_nl_pernet *pernet, * pending join */ if (entry->addr.family == ((struct sock *)msk)->sk_family && - !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) { ret = entry; break; } } - spin_unlock_bh(&msk->join_list_lock); rcu_read_unlock(); return ret; } @@ -867,13 +865,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void __flush_addrs(struct pm_nl_pernet *pernet) +static void __flush_addrs(struct net *net, struct list_head *list) { - while (!list_empty(&pernet->local_addr_list)) { + while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; - cur = list_entry(pernet->local_addr_list.next, + cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); + mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr); 
list_del_rcu(&cur->list); kfree_rcu(cur, rcu); } @@ -890,11 +889,13 @@ static void __reset_counters(struct pm_nl_pernet *pernet) static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - __flush_addrs(pernet); + list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); spin_unlock_bh(&pernet->lock); + __flush_addrs(sock_net(skb->sk), &free_list); return 0; } @@ -1156,10 +1157,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct net *net; list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + /* net is removed from namespace list, can't race with * other modifiers */ - __flush_addrs(net_generic(net, pm_nl_pernet_id)); + __flush_addrs(net, &pernet->local_addr_list); } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2540d82742ac..b812aaae8044 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1256,6 +1256,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_ext *mpext = NULL; struct sk_buff *skb, *tail; bool can_collapse = false; + int size_bias = 0; int avail_size; size_t ret = 0; @@ -1277,10 +1278,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext = skb_ext_find(skb, SKB_EXT_MPTCP); can_collapse = (info->size_goal - skb->len > 0) && mptcp_skb_can_collapse_to(data_seq, skb, mpext); - if (!can_collapse) + if (!can_collapse) { TCP_SKB_CB(skb)->eor = 1; - else + } else { + size_bias = skb->len; avail_size = info->size_goal - skb->len; + } } /* Zero window and all data acked? Probe. 
*/ @@ -1300,8 +1303,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return 0; ret = info->limit - info->sent; - tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page, - dfrag->offset + info->sent, &ret); + tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, + dfrag->page, dfrag->offset + info->sent, &ret); if (!tail) { tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); return -ENOMEM; @@ -1310,8 +1313,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, /* if the tail skb is still the cached one, collapsing really happened. */ if (skb == tail) { - WARN_ON_ONCE(!can_collapse); + TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += ret; + WARN_ON_ONCE(!can_collapse); WARN_ON_ONCE(zero_window_probe); goto out; } @@ -2217,6 +2221,36 @@ static bool mptcp_check_close_timeout(const struct sock *sk) return true; } +static void mptcp_check_fastclose(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = &msk->sk.icsk_inet.sk; + + if (likely(!READ_ONCE(msk->rcv_fastclose))) + return; + + mptcp_token_destroy(msk); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(tcp_sk); + if (tcp_sk->sk_state != TCP_CLOSE) { + tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + tcp_set_state(tcp_sk, TCP_CLOSE); + } + release_sock(tcp_sk); + } + + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_shutdown = SHUTDOWN_MASK; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); + + mptcp_close_wake_up(sk); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -2233,6 +2267,9 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + + mptcp_check_fastclose(msk); + if 
(test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f6c3c686a34a..7cf9d110b85f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -23,6 +23,7 @@ #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) +#define OPTION_MPTCP_FASTCLOSE BIT(9) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -58,6 +59,7 @@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 #define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_FASTCLOSE 12 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -110,6 +112,7 @@ struct mptcp_options_received { u16 data_len; u16 mp_capable : 1, mp_join : 1, + fastclose : 1, dss : 1, add_addr : 1, rm_addr : 1, @@ -119,7 +122,7 @@ struct mptcp_options_received { u32 token; u32 nonce; u64 thmac; - u8 hmac[20]; + u8 hmac[MPTCPOPT_HMAC_LEN]; u8 join_id; u8 use_map:1, dsn64:1, @@ -237,6 +240,7 @@ struct mptcp_sock { bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; struct sock *ack_hint; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fefcaf497938..73e66a406d99 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -313,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* must hold: tcp_done() could drop last reference on parent */ + sock_hold(sk); + tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + return; /* worker will put sk for us */ + + sock_put(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) @@ 
-1167,6 +1172,30 @@ failed: return err; } +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, + *child_skcd = &child->sk_cgrp_data; + + /* only the additional subflows created by kworkers have to be modified */ + if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != + cgroup_id(sock_cgroup_ptr(child_skcd))) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = parent->sk_memcg; + + mem_cgroup_sk_free(child); + if (memcg && css_tryget(&memcg->css)) + child->sk_memcg = memcg; +#endif /* CONFIG_MEMCG */ + + cgroup_sk_free(child_skcd); + *child_skcd = *parent_skcd; + cgroup_sk_clone(child_skcd); + } +#endif /* CONFIG_SOCK_CGROUP_DATA */ +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -1187,6 +1216,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) lock_sock(sf->sk); + /* the newly created socket has to be in the same cgroup as its parent */ + mptcp_attach_cgroup(sk, sf->sk); + /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. */ diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 0eae628d1ffd..9aa9624cff97 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -264,27 +264,37 @@ do_transfer() cpid=$! 
if [ $rm_nr_ns1 -gt 0 ]; then - counter=1 - sleep 1 + if [ $rm_nr_ns1 -lt 8 ]; then + counter=1 + sleep 1 - while [ $counter -le $rm_nr_ns1 ] - do - ip netns exec ${listener_ns} ./pm_nl_ctl del $counter + while [ $counter -le $rm_nr_ns1 ] + do + ip netns exec ${listener_ns} ./pm_nl_ctl del $counter + sleep 1 + let counter+=1 + done + else sleep 1 - let counter+=1 - done + ip netns exec ${listener_ns} ./pm_nl_ctl flush + fi fi if [ $rm_nr_ns2 -gt 0 ]; then - counter=1 - sleep 1 + if [ $rm_nr_ns2 -lt 8 ]; then + counter=1 + sleep 1 - while [ $counter -le $rm_nr_ns2 ] - do - ip netns exec ${connector_ns} ./pm_nl_ctl del $counter + while [ $counter -le $rm_nr_ns2 ] + do + ip netns exec ${connector_ns} ./pm_nl_ctl del $counter + sleep 1 + let counter+=1 + done + else sleep 1 - let counter+=1 - done + ip netns exec ${connector_ns} ./pm_nl_ctl flush + fi fi wait $cpid @@ -663,6 +673,18 @@ chk_join_nr "remove subflows and signal" 3 3 3 chk_add_nr 1 1 chk_rm_nr 2 2 +# subflows and signal, flush +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 3 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 3 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 0 8 8 slow +chk_join_nr "flush subflows and signal" 3 3 3 +chk_add_nr 1 1 +chk_rm_nr 2 2 + # subflow IPv6 reset ip netns exec $ns1 ./pm_nl_ctl limits 0 1 |