summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2020-12-14 17:30:08 -0800
committerJakub Kicinski <kuba@kernel.org>2020-12-14 17:30:09 -0800
commitebf322822cc93259316480266afeb889dd65522a (patch)
tree32ff1828e3dc23a2dadba291d8ad3ba41f131722
parentefc36d3c344a36fe73cda93c55763058629e0f2c (diff)
parent15e6ca974b14c2dc4221738ef81b23ef694c9160 (diff)
Merge branch 'mptcp-another-set-of-miscellaneous-mptcp-fixes'
Mat Martineau says: ==================== mptcp: Another set of miscellaneous MPTCP fixes This is another collection of MPTCP fixes and enhancements that we have tested in the MPTCP tree: Patch 1 cleans up cgroup attachment for in-kernel subflow sockets. Patches 2 and 3 make sure that deletion of advertised addresses by an MPTCP path manager when flushing all addresses behaves similarly to the remove-single-address operation, and adds related tests. Patches 4 and 8 do some minor cleanup. Patches 5-7 add MPTCP_FASTCLOSE functionality. Note that patch 6 adds MPTCP option parsing to tcp_reset(). Patch 9 optimizes skb size for outgoing MPTCP packets. ==================== Link: https://lore.kernel.org/r/20201210222506.222251-1-mathew.j.martineau@linux.intel.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--include/net/tcp.h2
-rw-r--r--net/ipv4/tcp_input.c13
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/mptcp/options.c17
-rw-r--r--net/mptcp/pm_netlink.c21
-rw-r--r--net/mptcp/protocol.c47
-rw-r--r--net/mptcp/protocol.h6
-rw-r--r--net/mptcp/subflow.c34
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_join.sh50
9 files changed, 155 insertions, 37 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a62fb7f8a1e3..b1a05f8b35f0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -611,7 +611,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
/* tcp_input.c */
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
-void tcp_reset(struct sock *sk);
+void tcp_reset(struct sock *sk, struct sk_buff *skb);
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
void tcp_fin(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d6ad3b5c38e7..48ee476aa031 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4218,10 +4218,13 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
}
/* When we get a reset we do this. */
-void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
trace_tcp_receive_reset(sk);
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb);
+
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
@@ -5604,7 +5607,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
} else if (tcp_reset_check(sk, skb)) {
- tcp_reset(sk);
+ tcp_reset(sk, skb);
}
goto discard;
}
@@ -5640,7 +5643,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
}
if (rst_seq_match)
- tcp_reset(sk);
+ tcp_reset(sk, skb);
else {
/* Disable TFO if RST is out-of-order
* and no data has been received
@@ -6077,7 +6080,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (th->rst) {
- tcp_reset(sk);
+ tcp_reset(sk, skb);
goto discard;
}
@@ -6519,7 +6522,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
- tcp_reset(sk);
+ tcp_reset(sk, skb);
return 1;
}
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 495dda2449fe..0055ae0a3bf8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -801,7 +801,7 @@ embryonic_reset:
req->rsk_ops->send_reset(sk, skb);
} else if (fastopen) { /* received a valid RST pkt */
reqsk_fastopen_remove(sk, req, true);
- tcp_reset(sk);
+ tcp_reset(sk, skb);
}
if (!fastopen) {
inet_csk_reqsk_queue_drop(sk, req);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 1ca60d9da3ef..5e7d7755d1a6 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -282,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb,
pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
break;
+ case MPTCPOPT_MP_FASTCLOSE:
+ if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
+ break;
+
+ ptr += 2;
+ mp_opt->rcvr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ mp_opt->fastclose = 1;
+ break;
+
default:
break;
}
@@ -299,6 +309,7 @@ void mptcp_get_options(const struct sk_buff *skb,
mp_opt->mp_join = 0;
mp_opt->add_addr = 0;
mp_opt->ahmac = 0;
+ mp_opt->fastclose = 0;
mp_opt->port = 0;
mp_opt->rm_addr = 0;
mp_opt->dss = 0;
@@ -942,6 +953,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
return;
+ if (mp_opt.fastclose &&
+ msk->local_key == mp_opt.rcvr_key) {
+ WRITE_ONCE(msk->rcv_fastclose, true);
+ mptcp_schedule_work((struct sock *)msk);
+ }
+
if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
struct mptcp_addr_info addr;
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 5151cfcd6962..a6d983d80576 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -135,7 +135,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
struct mptcp_pm_addr_entry *entry, *ret = NULL;
rcu_read_lock();
- spin_lock_bh(&msk->join_list_lock);
+ __mptcp_flush_join_list(msk);
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
@@ -144,13 +144,11 @@ select_local_address(const struct pm_nl_pernet *pernet,
* pending join
*/
if (entry->addr.family == ((struct sock *)msk)->sk_family &&
- !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
- !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) {
+ !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
ret = entry;
break;
}
}
- spin_unlock_bh(&msk->join_list_lock);
rcu_read_unlock();
return ret;
}
@@ -867,13 +865,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
return ret;
}
-static void __flush_addrs(struct pm_nl_pernet *pernet)
+static void __flush_addrs(struct net *net, struct list_head *list)
{
- while (!list_empty(&pernet->local_addr_list)) {
+ while (!list_empty(list)) {
struct mptcp_pm_addr_entry *cur;
- cur = list_entry(pernet->local_addr_list.next,
+ cur = list_entry(list->next,
struct mptcp_pm_addr_entry, list);
+ mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr);
list_del_rcu(&cur->list);
kfree_rcu(cur, rcu);
}
@@ -890,11 +889,13 @@ static void __reset_counters(struct pm_nl_pernet *pernet)
static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ LIST_HEAD(free_list);
spin_lock_bh(&pernet->lock);
- __flush_addrs(pernet);
+ list_splice_init(&pernet->local_addr_list, &free_list);
__reset_counters(pernet);
spin_unlock_bh(&pernet->lock);
+ __flush_addrs(sock_net(skb->sk), &free_list);
return 0;
}
@@ -1156,10 +1157,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list)
struct net *net;
list_for_each_entry(net, net_list, exit_list) {
+ struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id);
+
/* net is removed from namespace list, can't race with
* other modifiers
*/
- __flush_addrs(net_generic(net, pm_nl_pernet_id));
+ __flush_addrs(net, &pernet->local_addr_list);
}
}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 2540d82742ac..b812aaae8044 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1256,6 +1256,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_ext *mpext = NULL;
struct sk_buff *skb, *tail;
bool can_collapse = false;
+ int size_bias = 0;
int avail_size;
size_t ret = 0;
@@ -1277,10 +1278,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
can_collapse = (info->size_goal - skb->len > 0) &&
mptcp_skb_can_collapse_to(data_seq, skb, mpext);
- if (!can_collapse)
+ if (!can_collapse) {
TCP_SKB_CB(skb)->eor = 1;
- else
+ } else {
+ size_bias = skb->len;
avail_size = info->size_goal - skb->len;
+ }
}
/* Zero window and all data acked? Probe. */
@@ -1300,8 +1303,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
return 0;
ret = info->limit - info->sent;
- tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page,
- dfrag->offset + info->sent, &ret);
+ tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
+ dfrag->page, dfrag->offset + info->sent, &ret);
if (!tail) {
tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
return -ENOMEM;
@@ -1310,8 +1313,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
/* if the tail skb is still the cached one, collapsing really happened.
*/
if (skb == tail) {
- WARN_ON_ONCE(!can_collapse);
+ TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
mpext->data_len += ret;
+ WARN_ON_ONCE(!can_collapse);
WARN_ON_ONCE(zero_window_probe);
goto out;
}
@@ -2217,6 +2221,36 @@ static bool mptcp_check_close_timeout(const struct sock *sk)
return true;
}
+static void mptcp_check_fastclose(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = &msk->sk.icsk_inet.sk;
+
+ if (likely(!READ_ONCE(msk->rcv_fastclose)))
+ return;
+
+ mptcp_token_destroy(msk);
+
+ list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(tcp_sk);
+ if (tcp_sk->sk_state != TCP_CLOSE) {
+ tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
+ tcp_set_state(tcp_sk, TCP_CLOSE);
+ }
+ release_sock(tcp_sk);
+ }
+
+ inet_sk_state_store(sk, TCP_CLOSE);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
+
+ mptcp_close_wake_up(sk);
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -2233,6 +2267,9 @@ static void mptcp_worker(struct work_struct *work)
mptcp_check_data_fin_ack(sk);
__mptcp_flush_join_list(msk);
+
+ mptcp_check_fastclose(msk);
+
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
__mptcp_close_subflow(msk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index f6c3c686a34a..7cf9d110b85f 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -23,6 +23,7 @@
#define OPTION_MPTCP_ADD_ADDR BIT(6)
#define OPTION_MPTCP_ADD_ADDR6 BIT(7)
#define OPTION_MPTCP_RM_ADDR BIT(8)
+#define OPTION_MPTCP_FASTCLOSE BIT(9)
/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0
@@ -58,6 +59,7 @@
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24
#define TCPOLEN_MPTCP_PORT_LEN 4
#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
+#define TCPOLEN_MPTCP_FASTCLOSE 12
/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP BIT(0)
@@ -110,6 +112,7 @@ struct mptcp_options_received {
u16 data_len;
u16 mp_capable : 1,
mp_join : 1,
+ fastclose : 1,
dss : 1,
add_addr : 1,
rm_addr : 1,
@@ -119,7 +122,7 @@ struct mptcp_options_received {
u32 token;
u32 nonce;
u64 thmac;
- u8 hmac[20];
+ u8 hmac[MPTCPOPT_HMAC_LEN];
u8 join_id;
u8 use_map:1,
dsn64:1,
@@ -237,6 +240,7 @@ struct mptcp_sock {
bool fully_established;
bool rcv_data_fin;
bool snd_data_fin_enable;
+ bool rcv_fastclose;
bool use_64bit_ack; /* Set when we received a 64-bit DSN */
spinlock_t join_list_lock;
struct sock *ack_hint;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index fefcaf497938..73e66a406d99 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -313,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = subflow->conn;
+ /* must hold: tcp_done() could drop last reference on parent */
+ sock_hold(sk);
+
tcp_set_state(ssk, TCP_CLOSE);
tcp_send_active_reset(ssk, GFP_ATOMIC);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
schedule_work(&mptcp_sk(sk)->work))
- sock_hold(sk);
+ return; /* worker will put sk for us */
+
+ sock_put(sk);
}
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
@@ -1167,6 +1172,30 @@ failed:
return err;
}
+static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
+{
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
+ *child_skcd = &child->sk_cgrp_data;
+
+ /* only the additional subflows created by kworkers have to be modified */
+ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
+ cgroup_id(sock_cgroup_ptr(child_skcd))) {
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = parent->sk_memcg;
+
+ mem_cgroup_sk_free(child);
+ if (memcg && css_tryget(&memcg->css))
+ child->sk_memcg = memcg;
+#endif /* CONFIG_MEMCG */
+
+ cgroup_sk_free(child_skcd);
+ *child_skcd = *parent_skcd;
+ cgroup_sk_clone(child_skcd);
+ }
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+}
+
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
struct mptcp_subflow_context *subflow;
@@ -1187,6 +1216,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
lock_sock(sf->sk);
+ /* the newly created socket has to be in the same cgroup as its parent */
+ mptcp_attach_cgroup(sk, sf->sk);
+
/* kernel sockets do not by default acquire net ref, but TCP timer
* needs it.
*/
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 0eae628d1ffd..9aa9624cff97 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -264,27 +264,37 @@ do_transfer()
cpid=$!
if [ $rm_nr_ns1 -gt 0 ]; then
- counter=1
- sleep 1
+ if [ $rm_nr_ns1 -lt 8 ]; then
+ counter=1
+ sleep 1
- while [ $counter -le $rm_nr_ns1 ]
- do
- ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+ while [ $counter -le $rm_nr_ns1 ]
+ do
+ ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+ sleep 1
+ let counter+=1
+ done
+ else
sleep 1
- let counter+=1
- done
+ ip netns exec ${listener_ns} ./pm_nl_ctl flush
+ fi
fi
if [ $rm_nr_ns2 -gt 0 ]; then
- counter=1
- sleep 1
+ if [ $rm_nr_ns2 -lt 8 ]; then
+ counter=1
+ sleep 1
- while [ $counter -le $rm_nr_ns2 ]
- do
- ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+ while [ $counter -le $rm_nr_ns2 ]
+ do
+ ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+ sleep 1
+ let counter+=1
+ done
+ else
sleep 1
- let counter+=1
- done
+ ip netns exec ${connector_ns} ./pm_nl_ctl flush
+ fi
fi
wait $cpid
@@ -663,6 +673,18 @@ chk_join_nr "remove subflows and signal" 3 3 3
chk_add_nr 1 1
chk_rm_nr 2 2
+# subflows and signal, flush
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1 0 8 8 slow
+chk_join_nr "flush subflows and signal" 3 3 3
+chk_add_nr 1 1
+chk_rm_nr 2 2
+
# subflow IPv6
reset
ip netns exec $ns1 ./pm_nl_ctl limits 0 1