diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/icmp.c | 4 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 18 | ||||
-rw-r--r-- | net/ipv4/ip_vti.c | 1 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 1 | ||||
-rw-r--r-- | net/ipv4/ping.c | 3 | ||||
-rw-r--r-- | net/ipv4/route.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 17 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 20 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 21 | ||||
-rw-r--r-- | net/ipv4/udp.c | 17 |
13 files changed, 85 insertions, 36 deletions
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 956a806649f7..e30515f89802 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ipcm_init(&ipc); inet->tos = ip_hdr(skb)->tos; - sk->sk_mark = mark; + ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); @@ -710,10 +710,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; - sk->sk_mark = mark; ipcm_init(&ipc); ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; + ipc.sockc.mark = mark; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 090d3097ee15..17206677d503 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1702,7 +1702,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = sysctl_wmem_default; - sk->sk_mark = fl4.flowi4_mark; + ipc.sockc.mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if (unlikely(err)) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 181b7a2a0247..f8b419e2475c 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -844,3 +844,21 @@ void ip_tunnel_unneed_metadata(void) static_branch_dec(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); + +/* Returns either the correct skb->protocol value, or 0 if invalid. */ +__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb) +{ + if (skb_network_header(skb) >= skb->head && + (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) && + ip_hdr(skb)->version == 4) + return htons(ETH_P_IP); + if (skb_network_header(skb) >= skb->head && + (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) && + ipv6_hdr(skb)->version == 6) + return htons(ETH_P_IPV6); + return 0; +} +EXPORT_SYMBOL(ip_tunnel_parse_protocol); + +const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol }; +EXPORT_SYMBOL(ip_tunnel_header_ops); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1d9c8cff5ac3..460ca1099e8a 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -441,6 +441,7 @@ static const struct net_device_ops vti_netdev_ops = { static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 40fea52c8277..75d35e76bec2 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -361,6 +361,7 @@ static const struct net_device_ops ipip_netdev_ops = { static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 535427292194..df6fbefe44d4 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -786,6 +786,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); + fl4.fl4_icmp_type = user_icmph.type; + fl4.fl4_icmp_code = user_icmph.code; + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1d7076b78e63..a01efa062f6b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2027,7 +2027,7 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, const struct sk_buff *hint) { struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rt = (struct rtable *)hint; + struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); int err = -EINVAL; u32 tag = 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 810cc164f795..6f0caf9a866d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2691,6 +2691,9 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); @@ -3246,10 +3249,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: - if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) - err = tp->af_specific->md5_parse(sk, optname, optval, optlen); - else - err = -EINVAL; + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_USER_TIMEOUT: @@ -4033,11 +4033,14 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { + u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; - sg_init_one(&sg, key->key, key->keylen); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen); - return crypto_ahash_update(hp->md5_req); + sg_init_one(&sg, key->key, keylen); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); + + /* We use data_race() because tcp_md5_do_add() might change key->key under us */ + return data_race(crypto_ahash_update(hp->md5_req)); } EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 3172e31987be..62878cf26d9c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -197,7 +197,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); - if (sk->sk_state != TCP_CLOSE) + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3a0eb139b76..518f04355fbf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3488,10 +3488,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) } } -/* This routine deals with acks during a TLP episode. - * We mark the end of a TLP episode on receiving TLP dupack or when - * ack is after tlp_high_seq. - * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. +/* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { @@ -3500,7 +3498,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (before(ack, tp->tlp_high_seq)) return; - if (flag & FLAG_DSACKING_ACK) { + if (!tp->tlp_retrans) { + /* TLP of new data has been acknowledged */ + tp->tlp_high_seq = 0; + } else if (flag & FLAG_DSACKING_ACK) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { @@ -4582,6 +4583,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); + sk->sk_data_ready(sk); tcp_drop(sk, skb); return; } @@ -4828,6 +4830,7 @@ queue_and_out: sk_forced_mem_schedule(sk, skb->truesize); else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + sk->sk_data_ready(sk); goto drop; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad6435ba6d72..04bfcbbfee83 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1111,9 +1111,21 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (key) { - /* Pre-existing entry - just update that one. */ - memcpy(key->key, newkey, newkeylen); - key->keylen = newkeylen; + /* Pre-existing entry - just update that one. + * Note that the key might be used concurrently. + * data_race() is telling kcsan that we do not care of + * key mismatches, since changing MD5 key on live flows + * can lead to packet drops. + */ + data_race(memcpy(key->key, newkey, newkeylen)); + + /* Pairs with READ_ONCE() in tcp_md5_hash_key(). + * Also note that a reader could catch new key->keylen value + * but old key->key[], this is the reason we use __GFP_ZERO + * at sock_kmalloc() time below these lines. + */ + WRITE_ONCE(key->keylen, newkeylen); + return 0; } @@ -1129,7 +1141,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, rcu_assign_pointer(tp->md5sig_info, md5sig); } - key = sock_kmalloc(sk, sizeof(*key), gfp); + key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); if (!key) return -ENOMEM; if (!tcp_alloc_md5sig_pool()) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a50e1990a845..0bc05d68cd74 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -700,7 +700,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -715,7 +716,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. */ - ireq->tstamp_ok &= !ireq->sack_ok; + if (synack_type != TCP_SYNACK_COOKIE) + ireq->tstamp_ok &= !ireq->sack_ok; } #endif @@ -2622,6 +2624,11 @@ void tcp_send_loss_probe(struct sock *sk) int pcount; int mss = tcp_current_mss(sk); + /* At most one outstanding TLP */ + if (tp->tlp_high_seq) + goto rearm_timer; + + tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; @@ -2639,10 +2646,6 @@ void tcp_send_loss_probe(struct sock *sk) return; } - /* At most one outstanding TLP retransmission. */ - if (tp->tlp_high_seq) - goto rearm_timer; - if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; @@ -2664,10 +2667,12 @@ void tcp_send_loss_probe(struct sock *sk) if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; + tp->tlp_retrans = 1; + +probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; -probe_sent: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; @@ -3394,7 +3399,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc) + sizeof(*th); + foc, synack_type) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1b7ebbcae497..4077d589b72e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -416,7 +416,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { - struct sock *sk, *result; + struct sock *sk, *result, *reuseport_result; int score, badness; u32 hash = 0; @@ -426,17 +426,20 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { + reuseport_result = NULL; + if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (result && !reuseport_has_conns(sk, false)) - return result; + reuseport_result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (reuseport_result && !reuseport_has_conns(sk, false)) + return reuseport_result; } + + result = reuseport_result ? : sk; badness = score; - result = sk; } } return result; @@ -2051,7 +2054,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets */ - if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { /* * MIB statistics other than incrementing the error count are |