Diffstat (limited to 'net/ipv4')
39 files changed, 593 insertions, 161 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2f94d221c00e..54648181dd56 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -318,7 +318,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); - err = -ENOBUFS; + err = -ENOMEM; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; @@ -1720,7 +1720,6 @@ EXPORT_SYMBOL_GPL(snmp_fold_field64); #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, - .netns_ok = 1, }; #endif @@ -1733,7 +1732,6 @@ static struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, - .netns_ok = 1, .icmp_strict_tag_validation = 1, }; @@ -1746,14 +1744,12 @@ static struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, - .netns_ok = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index fab0958c41be..6eea1e9e998d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -555,7 +555,6 @@ static int ah4_rcv_cb(struct sk_buff *skb, int err) static const struct xfrm_type ah_type = { - .description = "AH4", .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index e0480c6cebaa..099259fc826a 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -187,8 +187,7 @@ static int __init cipso_v4_cache_init(void) * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: - * Invalidates and frees any entries in the CIPSO cache. Returns zero on - * success and negative values on failure. + * Invalidates and frees any entries in the CIPSO cache. 
* */ void cipso_v4_cache_invalidate(void) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 1c6429c353a9..73721a4448bd 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1955,7 +1955,7 @@ static int inet_validate_link_af(const struct net_device *dev, struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rcu(dev)) + if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, @@ -1981,7 +1981,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { - struct in_device *in_dev = __in_dev_get_rcu(dev); + struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8e3b445a8c21..a09e36c4a413 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -673,7 +673,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -1199,7 +1199,6 @@ static int esp4_rcv_cb(struct sk_buff *skb, int err) static const struct xfrm_type esp_type = { - .description = "ESP4", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .flags = XFRM_TYPE_REPLAY_PROT, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 33687cf58286..8e4e9aa12130 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -33,12 +33,11 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, struct xfrm_state *x; __be32 seq; __be32 spi; - int err; if (!pskb_pull(skb, offset)) return NULL; - if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) + if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); @@ -343,7 +342,6 @@ static const struct net_offload esp4_offload = { }; static const struct xfrm_type_offload esp_type_offload = { - .description = "ESP4 OFFLOAD", .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp_input_tail, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 84bb707bd88d..a933bd6345b1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -371,6 +371,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; + } else { + swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) @@ -1122,10 +1124,8 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); - /* Add network specific broadcasts, when it takes a sense */ + /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, - prim, 0); fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); } @@ -1516,6 +1516,12 @@ static int __net_init ip_fib_net_init(struct net *net) if (err) return err; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* Default to 3-tuple */ + net->ipv4.sysctl_fib_multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; +#endif + /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b58db1ca4bfb..e184bcb19943 100644 --- 
a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -25,7 +25,7 @@ struct fib_alias { #define FA_S_ACCESSED 0x01 -/* Dont write on fa_state unless needed, to keep it shared on all cpus */ +/* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { if (!(fa->fa_state & FA_S_ACCESSED)) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5d1e6fe9d838..cbb2b4bb0dfa 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -195,7 +195,6 @@ static int gre_err(struct sk_buff *skb, u32 info) static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, - .netns_ok = 1, }; static int __init gre_init(void) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 752e392083e6..c695d294a5df 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -993,14 +993,8 @@ static bool icmp_redirect(struct sk_buff *skb) static bool icmp_echo(struct sk_buff *skb) { - struct icmp_ext_hdr *ext_hdr, _ext_hdr; - struct icmp_ext_echo_iio *iio, _iio; struct icmp_bxm icmp_param; - struct net_device *dev; - char buff[IFNAMSIZ]; struct net *net; - u16 ident_len; - u8 status; net = dev_net(skb_dst(skb)->dev); /* should there be an ICMP stat for ignored echos? */ @@ -1013,20 +1007,46 @@ static bool icmp_echo(struct sk_buff *skb) icmp_param.data_len = skb->len; icmp_param.head_len = sizeof(struct icmphdr); - if (icmp_param.data.icmph.type == ICMP_ECHO) { + if (icmp_param.data.icmph.type == ICMP_ECHO) icmp_param.data.icmph.type = ICMP_ECHOREPLY; - goto send_reply; - } - if (!net->ipv4.sysctl_icmp_echo_enable_probe) + else if (!icmp_build_probe(skb, &icmp_param.data.icmph)) return true; + + icmp_reply(&icmp_param, skb); + return true; +} + +/* Helper for icmp_echo and icmpv6_echo_reply. + * Searches for net_device that matches PROBE interface identifier + * and builds PROBE reply message in icmphdr. + * + * Returns false if PROBE responses are disabled via sysctl + */ + +bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) +{ + struct icmp_ext_hdr *ext_hdr, _ext_hdr; + struct icmp_ext_echo_iio *iio, _iio; + struct net *net = dev_net(skb->dev); + struct net_device *dev; + char buff[IFNAMSIZ]; + u16 ident_len; + u8 status; + + if (!net->ipv4.sysctl_icmp_echo_enable_probe) + return false; + /* We currently only support probing interfaces on the proxy node * Check to ensure L-bit is set */ - if (!(ntohs(icmp_param.data.icmph.un.echo.sequence) & 1)) - return true; + if (!(ntohs(icmphdr->un.echo.sequence) & 1)) + return false; /* Clear status bits in reply message */ - icmp_param.data.icmph.un.echo.sequence &= htons(0xFF00); - icmp_param.data.icmph.type = ICMP_EXT_ECHOREPLY; + icmphdr->un.echo.sequence &= htons(0xFF00); + if (icmphdr->type == ICMP_EXT_ECHO) + icmphdr->type = ICMP_EXT_ECHOREPLY; + else + icmphdr->type = ICMPV6_EXT_ECHO_REPLY; ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr); /* Size of iio is class_type dependent. 
* Only check header here and assign length based on ctype in the switch statement @@ -1066,7 +1086,7 @@ static bool icmp_echo(struct sk_buff *skb) if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + sizeof(struct in_addr)) goto send_mal_query; - dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr.s_addr); + dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: @@ -1087,8 +1107,8 @@ static bool icmp_echo(struct sk_buff *skb) goto send_mal_query; } if (!dev) { - icmp_param.data.icmph.code = ICMP_EXT_CODE_NO_IF; - goto send_reply; + icmphdr->code = ICMP_EXT_CODE_NO_IF; + return true; } /* Fill bits in reply message */ if (dev->flags & IFF_UP) @@ -1098,14 +1118,13 @@ static bool icmp_echo(struct sk_buff *skb) if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list)) status |= ICMP_EXT_ECHOREPLY_IPV6; dev_put(dev); - icmp_param.data.icmph.un.echo.sequence |= htons(status); -send_reply: - icmp_reply(&icmp_param, skb); - return true; + icmphdr->un.echo.sequence |= htons(status); + return true; send_mal_query: - icmp_param.data.icmph.code = ICMP_EXT_CODE_MAL_QUERY; - goto send_reply; + icmphdr->code = ICMP_EXT_CODE_MAL_QUERY; + return true; } +EXPORT_SYMBOL_GPL(icmp_build_probe); /* * Handle ICMP Timestamp requests. diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fd472eae4f5c..754013fa393b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -135,10 +135,18 @@ static int inet_csk_bind_conflict(const struct sock *sk, bool relax, bool reuseport_ok) { struct sock *sk2; + bool reuseport_cb_ok; bool reuse = sk->sk_reuse; bool reuseport = !!sk->sk_reuseport; + struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + rcu_read_lock(); + reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); + /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ + reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); + rcu_read_unlock(); + /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed @@ -156,14 +164,14 @@ static int inet_csk_bind_conflict(const struct sock *sk, if ((!relax || (!reuseport_ok && reuseport && sk2->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && + reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(uid, sock_i_uid(sk2))))) && inet_rcv_saddr_equal(sk, sk2, true)) break; } else if (!reuseport_ok || !reuseport || !sk2->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || + !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2)))) { if (inet_rcv_saddr_equal(sk, sk2, true)) @@ -687,6 +695,66 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +static struct request_sock *inet_reqsk_clone(struct request_sock *req, + struct sock *sk) +{ + struct sock *req_sk, *nreq_sk; + struct request_sock *nreq; + + nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); + if (!nreq) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + + /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ + sock_put(sk); + return NULL; + } + + req_sk = req_to_sk(req); + nreq_sk = req_to_sk(nreq); + + memcpy(nreq_sk, req_sk, + offsetof(struct sock, sk_dontcopy_begin)); + memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, + req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); + + 
sk_node_init(&nreq_sk->sk_node); + nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; +#ifdef CONFIG_XPS + nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; +#endif + nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; + + nreq->rsk_listener = sk; + + /* We need not acquire fastopenq->lock + * because the child socket is locked in inet_csk_listen_stop(). + */ + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) + rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); + + return nreq; +} + +static void reqsk_queue_migrated(struct request_sock_queue *queue, + const struct request_sock *req) +{ + if (req->num_timeout == 0) + atomic_inc(&queue->young); + atomic_inc(&queue->qlen); +} + +static void reqsk_migrate_reset(struct request_sock *req) +{ + req->saved_syn = NULL; +#if IS_ENABLED(CONFIG_IPV6) + inet_rsk(req)->ipv6_opt = NULL; + inet_rsk(req)->pktopts = NULL; +#else + inet_rsk(req)->ireq_opt = NULL; +#endif +} + /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { @@ -727,15 +795,39 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); + struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; - struct net *net = sock_net(sk_listener); - struct inet_connection_sock *icsk = inet_csk(sk_listener); - struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct inet_connection_sock *icsk; + struct request_sock_queue *queue; + struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; - if (inet_sk_state_load(sk_listener) != TCP_LISTEN) - goto drop; + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { + struct sock *nsk; + + nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); + if (!nsk) + goto drop; + nreq = inet_reqsk_clone(req, nsk); + if (!nreq) + goto drop; + + /* The new timer for the cloned req can decrease the 2 + * by calling inet_csk_reqsk_queue_drop_and_put(), so + * hold another count to prevent use-after-free and + * call reqsk_put() just before return. + */ + refcount_set(&nreq->rsk_refcnt, 2 + 1); + timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); + reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); + + req = nreq; + sk_listener = nsk; + } + + icsk = inet_csk(sk_listener); + net = sock_net(sk_listener); max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. @@ -754,6 +846,7 @@ static void reqsk_timer_handler(struct timer_list *t) * embrions; and abort old ones without pity, if old * ones are about to clog our table. 
*/ + queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; @@ -778,10 +871,39 @@ static void reqsk_timer_handler(struct timer_list *t) atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer(&req->rsk_timer, jiffies + timeo); + + if (!nreq) + return; + + if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { + /* delete timer */ + inet_csk_reqsk_queue_drop(sk_listener, nreq); + goto no_ownership; + } + + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(oreq); + reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); + reqsk_put(oreq); + + reqsk_put(nreq); return; } + + /* Even if we can clone the req, we may need not retransmit any more + * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another + * CPU may win the "own_req" race so that inet_ehash_insert() fails. + */ + if (nreq) { + __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); +no_ownership: + reqsk_migrate_reset(nreq); + reqsk_queue_removed(queue, nreq); + __reqsk_free(nreq); + } + drop: - inet_csk_reqsk_queue_drop_and_put(sk_listener, req); + inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, @@ -997,12 +1119,42 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { - inet_csk_reqsk_queue_drop(sk, req); - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - if (inet_csk_reqsk_queue_add(sk, req, child)) + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + + if (sk != req->rsk_listener) { + /* another listening sk has been selected, + * migrate the req to it. + */ + struct request_sock *nreq; + + /* hold a refcnt for the nreq->rsk_listener + * which is assigned in inet_reqsk_clone() + */ + sock_hold(sk); + nreq = inet_reqsk_clone(req, sk); + if (!nreq) { + inet_child_forget(sk, req, child); + goto child_put; + } + + refcount_set(&nreq->rsk_refcnt, 1); + if (inet_csk_reqsk_queue_add(sk, nreq, child)) { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(req); + reqsk_put(req); + return child; + } + + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; + } } /* Too bad, another child took ownership of the request, undo. */ +child_put: bh_unlock_sock(child); sock_put(child); return NULL; @@ -1028,14 +1180,40 @@ void inet_csk_listen_stop(struct sock *sk) * of the variants now. 
--ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { - struct sock *child = req->sk; + struct sock *child = req->sk, *nsk; + struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); + nsk = reuseport_migrate_sock(sk, child, NULL); + if (nsk) { + nreq = inet_reqsk_clone(req, nsk); + if (nreq) { + refcount_set(&nreq->rsk_refcnt, 1); + + if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQSUCCESS); + reqsk_migrate_reset(req); + } else { + __NET_INC_STATS(sock_net(nsk), + LINUX_MIB_TCPMIGRATEREQFAILURE); + reqsk_migrate_reset(nreq); + __reqsk_free(nreq); + } + + /* inet_csk_reqsk_queue_add() has already + * called inet_child_forget() on failure case. + */ + goto skip_child_forget; + } + } + inet_child_forget(sk, req, child); +skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 93474b1bea4e..e65f4ef024a4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, - u16 nlmsg_flags) + u16 nlmsg_flags, bool net_admin) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; @@ -444,6 +444,12 @@ static int inet_twsk_diag_fill(struct sock *sk, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + nlmsg_end(skb, nlh); return 0; } @@ -494,7 +500,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags); + return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); @@ -801,6 +807,8 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = sk->sk_mark; else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else if (sk->sk_state == TCP_TIME_WAIT) + entry.mark = inet_twsk(sk)->tw_mark; else entry.mark = 0; #ifdef CONFIG_SOCK_CGROUP_DATA diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c96866a53a66..80aeaf9e6e16 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -697,7 +697,7 @@ void inet_unhash(struct sock *sk) goto unlock; if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); + reuseport_stop_listen_sock(sk); if (ilb) { inet_unhash2(hashinfo, sk); ilb->count--; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a68bf4c6fe9b..12dca0c85f3c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -107,6 +107,8 @@ module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; +static const struct header_ops ipgre_header_ops; + static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, @@ -364,7 +366,10 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, raw_proto, false) < 0) goto drop; - if (tunnel->dev->type != ARPHRD_NONE) + /* Special case for ipgre_header_parse(), which expects the + * mac_header to point to the outer IP header. 
+ */ + if (tunnel->dev->header_ops == &ipgre_header_ops) skb_pop_mac_header(skb); else skb_reset_mac_header(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c3efc7d658f6..8d8a8da3ae7e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1054,7 +1054,7 @@ static int __ip_append_data(struct sock *sk, unsigned int datalen; unsigned int fraglen; unsigned int fraggap; - unsigned int alloclen; + unsigned int alloclen, alloc_extra; unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: @@ -1074,35 +1074,39 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; pagedlen = 0; + alloc_extra = hh_len + 15; + alloc_extra += exthdrlen; + + /* The last fragment gets additional space at tail. + * Note, with MSG_MORE we overallocate on fragments, + * because we have no idea what fragment will be + * the last. + */ + if (datalen == length + fraggap) + alloc_extra += rt->dst.trailer_len; + if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else if (!paged) + else if (!paged && + (fraglen + alloc_extra < SKB_MAX_ALLOC || + !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = min_t(int, fraglen, MAX_HEADER); pagedlen = fraglen - alloclen; } - alloclen += exthdrlen; - - /* The last fragment gets additional space at tail. - * Note, with MSG_MORE we overallocate on fragments, - * because we have no idea what fragment will be - * the last. - */ - if (datalen == length + fraggap) - alloclen += rt->dst.trailer_len; + alloclen += alloc_extra; if (transhdrlen) { - skb = sock_alloc_send_skb(sk, - alloclen + hh_len + 15, + skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = alloc_skb(alloclen + hh_len + 15, + skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index bbb56f5e06dd..366094c1ce6c 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -153,7 +153,6 @@ static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) } static const struct xfrm_type ipcomp_type = { - .description = "IPCOMP4", .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp4_init_state, diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index d5bfa087c23a..266c65577ba6 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -242,6 +242,8 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) if (!tun_dst) return 0; } + skb_reset_mac_header(skb); + return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 939792a38814..7b12a40dd465 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1317,7 +1317,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) } /* called from ip_ra_control(), before an RCU grace period, - * we dont need to call synchronize_rcu() here + * we don't need to call synchronize_rcu() here */ static void mrtsock_destruct(struct sock *sk) { @@ -1938,7 +1938,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; - /* For an (*,G) entry, we only check that the incomming + /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. 
*/ cache_proxy = mr_mfc_find_any_parent(mrt, vif); @@ -3007,7 +3007,6 @@ static const struct seq_operations ipmr_mfc_seq_ops = { #ifdef CONFIG_IP_PIMSM_V2 static const struct net_protocol pim_protocol = { .handler = pim_rcv, - .netns_ok = 1, }; #endif diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index ff437e4ed6db..55fc23a8f7a7 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,7 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 95a718397fd1..1e44a43acfe2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -573,7 +573,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) } } sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); out: sock_put(sk); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6d46297a99f8..b0d3a09dc84e 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -295,6 +295,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), + SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), + SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9a8c0892622b..6913979948d7 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -31,12 +31,6 @@ EXPORT_SYMBOL(inet_offloads); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - if (!prot->netns_ok) { - pr_err("Protocol %u is not namespace aware, cannot register.\n", - protocol); - return -EINVAL; - } - return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 
0 : -1; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 50a73178d63a..bb446e60cf58 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -280,7 +280,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) if (inet->recverr || harderr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } } @@ -929,7 +929,7 @@ int raw_abort(struct sock *sk, int err) lock_sock(sk); sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); __udp_disconnect(sk, 0); release_sock(sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a36ac98476f..99c06944501a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1306,7 +1306,7 @@ INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; mtu = READ_ONCE(dst->dev->mtu); @@ -1315,6 +1315,7 @@ INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = 576; } +out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); return mtu - lwtunnel_headroom(dst->lwtstate, mtu); @@ -1906,13 +1907,128 @@ out: hash_keys->addrs.v4addrs.dst = key_iph->daddr; } +static u32 fib_multipath_custom_hash_outer(const struct net *net, + const struct sk_buff *skb, + bool *p_has_inner) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys keys, hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); + return flow_hash_from_keys(&hash_keys); +} + +static u32 fib_multipath_custom_hash_inner(const struct net *net, + const struct sk_buff *skb, + bool has_inner) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys keys, hash_keys; + + /* We assume the packet carries an encapsulation, but if none was + * encountered during dissection of the outer flow, then there is no + * point in calling the flow dissector again. 
+ */ + if (!has_inner) + return 0; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, 0); + + if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) + return 0; + + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) + hash_keys.tags.flow_label = keys.tags.flow_label; + } + + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) + hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) + hash_keys.ports.src = keys.ports.src; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) + hash_keys.ports.dst = keys.ports.dst; + + return flow_hash_from_keys(&hash_keys); +} + +static u32 fib_multipath_custom_hash_skb(const struct net *net, + const struct sk_buff *skb) +{ + u32 mhash, mhash_inner; + bool has_inner = true; + + mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); + mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); + + return jhash_2words(mhash, mhash_inner, 0); +} + +static u32 fib_multipath_custom_hash_fl4(const struct net *net, + const struct flowi4 *fl4) +{ + u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields; + struct flow_keys hash_keys; + + if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) + return 0; + + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) + hash_keys.addrs.v4addrs.src = fl4->saddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) + hash_keys.addrs.v4addrs.dst = fl4->daddr; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) + hash_keys.basic.ip_proto = fl4->flowi4_proto; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) + hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) + hash_keys.ports.dst = fl4->fl4_dport; + + return flow_hash_from_keys(&hash_keys); +} + /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? 
fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; - u32 mhash; + u32 mhash = 0; switch (net->ipv4.sysctl_fib_multipath_hash_policy) { case 0: @@ -1924,6 +2040,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } + mhash = flow_hash_from_keys(&hash_keys); break; case 1: /* skb is currently provided only when forwarding */ @@ -1957,6 +2074,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } + mhash = flow_hash_from_keys(&hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -1987,9 +2105,15 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } + mhash = flow_hash_from_keys(&hash_keys); + break; + case 3: + if (skb) + mhash = fib_multipath_custom_hash_skb(net, skb); + else + mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } - mhash = flow_hash_from_keys(&hash_keys); if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a62934b9f15a..6f1e64d49232 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -19,6 +19,7 @@ #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> +#include <net/ip_fib.h> #include <net/route.h> #include <net/tcp.h> #include <net/udp.h> @@ -29,6 +30,7 @@ #include <net/netevent.h> static int two = 2; +static int three __maybe_unused = 3; static int four = 4; static int thousand = 1000; static int tcp_retr1_max = 255; @@ -48,6 +50,8 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static u32 fib_multipath_hash_fields_all_mask __maybe_unused = + FIB_MULTIPATH_HASH_FIELD_ALL_MASK; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -461,6 +465,22 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, return ret; } + +static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_fields); + ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); + + return ret; +} #endif static struct ctl_table ipv4_table[] = { @@ -941,6 +961,15 @@ static struct ctl_table ipv4_net_table[] = { }, #endif { + .procname = "tcp_migrate_req", + .data = &init_net.ipv4.sysctl_tcp_migrate_req, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { .procname = "tcp_reordering", .data = &init_net.ipv4.sysctl_tcp_reordering, .maxlen = sizeof(int), @@ -1050,7 +1079,16 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, + }, + { + .procname = "fib_multipath_hash_fields", + .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_fib_multipath_hash_fields, + .extra1 = SYSCTL_ONE, + .extra2 = 
&fib_multipath_hash_fields_all_mask, }, #endif { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 64bf179cc915..d5ab5f243640 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1738,8 +1738,8 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); -static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) +void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) { if (skb->tstamp) tss->ts[0] = ktime_to_timespec64(skb->tstamp); @@ -2024,8 +2024,6 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) -static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping_internal *tss); static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) @@ -2197,8 +2195,8 @@ out: #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ -static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping_internal *tss) +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; @@ -3061,7 +3059,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.offset = 0; } - sk->sk_error_report(sk); + sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); @@ -4450,7 +4448,7 @@ int tcp_abort(struct sock *sk, int err) sk->sk_err = err; /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); - sk->sk_error_report(sk); + sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ad9d17923fc5..f26916a62f25 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -163,6 +163,28 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } +static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, + long timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -184,11 +206,11 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { - int data, err = 0; long timeo; + int data; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = tcp_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; @@ -196,14 +218,9 @@ msg_bytes_ready: sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); } - if (err) { - ret = err; - goto out; - } copied = -EAGAIN; } ret = copied; -out: release_sock(sk); sk_psock_put(sk, psock); return ret; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index af2814c9342a..47c32604d38f 100644 --- a/net/ipv4/tcp_fastopen.c +++ 
b/net/ipv4/tcp_fastopen.c @@ -526,7 +526,7 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) if (!tfo_da_times) return false; - /* Limit timout to max: 2^6 * initial timeout */ + /* Limit timeout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); timeout = multiplier * tfo_bh_timeout * HZ; if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cf4dd532d1c..e6ca5a1f3b59 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2816,8 +2816,17 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, *rexmit = REXMIT_LOST; } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, + bool *do_lost) { struct tcp_sock *tp = tcp_sk(sk); @@ -2842,7 +2851,9 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); - return true; + } else { + /* Partial ACK arrived. Force fast retransmit. */ + *do_lost = tcp_force_fast_retransmit(sk); } return false; } @@ -2866,14 +2877,6 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) } } -static bool tcp_force_fast_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - return after(tcp_highest_sack_seq(tp), - tp->snd_una + tp->reordering * tp->mss_cache); -} - /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2943,17 +2946,21 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) tcp_add_reno_sack(sk, num_dupack, ece_ack); - } else { - if (tcp_try_undo_partial(sk, prior_snd_una)) - return; - /* Partial ACK arrived. Force fast retransmit. */ - do_lost = tcp_force_fast_retransmit(sk); - } - if (tcp_try_undo_dsack(sk)) { - tcp_try_keep_open(sk); + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) return; - } + + if (tcp_try_undo_dsack(sk)) + tcp_try_keep_open(sk); + tcp_identify_packet_loss(sk, ack_flag); + if (icsk->icsk_ca_state != TCP_CA_Recovery) { + if (!tcp_time_to_recover(sk, flag)) + return; + /* Undo reverts the recovery state. If loss is evident, + * starts a new recovery (e.g. 
reordering then loss); + */ + tcp_enter_recovery(sk, ece_ack); + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); @@ -4263,7 +4270,7 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) tcp_done(sk); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); + sk_error_report(sk); } /* @@ -5885,6 +5892,7 @@ step5: return; csum_error: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 312184cead57..e66ad6bfe808 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -585,7 +585,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) if (!sock_owned_by_user(sk)) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); tcp_done(sk); } else { @@ -613,7 +613,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) inet = inet_sk(sk); if (!sock_owned_by_user(sk) && inet->recverr) { sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); } else { /* Only an error on timeout */ sk->sk_err_soft = err; } @@ -1731,6 +1731,7 @@ discard: return 0; csum_err: + trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1801,6 +1802,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); + trace_tcp_bad_csum(skb); __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); return true; @@ -2000,13 +2002,21 @@ process: goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; + nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); + if (!nsk) { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + sk = nsk; + /* reuseport_migrate_sock() has already held one sk_refcnt + * before returning. + */ + } else { + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ + sock_hold(sk); } - /* We own a reference on the listener, increase it again - * as we might lose it too soon. 
- */ - sock_hold(sk); refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { @@ -2098,6 +2108,7 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: + trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7513ba45553d..0a4f3f16140a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -775,8 +775,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, goto listen_overflow; if (own_req && rsk_drop_req(req)) { - reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_drop_and_put(sk, req); + reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } @@ -786,6 +786,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + if (sk != req->rsk_listener) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4ef08079ccfa..20cf4a98c69d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -68,7 +68,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; - sk->sk_error_report(sk); + sk_error_report(sk); tcp_write_queue_purge(sk); tcp_done(sk); @@ -441,7 +441,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * This function gets called when the kernel timer for a TCP packet * of this socket expires. * - * It handles retransmission, timer adjustment and other necesarry measures. + * It handles retransmission, timer adjustment and other necessary measures. * * Returns: Nothing (void) */ @@ -766,7 +766,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) if (!sock_owned_by_user(sk)) { if (tp->compressed_ack) { /* Since we have to send one ack finally, - * substract one from tp->compressed_ack to keep + * subtract one from tp->compressed_ack to keep * LINUX_MIB_TCPACKCOMPRESSED accurate. 
*/ tp->compressed_ack--; diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 3bb448761ca3..07c4c93b9fdb 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -221,7 +221,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = { static int __init tcp_yeah_register(void) { - BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); + BUILD_BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_yeah); return 0; } diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index e44aaf41a138..5048c47c79b2 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -218,7 +218,6 @@ static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, - .netns_ok = 1, }; #if IS_ENABLED(CONFIG_IPV6) @@ -226,7 +225,6 @@ static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, - .netns_ok = 1, }; #endif @@ -235,7 +233,6 @@ static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, - .netns_ok = 1, }; #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1307ad0d3b9e..62682807b4b2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -776,7 +776,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); out: return 0; } @@ -1798,11 +1798,13 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, if (used <= 0) { if (!copied) copied = used; + kfree_skb(skb); break; } else if (used <= skb->len) { copied += used; } + kfree_skb(skb); if (!desc->count) break; } @@ -2867,7 +2869,7 @@ int udp_abort(struct sock *sk, int err) goto out; sk->sk_err = err; - sk->sk_error_report(sk); + sk_error_report(sk); __udp_disconnect(sk, 0); out: diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 954c4591a6fd..45b8782aec0c 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -21,6 +21,45 @@ static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); } +static bool udp_sk_has_data(struct sock *sk) +{ + return !skb_queue_empty(&udp_sk(sk)->reader_queue) || + !skb_queue_empty(&sk->sk_receive_queue); +} + +static bool psock_has_data(struct sk_psock *psock) +{ + return !skb_queue_empty(&psock->ingress_skb) || + !sk_psock_queue_empty(psock); +} + +#define udp_msg_has_data(__sk, __psock) \ + ({ udp_sk_has_data(__sk) || psock_has_data(__psock); }) + +static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, + long timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = udp_msg_has_data(sk, psock); + if (!ret) { + wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + ret = udp_msg_has_data(sk, psock); + } + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -34,8 +73,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(!psock)) return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); - if (sk_psock_queue_empty(psock)) { + if 
(!psock_has_data(psock)) { ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; } @@ -43,26 +81,21 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { - int data, err = 0; long timeo; + int data; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = udp_msg_wait_data(sk, psock, timeo); if (data) { - if (!sk_psock_queue_empty(psock)) + if (psock_has_data(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; } - if (err) { - ret = err; - goto out; - } copied = -EAGAIN; } ret = copied; out: - release_sock(sk); sk_psock_put(sk, psock); return ret; } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index bd8773b49e72..cd1cd68adeec 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -31,7 +31,6 @@ static const struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, - .netns_ok = 1, }; struct proto udplite_prot = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index ea595c8549c7..2fe5860c21d6 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -181,21 +181,18 @@ static const struct net_protocol esp4_protocol = { .handler = xfrm4_esp_rcv, .err_handler = xfrm4_esp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol ah4_protocol = { .handler = xfrm4_ah_rcv, .err_handler = xfrm4_ah_err, .no_policy = 1, - .netns_ok = 1, }; static const struct net_protocol ipcomp4_protocol = { .handler = xfrm4_ipcomp_rcv, .err_handler = xfrm4_ipcomp_err, .no_policy = 1, - .netns_ok = 1, }; static const struct xfrm_input_afinfo xfrm4_input_afinfo = { diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index fb0648e7fb32..f4555a88f86b 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -42,7 +42,6 @@ static void ipip_destroy(struct xfrm_state *x) } static const struct xfrm_type ipip_type = { - .description = "IPIP", .owner = THIS_MODULE, .proto = IPPROTO_IPIP, .init_state = ipip_init_state, |
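
The hunks above introduce three tunables under /proc/sys/net/ipv4: tcp_migrate_req (0/1, enabling migration of requests/child sockets between SO_REUSEPORT listeners), fib_multipath_hash_policy (whose maximum is raised to 3 for the new custom multipath hash), and fib_multipath_hash_fields (a bitmask checked against FIB_MULTIPATH_HASH_FIELD_ALL_MASK). Below is a minimal userspace sketch, not part of the patch, showing how these knobs might be exercised; the write_sysctl() helper and the mask value 7 are hypothetical placeholders, since the individual FIB_MULTIPATH_HASH_FIELD_* bit values live in include/net/ip_fib.h and are not shown in this diff.

/* Sketch only: toggle the sysctls added by this series via procfs.
 * Assumes a kernel that carries these patches; run as root.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	/* Allow kernel to migrate requests/child sockets to another
	 * SO_REUSEPORT listener when their own listener is closed.
	 */
	write_sysctl("/proc/sys/net/ipv4/tcp_migrate_req", "1");

	/* Policy 3 selects the new custom (field-mask driven) hash. */
	write_sysctl("/proc/sys/net/ipv4/fib_multipath_hash_policy", "3");

	/* Placeholder mask: which packet fields feed the custom hash.
	 * Real bit meanings come from FIB_MULTIPATH_HASH_FIELD_* in
	 * include/net/ip_fib.h, which is outside this diff.
	 */
	write_sysctl("/proc/sys/net/ipv4/fib_multipath_hash_fields", "7");

	return 0;
}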