summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c64
-rw-r--r--net/ipv4/bpf_tcp_ca.c2
-rw-r--r--net/ipv4/cipso_ipv4.c4
-rw-r--r--net/ipv4/datagram.c2
-rw-r--r--net/ipv4/devinet.c23
-rw-r--r--net/ipv4/esp4.c2
-rw-r--r--net/ipv4/igmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c2
-rw-r--r--net/ipv4/inet_diag.c26
-rw-r--r--net/ipv4/inet_hashtables.c83
-rw-r--r--net/ipv4/inet_timewait_sock.c10
-rw-r--r--net/ipv4/ip_gre.c6
-rw-r--r--net/ipv4/ip_output.c28
-rw-r--r--net/ipv4/ip_sockglue.c405
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ip_vti.c4
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c19
-rw-r--r--net/ipv4/nexthop.c93
-rw-r--r--net/ipv4/ping.c7
-rw-r--r--net/ipv4/raw.c28
-rw-r--r--net/ipv4/route.c12
-rw-r--r--net/ipv4/tcp.c161
-rw-r--r--net/ipv4/tcp_fastopen.c8
-rw-r--r--net/ipv4/tcp_input.c69
-rw-r--r--net/ipv4/tcp_ipv4.c26
-rw-r--r--net/ipv4/tcp_metrics.c89
-rw-r--r--net/ipv4/tcp_minisocks.c18
-rw-r--r--net/ipv4/tcp_output.c46
-rw-r--r--net/ipv4/tcp_timer.c93
-rw-r--r--net/ipv4/udp.c105
-rw-r--r--net/ipv4/udp_offload.c23
-rw-r--r--net/ipv4/udp_tunnel_core.c2
-rw-r--r--net/ipv4/xfrm4_policy.c11
33 files changed, 773 insertions, 702 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9b2ca2fcc5a1..3d2e30e20473 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -187,24 +187,13 @@ static int inet_autobind(struct sock *sk)
return 0;
}
-/*
- * Move a socket into listening state.
- */
-int inet_listen(struct socket *sock, int backlog)
+int __inet_listen_sk(struct sock *sk, int backlog)
{
- struct sock *sk = sock->sk;
- unsigned char old_state;
+ unsigned char old_state = sk->sk_state;
int err, tcp_fastopen;
- lock_sock(sk);
-
- err = -EINVAL;
- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
- goto out;
-
- old_state = sk->sk_state;
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- goto out;
+ return -EINVAL;
WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
/* Really, if the socket is already in listen state
@@ -227,10 +216,27 @@ int inet_listen(struct socket *sock, int backlog)
err = inet_csk_listen_start(sk);
if (err)
- goto out;
+ return err;
+
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
- err = 0;
+ return 0;
+}
+
+/*
+ * Move a socket into listening state.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int err = -EINVAL;
+
+ lock_sock(sk);
+
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto out;
+
+ err = __inet_listen_sk(sk, backlog);
out:
release_sock(sk);
@@ -325,14 +331,14 @@ lookup_protocol:
sk->sk_reuse = SK_CAN_REUSE;
inet = inet_sk(sk);
- inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
- inet->nodefrag = 0;
+ inet_clear_bit(NODEFRAG, sk);
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
- inet->hdrincl = 1;
+ inet_set_bit(HDRINCL, sk);
}
if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
@@ -340,7 +346,7 @@ lookup_protocol:
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- inet->inet_id = 0;
+ atomic_set(&inet->inet_id, 0);
sock_init_data(sock, sk);
@@ -350,9 +356,9 @@ lookup_protocol:
sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
inet->uc_ttl = -1;
- inet->mc_loop = 1;
+ inet_set_bit(MC_LOOP, sk);
inet->mc_ttl = 1;
- inet->mc_all = 1;
+ inet_set_bit(MC_ALL, sk);
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
@@ -431,9 +437,8 @@ int inet_release(struct socket *sock)
}
EXPORT_SYMBOL(inet_release);
-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
- struct sock *sk = sock->sk;
u32 flags = BIND_WITH_LOCK;
int err;
@@ -454,6 +459,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return __inet_bind(sk, uaddr, addr_len, flags);
}
+
+int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ return inet_bind_sk(sock->sk, uaddr, addr_len);
+}
EXPORT_SYMBOL(inet_bind);
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
@@ -519,7 +529,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
- if (snum || !(inet->bind_address_no_port ||
+ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
err = sk->sk_prot->get_port(sk, snum);
if (err) {
@@ -646,7 +656,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- if (inet_sk(sk)->defer_connect)
+ if (inet_test_bit(DEFER_CONNECT, sk))
err = is_sendmsg ? -EINPROGRESS : -EISCONN;
else
err = -EALREADY;
@@ -669,7 +679,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTING;
- if (!err && inet_sk(sk)->defer_connect)
+ if (!err && inet_test_bit(DEFER_CONNECT, sk))
goto out;
/* Just entered SS_CONNECTING state; the only
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 4406d796cc2f..39dcccf0f174 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -51,8 +51,6 @@ static bool is_unsupported(u32 member_offset)
return false;
}
-extern struct btf *btf_vmlinux;
-
static bool bpf_tcp_ca_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 79ae7204e8ed..d048aa833293 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1881,7 +1881,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
old = rcu_dereference_protected(sk_inet->inet_opt,
lockdep_sock_is_held(sk));
- if (sk_inet->is_icsk) {
+ if (inet_test_bit(IS_ICSK, sk)) {
sk_conn = inet_csk(sk);
if (old)
sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
@@ -2051,7 +2051,7 @@ void cipso_v4_sock_delattr(struct sock *sk)
sk_inet = inet_sk(sk);
hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
- if (sk_inet->is_icsk && hdr_delta > 0) {
+ if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) {
struct inet_connection_sock *sk_conn = inet_csk(sk);
sk_conn->icsk_ext_hdr_len -= hdr_delta;
sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 4d1af0cd7d99..cb5dbee9e018 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -73,7 +73,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
- inet->inet_id = get_random_u16();
+ atomic_set(&inet->inet_id, get_random_u16());
sk_dst_set(sk, &rt->dst);
err = 0;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 89087844ea6e..9cf64ee47dd2 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -509,6 +509,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
return -EEXIST;
}
if (ifa1->ifa_scope != ifa->ifa_scope) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value");
inet_free_ifa(ifa);
return -EINVAL;
}
@@ -664,6 +665,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm = nlmsg_data(nlh);
in_dev = inetdev_by_index(net, ifm->ifa_index);
if (!in_dev) {
+ NL_SET_ERR_MSG(extack, "ipv4: Device not found");
err = -ENODEV;
goto errout;
}
@@ -688,6 +690,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
}
+ NL_SET_ERR_MSG(extack, "ipv4: Address not found");
err = -EADDRNOTAVAIL;
errout:
return err;
@@ -839,13 +842,23 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ifm = nlmsg_data(nlh);
err = -EINVAL;
- if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL])
+
+ if (ifm->ifa_prefixlen > 32) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length");
+ goto errout;
+ }
+
+ if (!tb[IFA_LOCAL]) {
+ NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied");
goto errout;
+ }
dev = __dev_get_by_index(net, ifm->ifa_index);
err = -ENODEV;
- if (!dev)
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "ipv4: Device not found");
goto errout;
+ }
in_dev = __in_dev_get_rtnl(dev);
err = -ENOBUFS;
@@ -897,6 +910,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ci = nla_data(tb[IFA_CACHEINFO]);
if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid");
err = -EINVAL;
goto errout_free;
}
@@ -954,6 +968,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
int ret = ip_mc_autojoin_config(net, true, ifa);
if (ret < 0) {
+ NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed");
inet_free_ifa(ifa);
return ret;
}
@@ -967,8 +982,10 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
inet_free_ifa(ifa);
if (nlh->nlmsg_flags & NLM_F_EXCL ||
- !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG(extack, "ipv4: Address already assigned");
return -EEXIST;
+ }
ifa = ifa_existing;
if (ifa->ifa_rt_priority != new_metric) {
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ba06ed42e428..2be2d4922557 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1132,7 +1132,7 @@ static int esp_init_authenc(struct xfrm_state *x,
err = crypto_aead_setkey(aead, key, keylen);
free_key:
- kfree(key);
+ kfree_sensitive(key);
error:
return err;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 48ff5f13e797..0c9e768e5628 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2658,7 +2658,7 @@ int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
(sdif && pmc->multi.imr_ifindex == sdif)))
break;
}
- ret = inet->mc_all;
+ ret = inet_test_bit(MC_ALL, sk);
if (!pmc)
goto unlock;
psl = rcu_dereference(pmc->sflist);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0cc19cfbb673..aeebe8816689 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1019,7 +1019,7 @@ static void reqsk_timer_handler(struct timer_list *t)
icsk = inet_csk(sk_listener);
net = sock_net(sk_listener);
- max_syn_ack_retries = icsk->icsk_syn_retries ? :
+ max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index b812eb36f0e3..e13a84433413 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -150,7 +150,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
}
#endif
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark)))
goto errout;
if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) ||
@@ -182,17 +182,17 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_inode = sock_i_ino(sk);
memset(&inet_sockopt, 0, sizeof(inet_sockopt));
- inet_sockopt.recverr = inet->recverr;
- inet_sockopt.is_icsk = inet->is_icsk;
- inet_sockopt.freebind = inet->freebind;
- inet_sockopt.hdrincl = inet->hdrincl;
- inet_sockopt.mc_loop = inet->mc_loop;
- inet_sockopt.transparent = inet->transparent;
- inet_sockopt.mc_all = inet->mc_all;
- inet_sockopt.nodefrag = inet->nodefrag;
- inet_sockopt.bind_address_no_port = inet->bind_address_no_port;
- inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884;
- inet_sockopt.defer_connect = inet->defer_connect;
+ inet_sockopt.recverr = inet_test_bit(RECVERR, sk);
+ inet_sockopt.is_icsk = inet_test_bit(IS_ICSK, sk);
+ inet_sockopt.freebind = inet_test_bit(FREEBIND, sk);
+ inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk);
+ inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk);
+ inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk);
+ inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk);
+ inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk);
+ inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
+ inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk);
+ inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk);
if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt),
&inet_sockopt))
goto errout;
@@ -799,7 +799,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
entry.ifindex = sk->sk_bound_dev_if;
entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
if (sk_fullsock(sk))
- entry.mark = sk->sk_mark;
+ entry.mark = READ_ONCE(sk->sk_mark);
else if (sk->sk_state == TCP_NEW_SYN_RECV)
entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
else if (sk->sk_state == TCP_TIME_WAIT)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e7391bf310a7..7876b7d703cb 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -28,9 +28,9 @@
#include <net/tcp.h>
#include <net/sock_reuseport.h>
-static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
+u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
{
static u32 inet_ehash_secret __read_mostly;
@@ -39,6 +39,7 @@ static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
return __inet_ehashfn(laddr, lport, faddr, fport,
inet_ehash_secret + net_hash_mix(net));
}
+EXPORT_SYMBOL_GPL(inet_ehashfn);
/* This function handles inet_sock, but also timewait and request sockets
* for IPv4/IPv6.
@@ -332,20 +333,38 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
-static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
- struct sk_buff *skb, int doff,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum)
+/**
+ * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
+ * @net: network namespace.
+ * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
+ * @skb: context for a potential SK_REUSEPORT program.
+ * @doff: header offset.
+ * @saddr: source address.
+ * @sport: source port.
+ * @daddr: destination address.
+ * @hnum: destination port in host byte order.
+ * @ehashfn: hash function used to generate the fallback hash.
+ *
+ * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
+ * the selected sock or an error.
+ */
+struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
+ struct sk_buff *skb, int doff,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned short hnum,
+ inet_ehashfn_t *ehashfn)
{
struct sock *reuse_sk = NULL;
u32 phash;
if (sk->sk_reuseport) {
- phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
+ net, daddr, hnum, saddr, sport);
reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
}
return reuse_sk;
}
+EXPORT_SYMBOL_GPL(inet_lookup_reuseport);
/*
* Here are some nice properties to exploit here. The BSD API
@@ -369,8 +388,8 @@ static struct sock *inet_lhash2_lookup(struct net *net,
sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
- result = lookup_reuseport(net, sk, skb, doff,
- saddr, sport, daddr, hnum);
+ result = inet_lookup_reuseport(net, sk, skb, doff,
+ saddr, sport, daddr, hnum, inet_ehashfn);
if (result)
return result;
@@ -382,24 +401,23 @@ static struct sock *inet_lhash2_lookup(struct net *net,
return result;
}
-static inline struct sock *inet_lookup_run_bpf(struct net *net,
- struct inet_hashinfo *hashinfo,
- struct sk_buff *skb, int doff,
- __be32 saddr, __be16 sport,
- __be32 daddr, u16 hnum, const int dif)
+struct sock *inet_lookup_run_sk_lookup(struct net *net,
+ int protocol,
+ struct sk_buff *skb, int doff,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, u16 hnum, const int dif,
+ inet_ehashfn_t *ehashfn)
{
struct sock *sk, *reuse_sk;
bool no_reuseport;
- if (hashinfo != net->ipv4.tcp_death_row.hashinfo)
- return NULL; /* only TCP is supported */
-
- no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
+ no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
daddr, hnum, dif, &sk);
if (no_reuseport || IS_ERR_OR_NULL(sk))
return sk;
- reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
+ reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
+ ehashfn);
if (reuse_sk)
sk = reuse_sk;
return sk;
@@ -417,9 +435,11 @@ struct sock *__inet_lookup_listener(struct net *net,
unsigned int hash2;
/* Lookup redirect from BPF */
- if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
- result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
- saddr, sport, daddr, hnum, dif);
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+ hashinfo == net->ipv4.tcp_death_row.hashinfo) {
+ result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
+ saddr, sport, daddr, hnum, dif,
+ inet_ehashfn);
if (result)
goto done;
}
@@ -650,20 +670,8 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
spin_lock(lock);
if (osk) {
WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
- ret = sk_hashed(osk);
- if (ret) {
- /* Before deleting the node, we insert a new one to make
- * sure that the look-up-sk process would not miss either
- * of them and that at least one node would exist in ehash
- * table all the time. Otherwise there's a tiny chance
- * that lookup process could find nothing in ehash table.
- */
- __sk_nulls_add_node_tail_rcu(sk, list);
- sk_nulls_del_node_init_rcu(osk);
- }
- goto unlock;
- }
- if (found_dup_sk) {
+ ret = sk_nulls_del_node_init_rcu(osk);
+ } else if (found_dup_sk) {
*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
if (*found_dup_sk)
ret = false;
@@ -672,7 +680,6 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
if (ret)
__sk_nulls_add_node_rcu(sk, list);
-unlock:
spin_unlock(lock);
return ret;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 40052414c7c7..dd37a5bf6881 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -88,10 +88,10 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(inet_twsk_put);
-static void inet_twsk_add_node_tail_rcu(struct inet_timewait_sock *tw,
- struct hlist_nulls_head *list)
+static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+ struct hlist_nulls_head *list)
{
- hlist_nulls_add_tail_rcu(&tw->tw_node, list);
+ hlist_nulls_add_head_rcu(&tw->tw_node, list);
}
static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
@@ -144,7 +144,7 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
spin_lock(lock);
- inet_twsk_add_node_tail_rcu(tw, &ehead->chain);
+ inet_twsk_add_node_rcu(tw, &ehead->chain);
/* Step 3: Remove SK from hash chain */
if (__sk_nulls_del_node_init_rcu(sk))
@@ -203,7 +203,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_reuseport = sk->sk_reuseport;
tw->tw_hash = sk->sk_hash;
tw->tw_ipv6only = 0;
- tw->tw_transparent = inet->transparent;
+ tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 81a1cce1a7d1..22a26d1d29a0 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -548,7 +548,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
goto err_free_skb;
if (skb->len > dev->mtu + dev->hard_header_len) {
- pskb_trim(skb, dev->mtu + dev->hard_header_len);
+ if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+ goto err_free_skb;
truncate = true;
}
@@ -689,7 +690,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
goto free_skb;
if (skb->len > dev->mtu + dev->hard_header_len) {
- pskb_trim(skb, dev->mtu + dev->hard_header_len);
+ if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+ goto free_skb;
truncate = true;
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6e70839257f7..43ba4b77b248 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(const struct inet_sock *inet,
const struct dst_entry *dst)
{
- int ttl = inet->uc_ttl;
+ int ttl = READ_ONCE(inet->uc_ttl);
if (ttl < 0)
ttl = ip4_dst_hoplimit(dst);
@@ -184,9 +184,9 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
ip_options_build(skb, &opt->opt, daddr, rt);
}
- skb->priority = sk->sk_priority;
+ skb->priority = READ_ONCE(sk->sk_priority);
if (!skb->mark)
- skb->mark = sk->sk_mark;
+ skb->mark = READ_ONCE(sk->sk_mark);
/* Send it out. */
return ip_local_out(net, skb->sk, skb);
@@ -216,7 +216,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
- if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+ if (res != LWTUNNEL_XMIT_CONTINUE)
return res;
}
@@ -236,7 +236,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
- return -EINVAL;
+ return PTR_ERR(neigh);
}
static int ip_finish_output_gso(struct net *net, struct sock *sk,
@@ -528,8 +528,8 @@ packet_routed:
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->priority = READ_ONCE(sk->sk_priority);
+ skb->mark = READ_ONCE(sk->sk_mark);
res = ip_local_out(net, sk, skb);
rcu_read_unlock();
@@ -1039,7 +1039,7 @@ static int __ip_append_data(struct sock *sk,
}
}
} else if ((flags & MSG_SPLICE_PAGES) && length) {
- if (inet->hdrincl)
+ if (inet_test_bit(HDRINCL, sk))
return -EPERM;
if (rt->dst.dev->features & NETIF_F_SG &&
getfrag == ip_generic_getfrag)
@@ -1158,10 +1158,15 @@ alloc_new_skb:
}
copy = datalen - transhdrlen - fraggap - pagedlen;
+ /* [!] NOTE: copy will be negative if pagedlen>0
+ * because then the equation reduces to -fraggap.
+ */
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
+ } else if (flags & MSG_SPLICE_PAGES) {
+ copy = 0;
}
offset += copy;
@@ -1209,6 +1214,10 @@ alloc_new_skb:
} else if (flags & MSG_SPLICE_PAGES) {
struct msghdr *msg = from;
+ err = -EIO;
+ if (WARN_ON_ONCE(copy > msg->msg_iter.count))
+ goto error;
+
err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
sk->sk_allocation);
if (err < 0)
@@ -1458,7 +1467,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
* so icmphdr does not in skb linear region and can not get icmp_type
* by icmp_hdr(skb)->type.
*/
- if (sk->sk_type == SOCK_RAW && !inet_sk(sk)->hdrincl)
+ if (sk->sk_type == SOCK_RAW &&
+ !inet_test_bit(HDRINCL, sk))
icmp_type = fl4->fl4_icmp_type;
else
icmp_type = icmp_hdr(skb)->type;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8e97d8d4cc9d..d1c73660b844 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -171,8 +171,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, int tlen, int offset)
{
- struct inet_sock *inet = inet_sk(sk);
- unsigned int flags = inet->cmsg_flags;
+ unsigned long flags = inet_cmsg_flags(inet_sk(sk));
+
+ if (!flags)
+ return;
/* Ordered by supposed usage frequency */
if (flags & IP_CMSG_PKTINFO) {
@@ -431,7 +433,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
serr->port = port;
if (skb_pull(skb, payload - skb->data)) {
- if (inet_sk(sk)->recverr_rfc4884)
+ if (inet_test_bit(RECVERR_RFC4884, sk))
ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
skb_reset_transport_header(skb);
@@ -444,12 +446,11 @@ EXPORT_SYMBOL_GPL(ip_icmp_error);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
{
- struct inet_sock *inet = inet_sk(sk);
struct sock_exterr_skb *serr;
struct iphdr *iph;
struct sk_buff *skb;
- if (!inet->recverr)
+ if (!inet_test_bit(RECVERR, sk))
return;
skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
@@ -568,7 +569,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
- if (inet_sk(sk)->cmsg_flags)
+ if (inet_cmsg_flags(inet_sk(sk)))
ip_cmsg_recv(msg, skb);
}
@@ -592,7 +593,7 @@ void __ip_sock_set_tos(struct sock *sk, int val)
}
if (inet_sk(sk)->tos != val) {
inet_sk(sk)->tos = val;
- sk->sk_priority = rt_tos2priority(val);
+ WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
sk_dst_reset(sk);
}
}
@@ -607,17 +608,13 @@ EXPORT_SYMBOL(ip_sock_set_tos);
void ip_sock_set_freebind(struct sock *sk)
{
- lock_sock(sk);
- inet_sk(sk)->freebind = true;
- release_sock(sk);
+ inet_set_bit(FREEBIND, sk);
}
EXPORT_SYMBOL(ip_sock_set_freebind);
void ip_sock_set_recverr(struct sock *sk)
{
- lock_sock(sk);
- inet_sk(sk)->recverr = true;
- release_sock(sk);
+ inet_set_bit(RECVERR, sk);
}
EXPORT_SYMBOL(ip_sock_set_recverr);
@@ -634,9 +631,7 @@ EXPORT_SYMBOL(ip_sock_set_mtu_discover);
void ip_sock_set_pktinfo(struct sock *sk)
{
- lock_sock(sk);
- inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO;
- release_sock(sk);
+ inet_set_bit(PKTINFO, sk);
}
EXPORT_SYMBOL(ip_sock_set_pktinfo);
@@ -950,6 +945,102 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
if (ip_mroute_opt(optname))
return ip_mroute_setsockopt(sk, optname, optval, optlen);
+ /* Handle options that can be set without locking the socket. */
+ switch (optname) {
+ case IP_PKTINFO:
+ inet_assign_bit(PKTINFO, sk, val);
+ return 0;
+ case IP_RECVTTL:
+ inet_assign_bit(TTL, sk, val);
+ return 0;
+ case IP_RECVTOS:
+ inet_assign_bit(TOS, sk, val);
+ return 0;
+ case IP_RECVOPTS:
+ inet_assign_bit(RECVOPTS, sk, val);
+ return 0;
+ case IP_RETOPTS:
+ inet_assign_bit(RETOPTS, sk, val);
+ return 0;
+ case IP_PASSSEC:
+ inet_assign_bit(PASSSEC, sk, val);
+ return 0;
+ case IP_RECVORIGDSTADDR:
+ inet_assign_bit(ORIGDSTADDR, sk, val);
+ return 0;
+ case IP_RECVFRAGSIZE:
+ if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+ return -EINVAL;
+ inet_assign_bit(RECVFRAGSIZE, sk, val);
+ return 0;
+ case IP_RECVERR:
+ inet_assign_bit(RECVERR, sk, val);
+ if (!val)
+ skb_errqueue_purge(&sk->sk_error_queue);
+ return 0;
+ case IP_RECVERR_RFC4884:
+ if (val < 0 || val > 1)
+ return -EINVAL;
+ inet_assign_bit(RECVERR_RFC4884, sk, val);
+ return 0;
+ case IP_FREEBIND:
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(FREEBIND, sk, val);
+ return 0;
+ case IP_HDRINCL:
+ if (sk->sk_type != SOCK_RAW)
+ return -ENOPROTOOPT;
+ inet_assign_bit(HDRINCL, sk, val);
+ return 0;
+ case IP_MULTICAST_LOOP:
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(MC_LOOP, sk, val);
+ return 0;
+ case IP_MULTICAST_ALL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val != 0 && val != 1)
+ return -EINVAL;
+ inet_assign_bit(MC_ALL, sk, val);
+ return 0;
+ case IP_TRANSPARENT:
+ if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(TRANSPARENT, sk, val);
+ return 0;
+ case IP_NODEFRAG:
+ if (sk->sk_type != SOCK_RAW)
+ return -ENOPROTOOPT;
+ inet_assign_bit(NODEFRAG, sk, val);
+ return 0;
+ case IP_BIND_ADDRESS_NO_PORT:
+ inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val);
+ return 0;
+ case IP_TTL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val != -1 && (val < 1 || val > 255))
+ return -EINVAL;
+ WRITE_ONCE(inet->uc_ttl, val);
+ return 0;
+ case IP_MINTTL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+
+ if (val)
+ static_branch_enable(&ip4_min_ttl);
+
+ WRITE_ONCE(inet->min_ttl, val);
+ return 0;
+ }
+
err = 0;
if (needs_rtnl)
rtnl_lock();
@@ -967,7 +1058,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
break;
old = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
- if (inet->is_icsk) {
+ if (inet_test_bit(IS_ICSK, sk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == PF_INET ||
@@ -989,111 +1080,27 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
kfree_rcu(old, rcu);
break;
}
- case IP_PKTINFO:
- if (val)
- inet->cmsg_flags |= IP_CMSG_PKTINFO;
- else
- inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
- break;
- case IP_RECVTTL:
- if (val)
- inet->cmsg_flags |= IP_CMSG_TTL;
- else
- inet->cmsg_flags &= ~IP_CMSG_TTL;
- break;
- case IP_RECVTOS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_TOS;
- else
- inet->cmsg_flags &= ~IP_CMSG_TOS;
- break;
- case IP_RECVOPTS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_RECVOPTS;
- else
- inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
- break;
- case IP_RETOPTS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_RETOPTS;
- else
- inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
- break;
- case IP_PASSSEC:
- if (val)
- inet->cmsg_flags |= IP_CMSG_PASSSEC;
- else
- inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
- break;
- case IP_RECVORIGDSTADDR:
- if (val)
- inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
- else
- inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
- break;
case IP_CHECKSUM:
if (val) {
- if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) {
+ if (!(inet_test_bit(CHECKSUM, sk))) {
inet_inc_convert_csum(sk);
- inet->cmsg_flags |= IP_CMSG_CHECKSUM;
+ inet_set_bit(CHECKSUM, sk);
}
} else {
- if (inet->cmsg_flags & IP_CMSG_CHECKSUM) {
+ if (inet_test_bit(CHECKSUM, sk)) {
inet_dec_convert_csum(sk);
- inet->cmsg_flags &= ~IP_CMSG_CHECKSUM;
+ inet_clear_bit(CHECKSUM, sk);
}
}
break;
- case IP_RECVFRAGSIZE:
- if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
- goto e_inval;
- if (val)
- inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
- else
- inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
- break;
case IP_TOS: /* This sets both TOS and Precedence */
__ip_sock_set_tos(sk, val);
break;
- case IP_TTL:
- if (optlen < 1)
- goto e_inval;
- if (val != -1 && (val < 1 || val > 255))
- goto e_inval;
- inet->uc_ttl = val;
- break;
- case IP_HDRINCL:
- if (sk->sk_type != SOCK_RAW) {
- err = -ENOPROTOOPT;
- break;
- }
- inet->hdrincl = val ? 1 : 0;
- break;
- case IP_NODEFRAG:
- if (sk->sk_type != SOCK_RAW) {
- err = -ENOPROTOOPT;
- break;
- }
- inet->nodefrag = val ? 1 : 0;
- break;
- case IP_BIND_ADDRESS_NO_PORT:
- inet->bind_address_no_port = val ? 1 : 0;
- break;
case IP_MTU_DISCOVER:
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
goto e_inval;
inet->pmtudisc = val;
break;
- case IP_RECVERR:
- inet->recverr = !!val;
- if (!val)
- skb_queue_purge(&sk->sk_error_queue);
- break;
- case IP_RECVERR_RFC4884:
- if (val < 0 || val > 1)
- goto e_inval;
- inet->recverr_rfc4884 = !!val;
- break;
case IP_MULTICAST_TTL:
if (sk->sk_type == SOCK_STREAM)
goto e_inval;
@@ -1105,11 +1112,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
inet->mc_ttl = val;
break;
- case IP_MULTICAST_LOOP:
- if (optlen < 1)
- goto e_inval;
- inet->mc_loop = !!val;
- break;
case IP_UNICAST_IF:
{
struct net_device *dev = NULL;
@@ -1214,7 +1216,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
struct ip_mreqn mreq;
err = -EPROTO;
- if (inet_sk(sk)->is_icsk)
+ if (inet_test_bit(IS_ICSK, sk))
break;
if (optlen < sizeof(struct ip_mreq))
@@ -1325,20 +1327,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
else
err = ip_set_mcast_msfilter(sk, optval, optlen);
break;
- case IP_MULTICAST_ALL:
- if (optlen < 1)
- goto e_inval;
- if (val != 0 && val != 1)
- goto e_inval;
- inet->mc_all = val;
- break;
-
- case IP_FREEBIND:
- if (optlen < 1)
- goto e_inval;
- inet->freebind = !!val;
- break;
-
case IP_IPSEC_POLICY:
case IP_XFRM_POLICY:
err = -EPERM;
@@ -1347,32 +1335,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
- case IP_TRANSPARENT:
- if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
- !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
- err = -EPERM;
- break;
- }
- if (optlen < 1)
- goto e_inval;
- inet->transparent = !!val;
- break;
-
- case IP_MINTTL:
- if (optlen < 1)
- goto e_inval;
- if (val < 0 || val > 255)
- goto e_inval;
-
- if (val)
- static_branch_enable(&ip4_min_ttl);
-
- /* tcp_v4_err() and tcp_v4_rcv() might read min_ttl
- * while we are changint it.
- */
- WRITE_ONCE(inet->min_ttl, val);
- break;
-
case IP_LOCAL_PORT_RANGE:
{
const __u16 lo = val;
@@ -1415,7 +1377,7 @@ e_inval:
void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
{
struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
- bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
+ bool prepare = inet_test_bit(PKTINFO, sk) ||
ipv6_sk_rxinfo(sk);
if (prepare && skb_rtable(skb)) {
@@ -1566,6 +1528,72 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (len < 0)
return -EINVAL;
+ /* Handle options that can be read without locking the socket. */
+ switch (optname) {
+ case IP_PKTINFO:
+ val = inet_test_bit(PKTINFO, sk);
+ goto copyval;
+ case IP_RECVTTL:
+ val = inet_test_bit(TTL, sk);
+ goto copyval;
+ case IP_RECVTOS:
+ val = inet_test_bit(TOS, sk);
+ goto copyval;
+ case IP_RECVOPTS:
+ val = inet_test_bit(RECVOPTS, sk);
+ goto copyval;
+ case IP_RETOPTS:
+ val = inet_test_bit(RETOPTS, sk);
+ goto copyval;
+ case IP_PASSSEC:
+ val = inet_test_bit(PASSSEC, sk);
+ goto copyval;
+ case IP_RECVORIGDSTADDR:
+ val = inet_test_bit(ORIGDSTADDR, sk);
+ goto copyval;
+ case IP_CHECKSUM:
+ val = inet_test_bit(CHECKSUM, sk);
+ goto copyval;
+ case IP_RECVFRAGSIZE:
+ val = inet_test_bit(RECVFRAGSIZE, sk);
+ goto copyval;
+ case IP_RECVERR:
+ val = inet_test_bit(RECVERR, sk);
+ goto copyval;
+ case IP_RECVERR_RFC4884:
+ val = inet_test_bit(RECVERR_RFC4884, sk);
+ goto copyval;
+ case IP_FREEBIND:
+ val = inet_test_bit(FREEBIND, sk);
+ goto copyval;
+ case IP_HDRINCL:
+ val = inet_test_bit(HDRINCL, sk);
+ goto copyval;
+ case IP_MULTICAST_LOOP:
+ val = inet_test_bit(MC_LOOP, sk);
+ goto copyval;
+ case IP_MULTICAST_ALL:
+ val = inet_test_bit(MC_ALL, sk);
+ goto copyval;
+ case IP_TRANSPARENT:
+ val = inet_test_bit(TRANSPARENT, sk);
+ goto copyval;
+ case IP_NODEFRAG:
+ val = inet_test_bit(NODEFRAG, sk);
+ goto copyval;
+ case IP_BIND_ADDRESS_NO_PORT:
+ val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
+ goto copyval;
+ case IP_TTL:
+ val = READ_ONCE(inet->uc_ttl);
+ if (val < 0)
+ val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl);
+ goto copyval;
+ case IP_MINTTL:
+ val = READ_ONCE(inet->min_ttl);
+ goto copyval;
+ }
+
if (needs_rtnl)
rtnl_lock();
sockopt_lock_sock(sk);
@@ -1600,53 +1628,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
- case IP_PKTINFO:
- val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
- break;
- case IP_RECVTTL:
- val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
- break;
- case IP_RECVTOS:
- val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
- break;
- case IP_RECVOPTS:
- val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
- break;
- case IP_RETOPTS:
- val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
- break;
- case IP_PASSSEC:
- val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
- break;
- case IP_RECVORIGDSTADDR:
- val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
- break;
- case IP_CHECKSUM:
- val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
- break;
- case IP_RECVFRAGSIZE:
- val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
- break;
case IP_TOS:
val = inet->tos;
break;
- case IP_TTL:
- {
- struct net *net = sock_net(sk);
- val = (inet->uc_ttl == -1 ?
- READ_ONCE(net->ipv4.sysctl_ip_default_ttl) :
- inet->uc_ttl);
- break;
- }
- case IP_HDRINCL:
- val = inet->hdrincl;
- break;
- case IP_NODEFRAG:
- val = inet->nodefrag;
- break;
- case IP_BIND_ADDRESS_NO_PORT:
- val = inet->bind_address_no_port;
- break;
case IP_MTU_DISCOVER:
val = inet->pmtudisc;
break;
@@ -1665,18 +1649,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
}
break;
}
- case IP_RECVERR:
- val = inet->recverr;
- break;
- case IP_RECVERR_RFC4884:
- val = inet->recverr_rfc4884;
- break;
case IP_MULTICAST_TTL:
val = inet->mc_ttl;
break;
- case IP_MULTICAST_LOOP:
- val = inet->mc_loop;
- break;
case IP_UNICAST_IF:
val = (__force int)htonl((__u32) inet->uc_index);
break;
@@ -1715,9 +1690,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
else
err = ip_get_mcast_msfilter(sk, optval, optlen, len);
goto out;
- case IP_MULTICAST_ALL:
- val = inet->mc_all;
- break;
case IP_PKTOPTIONS:
{
struct msghdr msg;
@@ -1737,7 +1709,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
msg.msg_controllen = len;
msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
- if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
+ if (inet_test_bit(PKTINFO, sk)) {
struct in_pktinfo info;
info.ipi_addr.s_addr = inet->inet_rcv_saddr;
@@ -1745,26 +1717,17 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
info.ipi_ifindex = inet->mc_index;
put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
}
- if (inet->cmsg_flags & IP_CMSG_TTL) {
+ if (inet_test_bit(TTL, sk)) {
int hlim = inet->mc_ttl;
put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
}
- if (inet->cmsg_flags & IP_CMSG_TOS) {
+ if (inet_test_bit(TOS, sk)) {
int tos = inet->rcv_tos;
put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
}
len -= msg.msg_controllen;
return copy_to_sockptr(optlen, &len, sizeof(int));
}
- case IP_FREEBIND:
- val = inet->freebind;
- break;
- case IP_TRANSPARENT:
- val = inet->transparent;
- break;
- case IP_MINTTL:
- val = inet->min_ttl;
- break;
case IP_LOCAL_PORT_RANGE:
val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
break;
@@ -1776,7 +1739,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -ENOPROTOOPT;
}
sockopt_release_sock(sk);
-
+copyval:
if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
unsigned char ucval = (unsigned char)val;
len = 1;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 92c02c886fe7..586b1b3e35b8 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -224,7 +224,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
.un.frag.__unused = 0,
.un.frag.mtu = htons(mtu),
};
- icmph->checksum = ip_compute_csum(icmph, len);
+ icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
skb_reset_transport_header(skb);
niph = skb_push(skb, sizeof(*niph));
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 53bfd8af6920..d1e7d0ceb7ed 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -287,12 +287,12 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
switch (skb->protocol) {
case htons(ETH_P_IP):
- xfrm_decode_session(skb, &fl, AF_INET);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ xfrm_decode_session(skb, &fl, AF_INET);
break;
case htons(ETH_P_IPV6):
- xfrm_decode_session(skb, &fl, AF_INET6);
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ xfrm_decode_session(skb, &fl, AF_INET6);
break;
default:
goto tx_err;
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index e61ea428ea18..265b39bc435b 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -7,6 +7,7 @@
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/module.h>
+#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <net/netns/generic.h>
#include <net/route.h>
@@ -65,7 +66,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
struct sock *sk = skb->sk;
if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
- inet_sk(sk)->nodefrag)
+ inet_test_bit(NODEFRAG, sk))
return NF_ACCEPT;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -113,17 +114,31 @@ static void __net_exit defrag4_net_exit(struct net *net)
}
}
+static const struct nf_defrag_hook defrag_hook = {
+ .owner = THIS_MODULE,
+ .enable = nf_defrag_ipv4_enable,
+ .disable = nf_defrag_ipv4_disable,
+};
+
static struct pernet_operations defrag4_net_ops = {
.exit = defrag4_net_exit,
};
static int __init nf_defrag_init(void)
{
- return register_pernet_subsys(&defrag4_net_ops);
+ int err;
+
+ err = register_pernet_subsys(&defrag4_net_ops);
+ if (err)
+ return err;
+
+ rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook);
+ return err;
}
static void __exit nf_defrag_fini(void)
{
+ rcu_assign_pointer(nf_defrag_v4_hook, NULL);
unregister_pernet_subsys(&defrag4_net_ops);
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index f95142e56da0..bbff68b5b5d4 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1152,41 +1152,64 @@ static bool ipv4_good_nh(const struct fib_nh *nh)
return !!(state & NUD_VALID);
}
-static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
+static bool nexthop_is_good_nh(const struct nexthop *nh)
+{
+ struct nh_info *nhi = rcu_dereference(nh->nh_info);
+
+ switch (nhi->family) {
+ case AF_INET:
+ return ipv4_good_nh(&nhi->fib_nh);
+ case AF_INET6:
+ return ipv6_good_nh(&nhi->fib6_nh);
+ }
+
+ return false;
+}
+
+static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
{
- struct nexthop *rc = NULL;
int i;
- for (i = 0; i < nhg->num_nh; ++i) {
+ for (i = 0; i < nhg->num_nh; i++) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
- struct nh_info *nhi;
if (hash > atomic_read(&nhge->hthr.upper_bound))
continue;
- nhi = rcu_dereference(nhge->nh->nh_info);
- if (nhi->fdb_nh)
- return nhge->nh;
+ return nhge->nh;
+ }
+
+ WARN_ON_ONCE(1);
+ return NULL;
+}
+
+static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
+{
+ struct nexthop *rc = NULL;
+ int i;
+
+ if (nhg->fdb_nh)
+ return nexthop_select_path_fdb(nhg, hash);
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
/* nexthops always check if it is good and does
* not rely on a sysctl for this behavior
*/
- switch (nhi->family) {
- case AF_INET:
- if (ipv4_good_nh(&nhi->fib_nh))
- return nhge->nh;
- break;
- case AF_INET6:
- if (ipv6_good_nh(&nhi->fib6_nh))
- return nhge->nh;
- break;
- }
+ if (!nexthop_is_good_nh(nhge->nh))
+ continue;
if (!rc)
rc = nhge->nh;
+
+ if (hash > atomic_read(&nhge->hthr.upper_bound))
+ continue;
+
+ return nhge->nh;
}
- return rc;
+ return rc ? : nhg->nh_entries[0].nh;
}
static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
@@ -3186,7 +3209,6 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb,
return err;
}
- ctx->idx++;
return 0;
}
@@ -3221,13 +3243,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
&rtm_dump_nexthop_cb, &filter);
if (err < 0) {
if (likely(skb->len))
- goto out;
- goto out_err;
+ err = skb->len;
}
-out:
- err = skb->len;
-out_err:
cb->seq = net->nexthop.seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
return err;
@@ -3318,7 +3336,6 @@ static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
struct rtm_dump_res_bucket_ctx {
struct rtm_dump_nh_ctx nh;
u16 bucket_index;
- u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */
};
static struct rtm_dump_res_bucket_ctx *
@@ -3347,9 +3364,6 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
u16 bucket_index;
int err;
- if (dd->ctx->nh.idx < dd->ctx->done_nh_idx)
- return 0;
-
nhg = rtnl_dereference(nh->nh_grp);
res_table = rtnl_dereference(nhg->res_table);
for (bucket_index = dd->ctx->bucket_index;
@@ -3367,25 +3381,18 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
dd->filter.res_bucket_nh_id != nhge->nh->id)
continue;
+ dd->ctx->bucket_index = bucket_index;
err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
RTM_NEWNEXTHOPBUCKET, portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->extack);
- if (err < 0) {
- if (likely(skb->len))
- goto out;
- goto out_err;
- }
+ if (err)
+ return err;
}
- dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1;
- bucket_index = 0;
+ dd->ctx->bucket_index = 0;
-out:
- err = skb->len;
-out_err:
- dd->ctx->bucket_index = bucket_index;
- return err;
+ return 0;
}
static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
@@ -3434,13 +3441,9 @@ static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
if (err < 0) {
if (likely(skb->len))
- goto out;
- goto out_err;
+ err = skb->len;
}
-out:
- err = skb->len;
-out_err:
cb->seq = net->nexthop.seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
return err;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 25dd78cee179..75e0aee35eb7 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -580,7 +580,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
* RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
- if ((family == AF_INET && !inet_sock->recverr) ||
+ if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) ||
(family == AF_INET6 && !inet6_sk(sk)->recverr)) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
@@ -894,7 +894,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
*addr_len = sizeof(*sin);
}
- if (isk->cmsg_flags)
+ if (inet_cmsg_flags(isk))
ip_cmsg_recv(msg, skb);
#if IS_ENABLED(CONFIG_IPV6)
@@ -921,7 +921,8 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
if (skb->protocol == htons(ETH_P_IPV6) &&
inet6_sk(sk)->rxopt.all)
pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
- else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags)
+ else if (skb->protocol == htons(ETH_P_IP) &&
+ inet_cmsg_flags(isk))
ip_cmsg_recv(msg, skb);
#endif
} else {
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 7782ff5e6539..4b5db5d1edc2 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -203,8 +203,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
struct inet_sock *inet = inet_sk(sk);
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
- int err = 0;
int harderr = 0;
+ bool recverr;
+ int err = 0;
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_sk_update_pmtu(skb, sk, info);
@@ -218,7 +219,8 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
2. Socket is connected (otherwise the error indication
is useless without ip_recverr and error is hard.
*/
- if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
+ recverr = inet_test_bit(RECVERR, sk);
+ if (!recverr && sk->sk_state != TCP_ESTABLISHED)
return;
switch (type) {
@@ -245,16 +247,16 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
}
}
- if (inet->recverr) {
+ if (recverr) {
const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 *payload = skb->data + (iph->ihl << 2);
- if (inet->hdrincl)
+ if (inet_test_bit(HDRINCL, sk))
payload = skb->data;
ip_icmp_error(sk, skb, err, 0, info, payload);
}
- if (inet->recverr || harderr) {
+ if (recverr || harderr) {
sk->sk_err = err;
sk_error_report(sk);
}
@@ -348,7 +350,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
goto error;
skb_reserve(skb, hlen);
- skb->priority = sk->sk_priority;
+ skb->priority = READ_ONCE(sk->sk_priority);
skb->mark = sockc->mark;
skb->tstamp = sockc->transmit_time;
skb_dst_set(skb, &rt->dst);
@@ -413,7 +415,7 @@ error_free:
kfree_skb(skb);
error:
IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
- if (err == -ENOBUFS && !inet->recverr)
+ if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk))
err = 0;
return err;
}
@@ -489,12 +491,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (len > 0xFFFF)
goto out;
- /* hdrincl should be READ_ONCE(inet->hdrincl)
- * but READ_ONCE() doesn't work with bit fields.
- * Doing this indirectly yields the same result.
- */
- hdrincl = inet->hdrincl;
- hdrincl = READ_ONCE(hdrincl);
+ hdrincl = inet_test_bit(HDRINCL, sk);
+
/*
* Check the flags.
*/
@@ -645,7 +643,7 @@ back_from_confirm:
ip_flush_pending_frames(sk);
else if (!(msg->msg_flags & MSG_MORE)) {
err = ip_push_pending_frames(sk, &fl4);
- if (err == -ENOBUFS && !inet->recverr)
+ if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk))
err = 0;
}
release_sock(sk);
@@ -767,7 +765,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
}
- if (inet->cmsg_flags)
+ if (inet_cmsg_flags(inet))
ip_cmsg_recv(msg, skb);
if (flags & MSG_TRUNC)
copied = skb->len;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e7e9fba0357a..d8c99bdc6170 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -515,13 +515,12 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
__u8 scope = RT_SCOPE_UNIVERSE;
if (sk) {
- const struct inet_sock *inet = inet_sk(sk);
-
oif = sk->sk_bound_dev_if;
- mark = sk->sk_mark;
+ mark = READ_ONCE(sk->sk_mark);
tos = ip_sock_rt_tos(sk);
scope = ip_sock_rt_scope(sk);
- prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
+ prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
+ sk->sk_protocol;
}
flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
@@ -552,10 +551,11 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
- flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
ip_sock_rt_scope(sk),
- inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ inet_test_bit(HDRINCL, sk) ?
+ IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
rcu_read_unlock();
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e03e08745308..b1559481898d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk)
WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
+ tcp_scaling_ratio_init(sk);
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
sk_sockets_allocated_inc(sk);
@@ -582,7 +583,8 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (urg_data & TCP_URG_VALID)
mask |= EPOLLPRI;
- } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ } else if (state == TCP_SYN_SENT &&
+ inet_test_bit(DEFER_CONNECT, sk)) {
/* Active TCP fastopen socket with defer_connect
* Return EPOLLOUT so application can call write()
* in order for kernel to generate SYN+data
@@ -1006,7 +1008,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
tp->fastopen_req->size = size;
tp->fastopen_req->uarg = uarg;
- if (inet->defer_connect) {
+ if (inet_test_bit(DEFER_CONNECT, sk)) {
err = tcp_connect(sk);
/* Same failure procedure as in tcp_v4/6_connect */
if (err) {
@@ -1024,7 +1026,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
if (tp->fastopen_req) {
*copied = tp->fastopen_req->copied;
tcp_free_fastopen_req(tp);
- inet->defer_connect = 0;
+ inet_clear_bit(DEFER_CONNECT, sk);
}
return err;
}
@@ -1065,7 +1067,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
zc = MSG_SPLICE_PAGES;
}
- if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
+ if (unlikely(flags & MSG_FASTOPEN ||
+ inet_test_bit(DEFER_CONNECT, sk)) &&
!tp->repair) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
if (err == -EINPROGRESS && copied_syn > 0)
@@ -1700,7 +1703,7 @@ EXPORT_SYMBOL(tcp_peek_len);
/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
- int cap;
+ int space, cap;
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
cap = sk->sk_rcvbuf >> 1;
@@ -1715,10 +1718,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
return 0;
- val <<= 1;
- if (val > sk->sk_rcvbuf) {
- WRITE_ONCE(sk->sk_rcvbuf, val);
- tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
+ space = tcp_space_from_win(sk, val);
+ if (space > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, space);
+ tcp_sk(sk)->window_clamp = val;
}
return 0;
}
@@ -1739,7 +1742,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb,
}
#ifdef CONFIG_MMU
-const struct vm_operations_struct tcp_vm_ops = {
+static const struct vm_operations_struct tcp_vm_ops = {
};
int tcp_mmap(struct file *file, struct socket *sock,
@@ -2042,13 +2045,10 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
unsigned long address,
bool *mmap_locked)
{
- struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
-#ifdef CONFIG_PER_VMA_LOCK
- vma = lock_vma_under_rcu(mm, address);
-#endif
if (vma) {
- if (!vma_is_tcp(vma)) {
+ if (vma->vm_ops != &tcp_vm_ops) {
vma_end_read(vma);
return NULL;
}
@@ -2058,7 +2058,7 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
mmap_read_lock(mm);
vma = vma_lookup(mm, address);
- if (!vma || !vma_is_tcp(vma)) {
+ if (!vma || vma->vm_ops != &tcp_vm_ops) {
mmap_read_unlock(mm);
return NULL;
}
@@ -2864,7 +2864,7 @@ adjudge_to_death:
if (sk->sk_state == TCP_FIN_WAIT2) {
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->linger2 < 0) {
+ if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
__NET_INC_STATS(sock_net(sk),
@@ -3087,7 +3087,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
- inet->defer_connect = 0;
+ inet_clear_bit(DEFER_CONNECT, sk);
tp->fastopen_client_fail = 0;
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
@@ -3290,18 +3290,21 @@ int tcp_sock_set_syncnt(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_SYNCNT)
return -EINVAL;
- lock_sock(sk);
- inet_csk(sk)->icsk_syn_retries = val;
- release_sock(sk);
+ WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_syncnt);
-void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
+int tcp_sock_set_user_timeout(struct sock *sk, int val)
{
- lock_sock(sk);
- inet_csk(sk)->icsk_user_timeout = val;
- release_sock(sk);
+ /* Cap the max time in ms TCP will retry or probe the window
+ * before giving up and aborting (ETIMEDOUT) a connection.
+ */
+ if (val < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
+ return 0;
}
EXPORT_SYMBOL(tcp_sock_set_user_timeout);
@@ -3312,7 +3315,8 @@ int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPIDLE)
return -EINVAL;
- tp->keepalive_time = val * HZ;
+ /* Paired with WRITE_ONCE() in keepalive_time_when() */
+ WRITE_ONCE(tp->keepalive_time, val * HZ);
if (sock_flag(sk, SOCK_KEEPOPEN) &&
!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
u32 elapsed = keepalive_time_elapsed(tp);
@@ -3343,9 +3347,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPINTVL)
return -EINVAL;
- lock_sock(sk);
- tcp_sk(sk)->keepalive_intvl = val * HZ;
- release_sock(sk);
+ WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_keepintvl);
@@ -3355,9 +3357,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPCNT)
return -EINVAL;
- lock_sock(sk);
- tcp_sk(sk)->keepalive_probes = val;
- release_sock(sk);
+ /* Paired with READ_ONCE() in keepalive_probes() */
+ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_keepcnt);
@@ -3459,6 +3460,32 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
+ /* Handle options that can be set without locking the socket. */
+ switch (optname) {
+ case TCP_SYNCNT:
+ return tcp_sock_set_syncnt(sk, val);
+ case TCP_USER_TIMEOUT:
+ return tcp_sock_set_user_timeout(sk, val);
+ case TCP_KEEPINTVL:
+ return tcp_sock_set_keepintvl(sk, val);
+ case TCP_KEEPCNT:
+ return tcp_sock_set_keepcnt(sk, val);
+ case TCP_LINGER2:
+ if (val < 0)
+ WRITE_ONCE(tp->linger2, -1);
+ else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+ WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
+ else
+ WRITE_ONCE(tp->linger2, val * HZ);
+ return 0;
+ case TCP_DEFER_ACCEPT:
+ /* Translate value in seconds to number of retransmits */
+ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ));
+ return 0;
+ }
+
sockopt_lock_sock(sk);
switch (optname) {
@@ -3554,25 +3581,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
case TCP_KEEPIDLE:
err = tcp_sock_set_keepidle_locked(sk, val);
break;
- case TCP_KEEPINTVL:
- if (val < 1 || val > MAX_TCP_KEEPINTVL)
- err = -EINVAL;
- else
- tp->keepalive_intvl = val * HZ;
- break;
- case TCP_KEEPCNT:
- if (val < 1 || val > MAX_TCP_KEEPCNT)
- err = -EINVAL;
- else
- tp->keepalive_probes = val;
- break;
- case TCP_SYNCNT:
- if (val < 1 || val > MAX_TCP_SYNCNT)
- err = -EINVAL;
- else
- icsk->icsk_syn_retries = val;
- break;
-
case TCP_SAVE_SYN:
/* 0: disable, 1: enable, 2: start from ether_header */
if (val < 0 || val > 2)
@@ -3581,22 +3589,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
tp->save_syn = val;
break;
- case TCP_LINGER2:
- if (val < 0)
- tp->linger2 = -1;
- else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
- tp->linger2 = TCP_FIN_TIMEOUT_MAX;
- else
- tp->linger2 = val * HZ;
- break;
-
- case TCP_DEFER_ACCEPT:
- /* Translate value in seconds to number of retransmits */
- icsk->icsk_accept_queue.rskq_defer_accept =
- secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
- TCP_RTO_MAX / HZ);
- break;
-
case TCP_WINDOW_CLAMP:
err = tcp_set_window_clamp(sk, val);
break;
@@ -3611,16 +3603,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
- case TCP_USER_TIMEOUT:
- /* Cap the max time in ms TCP will retry or probe the window
- * before giving up and aborting (ETIMEDOUT) a connection.
- */
- if (val < 0)
- err = -EINVAL;
- else
- icsk->icsk_user_timeout = val;
- break;
-
case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN))) {
@@ -3656,13 +3638,13 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
if (!tp->repair)
err = -EPERM;
else
- tp->tsoffset = val - tcp_time_stamp_raw();
+ WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw());
break;
case TCP_REPAIR_WINDOW:
err = tcp_repair_set_window(tp, optval, optlen);
break;
case TCP_NOTSENT_LOWAT:
- tp->notsent_lowat = val;
+ WRITE_ONCE(tp->notsent_lowat, val);
sk->sk_write_space(sk);
break;
case TCP_INQ:
@@ -3674,7 +3656,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
case TCP_TX_DELAY:
if (val)
tcp_enable_tx_delay();
- tp->tcp_tx_delay = val;
+ WRITE_ONCE(tp->tcp_tx_delay, val);
break;
default:
err = -ENOPROTOOPT;
@@ -3991,17 +3973,18 @@ int do_tcp_getsockopt(struct sock *sk, int level,
val = keepalive_probes(tp);
break;
case TCP_SYNCNT:
- val = icsk->icsk_syn_retries ? :
+ val = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
break;
case TCP_LINGER2:
- val = tp->linger2;
+ val = READ_ONCE(tp->linger2);
if (val >= 0)
val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
- TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
+ val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
+ val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ);
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
@@ -4138,11 +4121,11 @@ int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_USER_TIMEOUT:
- val = icsk->icsk_user_timeout;
+ val = READ_ONCE(icsk->icsk_user_timeout);
break;
case TCP_FASTOPEN:
- val = icsk->icsk_accept_queue.fastopenq.max_qlen;
+ val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
break;
case TCP_FASTOPEN_CONNECT:
@@ -4154,14 +4137,14 @@ int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_TX_DELAY:
- val = tp->tcp_tx_delay;
+ val = READ_ONCE(tp->tcp_tx_delay);
break;
case TCP_TIMESTAMP:
- val = tcp_time_stamp_raw() + tp->tsoffset;
+ val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset);
break;
case TCP_NOTSENT_LOWAT:
- val = tp->notsent_lowat;
+ val = READ_ONCE(tp->notsent_lowat);
break;
case TCP_INQ:
val = tp->recvmsg_inq;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 45cc7f1ca296..8ed54e7334a9 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -296,6 +296,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
static bool tcp_fastopen_queue_check(struct sock *sk)
{
struct fastopen_queue *fastopenq;
+ int max_qlen;
/* Make sure the listener has enabled fastopen, and we don't
* exceed the max # of pending TFO requests allowed before trying
@@ -308,10 +309,11 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* temporarily vs a server not supporting Fast Open at all.
*/
fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
- if (fastopenq->max_qlen == 0)
+ max_qlen = READ_ONCE(fastopenq->max_qlen);
+ if (max_qlen == 0)
return false;
- if (fastopenq->qlen >= fastopenq->max_qlen) {
+ if (fastopenq->qlen >= max_qlen) {
struct request_sock *req1;
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
@@ -449,7 +451,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
if (tp->fastopen_connect && !tp->fastopen_req) {
if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
- inet_sk(sk)->defer_connect = 1;
+ inet_set_bit(DEFER_CONNECT, sk);
return true;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 57c8af1859c1..06fe1cf645d5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
+ /* Note: divides are still a bit expensive.
+ * For the moment, only adjust scaling_ratio
+ * when we update icsk_ack.rcv_mss.
+ */
+ if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
+ u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+
+ do_div(val, skb->truesize);
+ tcp_sk(sk)->scaling_ratio = val ? val : 1;
+ }
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
/* Account for possibly-removed options */
@@ -287,7 +297,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
icsk->icsk_ack.quick = quickacks;
}
-void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -295,7 +305,6 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
-EXPORT_SYMBOL(tcp_enter_quickack_mode);
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
@@ -727,8 +736,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvmem, rcvbuf;
u64 rcvwin, grow;
+ int rcvbuf;
/* minimal window to cope with packet losses, assuming
* steady state. Add some cushion because of small variations.
@@ -740,12 +749,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
do_div(grow, tp->rcvq_space.space);
rcvwin += (grow << 1);
- rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
- while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
- rcvmem += 128;
-
- do_div(rcvwin, tp->advmss);
- rcvbuf = min_t(u64, rcvwin * rcvmem,
+ rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
@@ -3521,7 +3525,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
{
return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
- (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
+ (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin));
}
/* If we update tp->snd_una, also update tp->bytes_acked */
@@ -4122,9 +4126,8 @@ void tcp_parse_options(const struct net *net,
break;
#ifdef CONFIG_TCP_MD5SIG
case TCPOPT_MD5SIG:
- /*
- * The MD5 Hash has already been
- * checked (see tcp_v{4,6}_do_rcv()).
+ /* The MD5 Hash has already been
+ * checked (see tcp_v{4,6}_rcv()).
*/
break;
#endif
@@ -4308,10 +4311,16 @@ static inline bool tcp_paws_discard(const struct sock *sk,
* (borrowed from freebsd)
*/
-static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
+static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
+ u32 seq, u32 end_seq)
{
- return !before(end_seq, tp->rcv_wup) &&
- !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
+ if (before(end_seq, tp->rcv_wup))
+ return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
+
+ if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+
+ return SKB_NOT_DROPPED_YET;
}
/* When we get a reset we do this. */
@@ -5050,13 +5059,19 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
/* Ok. In sequence. In window. */
queue_and_out:
- if (skb_queue_len(&sk->sk_receive_queue) == 0)
- sk_forced_mem_schedule(sk, skb->truesize);
- else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
- reason = SKB_DROP_REASON_PROTO_MEM;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ /* TODO: maybe ratelimit these WIN 0 ACK ? */
+ inet_csk(sk)->icsk_ack.pending |=
+ (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
+ inet_csk_schedule_ack(sk);
sk->sk_data_ready(sk);
- goto drop;
+
+ if (skb_queue_len(&sk->sk_receive_queue)) {
+ reason = SKB_DROP_REASON_PROTO_MEM;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+ goto drop;
+ }
+ sk_forced_mem_schedule(sk, skb->truesize);
}
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
@@ -5734,7 +5749,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
}
/* Step 1: check sequence number */
- if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+ reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ if (reason) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields."
* And page 69: "If an incoming segment is not acceptable,
@@ -5751,7 +5767,6 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
} else if (tcp_reset_check(sk, skb)) {
goto reset;
}
- SKB_DR_SET(reason, TCP_INVALID_SEQUENCE);
goto discard;
}
@@ -6315,7 +6330,7 @@ consume:
if (fastopen_fail)
return -1;
if (sk->sk_write_pending ||
- icsk->icsk_accept_queue.rskq_defer_accept ||
+ READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
inet_csk_in_pingpong_mode(sk)) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
@@ -6615,7 +6630,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
}
- if (tp->linger2 < 0) {
+ if (READ_ONCE(tp->linger2) < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
@@ -6985,7 +7000,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
- inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
+ inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk);
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fd365de4d5ff..27140e5cdc06 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -57,6 +57,7 @@
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
+#include <linux/sched.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
@@ -307,11 +308,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr,
inet->inet_sport,
usin->sin_port));
- tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
- inet->inet_daddr);
+ WRITE_ONCE(tp->tsoffset,
+ secure_tcp_ts_off(net, inet->inet_saddr,
+ inet->inet_daddr));
}
- inet->inet_id = get_random_u16();
+ atomic_set(&inet->inet_id, get_random_u16());
if (tcp_fastopen_defer_connect(sk, &err))
return err;
@@ -475,7 +477,6 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
struct tcp_sock *tp;
- struct inet_sock *inet;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct sock *sk;
@@ -623,8 +624,8 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
* --ANK (980905)
*/
- inet = inet_sk(sk);
- if (!sock_owned_by_user(sk) && inet->recverr) {
+ if (!sock_owned_by_user(sk) &&
+ inet_test_bit(RECVERR, sk)) {
WRITE_ONCE(sk->sk_err, err);
sk_error_report(sk);
} else { /* Only an error on timeout */
@@ -930,9 +931,9 @@ static void tcp_v4_send_ack(const struct sock *sk,
ctl_sk = this_cpu_read(ipv4_tcp_sk);
sock_net_set(ctl_sk, net);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_priority : sk->sk_priority;
+ inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
transmit_time = tcp_transmit_time(sk);
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -988,11 +989,12 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
- req->ts_recent,
+ READ_ONCE(req->ts_recent),
0,
tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
- ip_hdr(skb)->tos, tcp_rsk(req)->txhash);
+ ip_hdr(skb)->tos,
+ READ_ONCE(tcp_rsk(req)->txhash));
}
/*
@@ -1594,7 +1596,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
- newinet->inet_id = get_random_u16();
+ atomic_set(&newinet->inet_id, get_random_u16());
/* Set ToS of the new socket based upon the value of incoming SYN.
* ECT bits are set later in tcp_init_transfer().
@@ -2446,6 +2448,8 @@ static void *established_get_first(struct seq_file *seq)
struct hlist_nulls_node *node;
spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
+ cond_resched();
+
/* Lockless fast path for the common case of empty buckets */
if (empty_bucket(hinfo, st))
continue;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 82f4575f9cd9..c196759f1d3b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -40,7 +40,7 @@ struct tcp_fastopen_metrics {
struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next;
- possible_net_t tcpm_net;
+ struct net *tcpm_net;
struct inetpeer_addr tcpm_saddr;
struct inetpeer_addr tcpm_daddr;
unsigned long tcpm_stamp;
@@ -51,34 +51,38 @@ struct tcp_metrics_block {
struct rcu_head rcu_head;
};
-static inline struct net *tm_net(struct tcp_metrics_block *tm)
+static inline struct net *tm_net(const struct tcp_metrics_block *tm)
{
- return read_pnet(&tm->tcpm_net);
+ /* Paired with the WRITE_ONCE() in tcpm_new() */
+ return READ_ONCE(tm->tcpm_net);
}
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
- return tm->tcpm_lock & (1 << idx);
+ /* Paired with WRITE_ONCE() in tcpm_suck_dst() */
+ return READ_ONCE(tm->tcpm_lock) & (1 << idx);
}
-static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+static u32 tcp_metric_get(const struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
- return tm->tcpm_vals[idx];
+ /* Paired with WRITE_ONCE() in tcp_metric_set() */
+ return READ_ONCE(tm->tcpm_vals[idx]);
}
static void tcp_metric_set(struct tcp_metrics_block *tm,
enum tcp_metric_index idx,
u32 val)
{
- tm->tcpm_vals[idx] = val;
+ /* Paired with READ_ONCE() in tcp_metric_get() */
+ WRITE_ONCE(tm->tcpm_vals[idx], val);
}
static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b)
{
- return inetpeer_addr_cmp(a, b) == 0;
+ return (a->family == b->family) && !inetpeer_addr_cmp(a, b);
}
struct tcpm_hash_bucket {
@@ -89,6 +93,7 @@ static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
static unsigned int tcp_metrics_hash_log __read_mostly;
static DEFINE_SPINLOCK(tcp_metrics_lock);
+static DEFINE_SEQLOCK(fastopen_seqlock);
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
const struct dst_entry *dst,
@@ -97,7 +102,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
u32 msval;
u32 val;
- tm->tcpm_stamp = jiffies;
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
val = 0;
if (dst_metric_locked(dst, RTAX_RTT))
@@ -110,30 +115,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
val |= 1 << TCP_METRIC_CWND;
if (dst_metric_locked(dst, RTAX_REORDERING))
val |= 1 << TCP_METRIC_REORDERING;
- tm->tcpm_lock = val;
+ /* Paired with READ_ONCE() in tcp_metric_locked() */
+ WRITE_ONCE(tm->tcpm_lock, val);
msval = dst_metric_raw(dst, RTAX_RTT);
- tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
+ tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC);
msval = dst_metric_raw(dst, RTAX_RTTVAR);
- tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
- tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
- tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
- tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC);
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ dst_metric_raw(dst, RTAX_SSTHRESH));
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ dst_metric_raw(dst, RTAX_CWND));
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ dst_metric_raw(dst, RTAX_REORDERING));
if (fastopen_clear) {
+ write_seqlock(&fastopen_seqlock);
tm->tcpm_fastopen.mss = 0;
tm->tcpm_fastopen.syn_loss = 0;
tm->tcpm_fastopen.try_exp = 0;
tm->tcpm_fastopen.cookie.exp = false;
tm->tcpm_fastopen.cookie.len = 0;
+ write_sequnlock(&fastopen_seqlock);
}
}
#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
-static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+static void tcpm_check_stamp(struct tcp_metrics_block *tm,
+ const struct dst_entry *dst)
{
- if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+ unsigned long limit;
+
+ if (!tm)
+ return;
+ limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT;
+ if (unlikely(time_after(jiffies, limit)))
tcpm_suck_dst(tm, dst, false);
}
@@ -174,20 +191,23 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
oldest = deref_locked(tcp_metrics_hash[hash].chain);
for (tm = deref_locked(oldest->tcpm_next); tm;
tm = deref_locked(tm->tcpm_next)) {
- if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+ if (time_before(READ_ONCE(tm->tcpm_stamp),
+ READ_ONCE(oldest->tcpm_stamp)))
oldest = tm;
}
tm = oldest;
} else {
- tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+ tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
if (!tm)
goto out_unlock;
}
- write_pnet(&tm->tcpm_net, net);
+ /* Paired with the READ_ONCE() in tm_net() */
+ WRITE_ONCE(tm->tcpm_net, net);
+
tm->tcpm_saddr = *saddr;
tm->tcpm_daddr = *daddr;
- tcpm_suck_dst(tm, dst, true);
+ tcpm_suck_dst(tm, dst, reclaim);
if (likely(!reclaim)) {
tm->tcpm_next = tcp_metrics_hash[hash].chain;
@@ -434,7 +454,7 @@ void tcp_update_metrics(struct sock *sk)
tp->reordering);
}
}
- tm->tcpm_stamp = jiffies;
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
out_unlock:
rcu_read_unlock();
}
@@ -539,8 +559,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
return ret;
}
-static DEFINE_SEQLOCK(fastopen_seqlock);
-
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie)
{
@@ -647,7 +665,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
}
if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
- jiffies - tm->tcpm_stamp,
+ jiffies - READ_ONCE(tm->tcpm_stamp),
TCP_METRICS_ATTR_PAD) < 0)
goto nla_put_failure;
@@ -658,7 +676,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
if (!nest)
goto nla_put_failure;
for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
- u32 val = tm->tcpm_vals[i];
+ u32 val = tcp_metric_get(tm, i);
if (!val)
continue;
@@ -972,7 +990,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = {
.resv_start_op = TCP_METRICS_CMD_DEL + 1,
};
-static unsigned int tcpmhash_entries;
+static unsigned int tcpmhash_entries __initdata;
static int __init set_tcpmhash_entries(char *str)
{
ssize_t ret;
@@ -988,15 +1006,11 @@ static int __init set_tcpmhash_entries(char *str)
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);
-static int __net_init tcp_net_metrics_init(struct net *net)
+static void __init tcp_metrics_hash_alloc(void)
{
+ unsigned int slots = tcpmhash_entries;
size_t size;
- unsigned int slots;
- if (!net_eq(net, &init_net))
- return 0;
-
- slots = tcpmhash_entries;
if (!slots) {
if (totalram_pages() >= 128 * 1024)
slots = 16 * 1024;
@@ -1009,9 +1023,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)
tcp_metrics_hash = kvzalloc(size, GFP_KERNEL);
if (!tcp_metrics_hash)
- return -ENOMEM;
-
- return 0;
+ panic("Could not allocate the tcp_metrics hash table\n");
}
static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list)
@@ -1020,7 +1032,6 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis
}
static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
- .init = tcp_net_metrics_init,
.exit_batch = tcp_net_metrics_exit_batch,
};
@@ -1028,9 +1039,11 @@ void __init tcp_metrics_init(void)
{
int ret;
+ tcp_metrics_hash_alloc();
+
ret = register_pernet_subsys(&tcp_net_metrics_ops);
if (ret < 0)
- panic("Could not allocate the tcp_metrics hash table\n");
+ panic("Could not register tcp_net_metrics_ops\n");
ret = genl_register_family(&tcp_metrics_nl_family);
if (ret < 0)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 04fc328727e6..b98d476f1594 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -289,9 +289,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
- struct inet_sock *inet = inet_sk(sk);
- tw->tw_transparent = inet->transparent;
+ tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
tw->tw_mark = sk->sk_mark;
tw->tw_priority = sk->sk_priority;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -528,7 +527,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
newtp->lsndtime = tcp_jiffies32;
- newsk->sk_txhash = treq->txhash;
+ newsk->sk_txhash = READ_ONCE(treq->txhash);
newtp->total_retrans = req->num_retrans;
tcp_init_xmit_timers(newsk);
@@ -555,7 +554,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->max_window = newtp->snd_wnd;
if (newtp->rx_opt.tstamp_ok) {
- newtp->rx_opt.ts_recent = req->ts_recent;
+ newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
} else {
@@ -570,8 +569,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
- if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req)))
- newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
@@ -619,7 +616,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
- tmp_opt.ts_recent = req->ts_recent;
+ tmp_opt.ts_recent = READ_ONCE(req->ts_recent);
if (tmp_opt.rcv_tsecr)
tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
/* We do not store true stamp, but it is not required,
@@ -758,8 +755,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* In sequence, PAWS is OK. */
+ /* TODO: We probably should defer ts_recent change once
+ * we take ownership of @req.
+ */
if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
- req->ts_recent = tmp_opt.rcv_tsval;
+ WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval);
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
/* Truncate SYN, it is out of window starting
@@ -791,7 +791,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
return sk;
/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
- if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2cb39b6dad02..e6b4fbd642f7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -257,11 +257,19 @@ EXPORT_SYMBOL(tcp_select_initial_window);
static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 old_win = tp->rcv_wnd;
- u32 cur_win = tcp_receive_window(tp);
- u32 new_win = __tcp_select_window(sk);
struct net *net = sock_net(sk);
+ u32 old_win = tp->rcv_wnd;
+ u32 cur_win, new_win;
+
+ /* Make the window 0 if we failed to queue the data because we
+ * are out of memory. The window is temporary, so we don't store
+ * it on the socket.
+ */
+ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
+ return 0;
+ cur_win = tcp_receive_window(tp);
+ new_win = __tcp_select_window(sk);
if (new_win < cur_win) {
/* Danger Will Robinson!
* Don't update rcv_wup/rcv_wnd here or else
@@ -878,7 +886,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
- opts->tsecr = req->ts_recent;
+ opts->tsecr = READ_ONCE(req->ts_recent);
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(ireq->sack_ok)) {
@@ -1293,14 +1301,21 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
}
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
- /* if no packet is in qdisc/device queue, then allow XPS to select
- * another queue. We can be called from tcp_tsq_handler()
- * which holds one reference to sk.
- *
- * TODO: Ideally, in-flight pure ACK packets should not matter here.
- * One way to get this would be to set skb->truesize = 2 on them.
+ /* We set skb->ooo_okay to one if this packet can select
+ * a different TX queue than prior packets of this flow,
+ * to avoid self inflicted reorders.
+ * The 'other' queue decision is based on current cpu number
+ * if XPS is enabled, or sk->sk_txhash otherwise.
+ * We can switch to another (and better) queue if:
+ * 1) No packet with payload is in qdisc/device queues.
+ * Delays in TX completion can defeat the test
+ * even if packets were already sent.
+ * 2) Or rtx queue is empty.
+ * This mitigates above case if ACK packets for
+ * all prior packets were already processed.
*/
- skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
+ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) ||
+ tcp_rtx_queue_empty(sk);
/* If we had to use memory reserve to allocate this skb,
* this might cause drops if packet is looped back :
@@ -3660,7 +3675,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_lock();
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
- skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4);
/* bpf program will be interested in the tcp_flags */
TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
@@ -3741,11 +3756,6 @@ static void tcp_connect_init(struct sock *sk)
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
-#ifdef CONFIG_TCP_MD5SIG
- if (tp->af_specific->md5_lookup(sk, sk))
- tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
-#endif
-
/* If user gave his TCP_MAXSEG, record it to clamp */
if (tp->rx_opt.user_mss)
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
@@ -4210,7 +4220,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
/* Paired with WRITE_ONCE() in sock_setsockopt() */
if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
- tcp_rsk(req)->txhash = net_tx_rndhash();
+ WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
NULL);
if (!res) {
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 470f581eedd4..984ab4a0421e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -26,14 +26,15 @@
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- u32 elapsed, start_ts;
+ u32 elapsed, start_ts, user_timeout;
s32 remaining;
start_ts = tcp_sk(sk)->retrans_stamp;
- if (!icsk->icsk_user_timeout)
+ user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (!user_timeout)
return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
- remaining = icsk->icsk_user_timeout - elapsed;
+ remaining = user_timeout - elapsed;
if (remaining <= 0)
return 1; /* user timeout has passed; fire ASAP */
@@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- u32 remaining;
+ u32 remaining, user_timeout;
s32 elapsed;
- if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
+ user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (!user_timeout || !icsk->icsk_probes_tstamp)
return when;
elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
if (unlikely(elapsed < 0))
elapsed = 0;
- remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
+ remaining = msecs_to_jiffies(user_timeout) - elapsed;
remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
return min_t(u32, remaining, when);
@@ -239,7 +241,8 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
__dst_negative_advice(sk);
- retry_until = icsk->icsk_syn_retries ? :
+ /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
+ retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
max_retransmits = retry_until;
@@ -269,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk)
}
if (!expired)
expired = retransmits_timed_out(sk, retry_until,
- icsk->icsk_user_timeout);
+ READ_ONCE(icsk->icsk_user_timeout));
tcp_fastopen_active_detect_blackhole(sk, expired);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
@@ -383,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
- if (!icsk->icsk_probes_tstamp)
+ if (!icsk->icsk_probes_tstamp) {
icsk->icsk_probes_tstamp = tcp_jiffies32;
- else if (icsk->icsk_user_timeout &&
- (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
- msecs_to_jiffies(icsk->icsk_user_timeout))
- goto abort;
+ } else {
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (user_timeout &&
+ (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+ msecs_to_jiffies(user_timeout))
+ goto abort;
+ }
max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
@@ -421,8 +427,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
req->rsk_ops->syn_ack_timeout(req);
- /* add one more retry for fastopen */
- max_retries = icsk->icsk_syn_retries ? :
+ /* Add one more retry for fastopen.
+ * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
+ */
+ max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
if (req->num_timeout >= max_retries) {
@@ -446,6 +454,22 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
req->timeout << req->num_timeout, TCP_RTO_MAX);
}
+static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const int timeout = TCP_RTO_MAX * 2;
+ u32 rcv_delta, rtx_delta;
+
+ rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
+ if (rcv_delta <= timeout)
+ return false;
+
+ rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
+ (tp->retrans_stamp ?: tcp_skb_timestamp(skb)));
+
+ return rtx_delta > timeout;
+}
/**
* tcp_retransmit_timer() - The TCP retransmit timeout handler
@@ -495,23 +519,26 @@ void tcp_retransmit_timer(struct sock *sk)
* we cannot allow such beasts to hang infinitely.
*/
struct inet_sock *inet = inet_sk(sk);
+ u32 rtx_delta;
+
+ rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb));
if (sk->sk_family == AF_INET) {
- net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &inet->inet_daddr,
- ntohs(inet->inet_dport),
- inet->inet_num,
- tp->snd_una, tp->snd_nxt);
+ net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ inet->inet_num, tp->snd_una, tp->snd_nxt,
+ jiffies_to_msecs(jiffies - tp->rcv_tstamp),
+ rtx_delta);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &sk->sk_v6_daddr,
- ntohs(inet->inet_dport),
- inet->inet_num,
- tp->snd_una, tp->snd_nxt);
+ net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
+ &sk->sk_v6_daddr, ntohs(inet->inet_dport),
+ inet->inet_num, tp->snd_una, tp->snd_nxt,
+ jiffies_to_msecs(jiffies - tp->rcv_tstamp),
+ rtx_delta);
}
#endif
- if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) {
+ if (tcp_rtx_probe0_timed_out(sk, skb)) {
tcp_write_err(sk);
goto out;
}
@@ -591,7 +618,9 @@ out_reset_timer:
tcp_stream_is_thin(tp) &&
icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
icsk->icsk_backoff = 0;
- icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ icsk->icsk_rto = clamp(__tcp_set_rto(tp),
+ tcp_rto_min(sk),
+ TCP_RTO_MAX);
} else if (sk->sk_state != TCP_SYN_SENT ||
icsk->icsk_backoff >
READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
@@ -706,7 +735,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
tcp_mstamp_refresh(tp);
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
- if (tp->linger2 >= 0) {
+ if (READ_ONCE(tp->linger2) >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
@@ -731,13 +760,15 @@ static void tcp_keepalive_timer (struct timer_list *t)
elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
/* If the TCP_USER_TIMEOUT option is enabled, use that
* to determine when to timeout instead.
*/
- if ((icsk->icsk_user_timeout != 0 &&
- elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
+ if ((user_timeout != 0 &&
+ elapsed >= msecs_to_jiffies(user_timeout) &&
icsk->icsk_probes_out > 0) ||
- (icsk->icsk_user_timeout == 0 &&
+ (user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 42a96b3547c9..0794a2c46a56 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -114,6 +114,7 @@
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
#include <net/udp_tunnel.h>
+#include <net/gro.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6_stubs.h>
#endif
@@ -406,9 +407,9 @@ static int compute_score(struct sock *sk, struct net *net,
return score;
}
-static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
+INDIRECT_CALLABLE_SCOPE
+u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
+ const __be32 faddr, const __be16 fport)
{
static u32 udp_ehash_secret __read_mostly;
@@ -418,22 +419,6 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
udp_ehash_secret + net_hash_mix(net));
}
-static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
- struct sk_buff *skb,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum)
-{
- struct sock *reuse_sk = NULL;
- u32 hash;
-
- if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
- hash = udp_ehashfn(net, daddr, hnum, saddr, sport);
- reuse_sk = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- }
- return reuse_sk;
-}
-
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
@@ -451,42 +436,36 @@ static struct sock *udp4_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif);
if (score > badness) {
- result = lookup_reuseport(net, sk, skb,
- saddr, sport, daddr, hnum);
+ badness = score;
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ result = sk;
+ continue;
+ }
+
+ result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, udp_ehashfn);
+ if (!result) {
+ result = sk;
+ continue;
+ }
+
/* Fall back to scoring if group has connections */
- if (result && !reuseport_has_conns(sk))
+ if (!reuseport_has_conns(sk))
return result;
- result = result ? : sk;
- badness = score;
+ /* Reuseport logic returned an error, keep original score. */
+ if (IS_ERR(result))
+ continue;
+
+ badness = compute_score(result, net, saddr, sport,
+ daddr, hnum, dif, sdif);
+
}
}
return result;
}
-static struct sock *udp4_lookup_run_bpf(struct net *net,
- struct udp_table *udptable,
- struct sk_buff *skb,
- __be32 saddr, __be16 sport,
- __be32 daddr, u16 hnum, const int dif)
-{
- struct sock *sk, *reuse_sk;
- bool no_reuseport;
-
- if (udptable != net->ipv4.udp_table)
- return NULL; /* only UDP is supported */
-
- no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport,
- daddr, hnum, dif, &sk);
- if (no_reuseport || IS_ERR_OR_NULL(sk))
- return sk;
-
- reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
- if (reuse_sk)
- sk = reuse_sk;
- return sk;
-}
-
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
* harder than this. -DaveM
*/
@@ -511,9 +490,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
goto done;
/* Lookup redirect from BPF */
- if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
- sk = udp4_lookup_run_bpf(net, udptable, skb,
- saddr, sport, daddr, hnum, dif);
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+ udptable == net->ipv4.udp_table) {
+ sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, dif,
+ udp_ehashfn);
if (sk) {
result = sk;
goto done;
@@ -555,10 +536,13 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
{
const struct iphdr *iph = ip_hdr(skb);
struct net *net = dev_net(skb->dev);
+ int iif, sdif;
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
return __udp4_lib_lookup(net, iph->saddr, sport,
- iph->daddr, dport, inet_iif(skb),
- inet_sdif(skb), net->ipv4.udp_table, NULL);
+ iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
}
/* Must be called under rcu_read_lock().
@@ -795,7 +779,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
(u8 *)(uh+1));
goto out;
}
- if (!inet->recverr) {
+ if (!inet_test_bit(RECVERR, sk)) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
} else
@@ -978,7 +962,8 @@ csum_partial:
send:
err = ip_send_skb(sock_net(sk), skb);
if (err) {
- if (err == -ENOBUFS && !inet->recverr) {
+ if (err == -ENOBUFS &&
+ !inet_test_bit(RECVERR, sk)) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
@@ -1553,7 +1538,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
spin_unlock(&list->lock);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk);
+ INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
busylock_release(busy);
return 0;
@@ -1886,7 +1871,7 @@ try_again:
if (udp_sk(sk)->gro_enabled)
udp_cmsg_recv(msg, sk, skb);
- if (inet->cmsg_flags)
+ if (inet_cmsg_flags(inet))
ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
err = copied;
@@ -2408,7 +2393,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- sk = skb_steal_sock(skb, &refcounted);
+ sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
+ &refcounted, udp_ehashfn);
+ if (IS_ERR(sk))
+ goto no_sk;
+
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -2429,7 +2418,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk)
return udp_unicast_rcv_skb(sk, skb, uh);
-
+no_sk:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
nf_reset_ct(skb);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 75aa4de5b731..0f46b3c2e4ac 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -274,13 +274,20 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
__sum16 check;
__be16 newlen;
- if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)
- return __udp_gso_segment_list(gso_skb, features, is_ipv6);
-
mss = skb_shinfo(gso_skb)->gso_size;
if (gso_skb->len <= sizeof(*uh) + mss)
return ERR_PTR(-EINVAL);
+ if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh),
+ mss);
+ return NULL;
+ }
+
+ if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)
+ return __udp_gso_segment_list(gso_skb, features, is_ipv6);
+
skb_pull(gso_skb, sizeof(*uh));
/* clear destructor to avoid skb_segment assigning it to tail */
@@ -388,8 +395,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;
- if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
- !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST))
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
return __udp_gso_segment(skb, features, false);
mss = skb_shinfo(skb)->gso_size;
@@ -603,10 +609,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
{
const struct iphdr *iph = skb_gro_network_header(skb);
struct net *net = dev_net(skb->dev);
+ int iif, sdif;
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
return __udp4_lib_lookup(net, iph->saddr, sport,
- iph->daddr, dport, inet_iif(skb),
- inet_sdif(skb), net->ipv4.udp_table, NULL);
+ iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
}
INDIRECT_CALLABLE_SCOPE
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 5f8104cf082d..9b18f371af0d 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -63,7 +63,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
struct sock *sk = sock->sk;
/* Disable multicast loopback */
- inet_sk(sk)->mc_loop = 0;
+ inet_clear_bit(MC_LOOP, sk);
/* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
inet_inc_convert_csum(sk);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 57ea394ffa8c..c33bca2c3841 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -124,22 +124,13 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
xfrm_dst_destroy(xdst);
}
-static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
-{
- if (!unregister)
- return;
-
- xfrm_dst_ifdown(dst, dev);
-}
-
static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
.update_pmtu = xfrm4_update_pmtu,
.redirect = xfrm4_redirect,
.cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
- .ifdown = xfrm4_dst_ifdown,
+ .ifdown = xfrm_dst_ifdown,
.local_out = __ip_local_out,
.gc_thresh = 32768,
};