author     Eric Dumazet <dada1@cosmosbay.com>      2008-10-29 02:11:14 -0700
committer  David S. Miller <davem@davemloft.net>   2008-10-29 02:11:14 -0700
commit     271b72c7fa82c2c7a795bc16896149933110672d
tree       5634b95c04b4a7ac9babf2d8ac34cfb6c38a8f83 /net/ipv4
parent     645ca708f936b2fbeb79e52d7823e3eb2c0905f8
udp: RCU handling for Unicast packets.
Goals are:

1) Optimize handling of incoming unicast UDP frames, so that no memory
   writes happen in the fast path.

   Note: multicasts and broadcasts still need to take a lock, because a
   fully lockless lookup is difficult in that case.

2) No expensive operations in the socket bind/unhash phases:

   - No expensive synchronize_rcu() calls.

   - No added rcu_head in the socket structure: besides increasing memory
     needs, it would force us to use call_rcu(), which has the bad property
     of making socket structures cold (the RCU grace period between a socket
     being freed and its potential reuse leaves the socket cold in the CPU
     cache). David did a previous patch using call_rcu() and noticed a 20%
     impact on TCP connection rates. Quoting Christoph Lameter:
       "Right. That results in cacheline cooldown. You'd want to recycle
        the object as they are cache hot on a per cpu basis. That is screwed
        up by the delayed regular rcu processing. We have seen multiple
        regressions due to cacheline cooldown. The only choice in cacheline
        hot sensitive areas is to deal with the complexity that comes with
        SLAB_DESTROY_BY_RCU or give up on RCU."

   - Because UDP sockets are allocated from a dedicated kmem_cache,
     SLAB_DESTROY_BY_RCU can help here.

Theory of operation:
--------------------

As the lookup is lockless (using rcu_read_lock()/rcu_read_unlock()), special
attention must be paid by both readers and writers.

Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, and inserted into a different chain, or in the worst case into the
same chain, while readers are doing lookups at the same time.

To avoid loops, a reader must check that each socket found in a chain really
belongs to the chain the reader was traversing; on a mismatch, the lookup
must restart from the beginning of the chain. This *restart* loop is the
reason we still take a lock for the multicast case: we do not want to send
the same message several times to the same socket.

RCU is used only for the fast path; /proc/net/udp still takes spinlocks.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
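[Editor's note] To make the reader-side rules above concrete, here is a
minimal, hypothetical sketch of a SLAB_DESTROY_BY_RCU hash-table lookup in
the same style as the patch below. 'struct item', 'struct bucket',
item_hashfn() and item_put() are illustrative names only (not kernel APIs);
only the RCU, hlist and atomic primitives are real. It is a sketch of the
technique, not the patch's implementation.

	/*
	 * Reader-side discipline for a SLAB_DESTROY_BY_RCU hash table:
	 * objects may be freed and reused while we walk the chain, so we
	 * (1) restart if an entry migrated to another chain, (2) only take
	 * a reference with atomic_inc_not_zero(), and (3) re-check the
	 * match after the reference is taken.
	 */
	struct item {
		struct hlist_node	node;
		atomic_t		refcnt;
		unsigned int		key;
	};

	struct bucket {
		struct hlist_head	head;
		spinlock_t		lock;	/* writers (insert/remove) only */
	};

	static struct item *item_lookup(struct bucket *table, unsigned int mask,
					unsigned int key)
	{
		unsigned int hash = item_hashfn(key) & mask;	/* hypothetical hash */
		struct bucket *b = &table[hash];
		struct hlist_node *pos;
		struct item *it;

		rcu_read_lock();
	begin:
		hlist_for_each_entry_rcu(it, pos, &b->head, node) {
			/*
			 * SLAB_DESTROY_BY_RCU: the object may have been freed
			 * and recycled into another chain. If it no longer
			 * hashes to our chain, restart from the chain head.
			 */
			if ((item_hashfn(it->key) & mask) != hash)
				goto begin;
			if (it->key != key)
				continue;
			/* Object may be half freed: only grab a live reference. */
			if (!atomic_inc_not_zero(&it->refcnt))
				continue;
			/* Re-check the match: the object may have been reused. */
			if (it->key != key) {
				item_put(it);	/* hypothetical "drop reference" helper */
				goto begin;
			}
			rcu_read_unlock();
			return it;
		}
		rcu_read_unlock();
		return NULL;
	}

The writer side still serializes inserts and removals on the per-bucket
spinlock, which is exactly what the patch keeps doing with hslot->lock
around sk_add_node_rcu()/sk_del_node_init_rcu().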
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/udp.c      35
-rw-r--r--  net/ipv4/udplite.c   1
2 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2a6c491f97d7..0ea974bf7962 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -187,7 +187,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
inet_sk(sk)->num = snum;
sk->sk_hash = snum;
if (sk_unhashed(sk)) {
- sk_add_node(sk, &hslot->head);
+ sk_add_node_rcu(sk, &hslot->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}
error = 0;
@@ -253,15 +253,24 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport,
int dif, struct udp_table *udptable)
{
- struct sock *sk, *result = NULL;
+ struct sock *sk, *result;
struct hlist_node *node;
unsigned short hnum = ntohs(dport);
unsigned int hash = udp_hashfn(net, hnum);
struct udp_hslot *hslot = &udptable->hash[hash];
- int score, badness = -1;
+ int score, badness;
- spin_lock(&hslot->lock);
- sk_for_each(sk, node, &hslot->head) {
+ rcu_read_lock();
+begin:
+ result = NULL;
+ badness = -1;
+ sk_for_each_rcu(sk, node, &hslot->head) {
+ /*
+ * lockless reader, and SLAB_DESTROY_BY_RCU items:
+ * We must check this item was not moved to another chain
+ */
+ if (udp_hashfn(net, sk->sk_hash) != hash)
+ goto begin;
score = compute_score(sk, net, saddr, hnum, sport,
daddr, dport, dif);
if (score > badness) {
@@ -269,9 +278,16 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
badness = score;
}
}
- if (result)
- sock_hold(result);
- spin_unlock(&hslot->lock);
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+ result = NULL;
+ else if (unlikely(compute_score(result, net, saddr, hnum, sport,
+ daddr, dport, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ rcu_read_unlock();
return result;
}
@@ -953,7 +969,7 @@ void udp_lib_unhash(struct sock *sk)
struct udp_hslot *hslot = &udptable->hash[hash];
spin_lock(&hslot->lock);
- if (sk_del_node_init(sk)) {
+ if (sk_del_node_init_rcu(sk)) {
inet_sk(sk)->num = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
@@ -1517,6 +1533,7 @@ struct proto udp_prot = {
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udp_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index d8ea8e5f5ea3..c784891cb7e5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -51,6 +51,7 @@ struct proto udplite_prot = {
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
.obj_size = sizeof(struct udp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
.h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,