diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 52 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 6 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 6 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 2 | ||||
-rw-r--r-- | net/ipv4/inetpeer.c | 280 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 3 | ||||
-rw-r--r-- | net/ipv4/route.c | 2 | ||||
-rw-r--r-- | net/ipv4/syncookies.c | 1 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 14 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 46 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 6 |
12 files changed, 145 insertions, 284 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eae1f676f870..0600f0fbe325 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1438,11 +1438,11 @@ EXPORT_SYMBOL_GPL(inet_ctl_sock_create); unsigned long snmp_fold_field(void __percpu *mib[], int offt) { unsigned long res = 0; - int i; + int i, j; for_each_possible_cpu(i) { - res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); - res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); + for (j = 0; j < SNMP_ARRAY_SZ; j++) + res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt); } return res; } @@ -1456,28 +1456,19 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) int cpu; for_each_possible_cpu(cpu) { - void *bhptr, *userptr; + void *bhptr; struct u64_stats_sync *syncp; - u64 v_bh, v_user; + u64 v; unsigned int start; - /* first mib used by softirq context, we must use _bh() accessors */ - bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu); + bhptr = per_cpu_ptr(mib[0], cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { start = u64_stats_fetch_begin_bh(syncp); - v_bh = *(((u64 *) bhptr) + offt); + v = *(((u64 *) bhptr) + offt); } while (u64_stats_fetch_retry_bh(syncp, start)); - /* second mib used in USER context */ - userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu); - syncp = (struct u64_stats_sync *)(userptr + syncp_offset); - do { - start = u64_stats_fetch_begin(syncp); - v_user = *(((u64 *) userptr) + offt); - } while (u64_stats_fetch_retry(syncp, start)); - - res += v_bh + v_user; + res += v; } return res; } @@ -1489,25 +1480,28 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) BUG_ON(ptr == NULL); ptr[0] = __alloc_percpu(mibsize, align); if (!ptr[0]) - goto err0; + return -ENOMEM; +#if SNMP_ARRAY_SZ == 2 ptr[1] = __alloc_percpu(mibsize, align); - if (!ptr[1]) - goto err1; + if (!ptr[1]) { + free_percpu(ptr[0]); + ptr[0] = NULL; + return -ENOMEM; + } +#endif return 0; -err1: - free_percpu(ptr[0]); - ptr[0] = NULL; -err0: - return -ENOMEM; } EXPORT_SYMBOL_GPL(snmp_mib_init); -void snmp_mib_free(void __percpu *ptr[2]) +void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ]) { + int i; + BUG_ON(ptr == NULL); - free_percpu(ptr[0]); - free_percpu(ptr[1]); - ptr[0] = ptr[1] = NULL; + for (i = 0; i < SNMP_ARRAY_SZ; i++) { + free_percpu(ptr[i]); + ptr[i] = NULL; + } } EXPORT_SYMBOL_GPL(snmp_mib_free); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 0d4a184af16f..37b3c188d8b3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1833,8 +1833,8 @@ void __init devinet_init(void) rtnl_af_register(&inet_af_ops); - rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); - rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); - rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); + rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); + rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 22524716fe70..92fc5f69f5da 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1124,9 +1124,9 @@ static struct pernet_operations fib_net_ops = { void __init ip_fib_init(void) { - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3267d3898437..389a2e6a17fd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -869,7 +869,7 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } return netlink_dump_start(idiagnl, skb, nlh, - inet_diag_dump, NULL); + inet_diag_dump, NULL, 0); } return inet_diag_get_exact(skb, nlh); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index ce616d92cc54..dafbf2c98b28 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -54,15 +54,11 @@ * 1. Nodes may appear in the tree only with the pool lock held. * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. - * 3. Nodes appears and disappears from unused node list only under - * "inet_peer_unused_lock". - * 4. Global variable peer_total is modified under the pool lock. - * 5. struct inet_peer fields modification: + * 3. Global variable peer_total is modified under the pool lock. + * 4. struct inet_peer fields modification: * avl_left, avl_right, avl_parent, avl_height: pool lock - * unused: unused node list lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing - * dtime: unused node list lock * daddr: unchangeable * ip_id_count: atomic value (no lock needed) */ @@ -104,19 +100,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ -int inet_peer_gc_mintime __read_mostly = 10 * HZ; -int inet_peer_gc_maxtime __read_mostly = 120 * HZ; - -static struct { - struct list_head list; - spinlock_t lock; -} unused_peers = { - .list = LIST_HEAD_INIT(unused_peers.list), - .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock), -}; - -static void peer_check_expire(unsigned long dummy); -static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); /* Called from ip_output.c:ip_init */ @@ -142,21 +125,6 @@ void __init inet_initpeers(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - peer_periodic_timer.expires = jiffies - + net_random() % inet_peer_gc_maxtime - + inet_peer_gc_maxtime; - add_timer(&peer_periodic_timer); -} - -/* Called with or without local BH being disabled. */ -static void unlink_from_unused(struct inet_peer *p) -{ - spin_lock_bh(&unused_peers.lock); - list_del_init(&p->unused); - spin_unlock_bh(&unused_peers.lock); } static int addr_compare(const struct inetpeer_addr *a, @@ -203,20 +171,6 @@ static int addr_compare(const struct inetpeer_addr *a, u; \ }) -static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv) -{ - int cur, old = atomic_read(ptr); - - while (old != u) { - *newv = old + a; - cur = atomic_cmpxchg(ptr, old, *newv); - if (cur == old) - return true; - old = cur; - } - return false; -} - /* * Called with rcu_read_lock() * Because we hold no lock against a writer, its quite possible we fall @@ -225,8 +179,7 @@ static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv) * We exit from this function if number of links exceeds PEER_MAXDEPTH */ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, - struct inet_peer_base *base, - int *newrefcnt) + struct inet_peer_base *base) { struct inet_peer *u = rcu_dereference(base->root); int count = 0; @@ -235,11 +188,9 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, int cmp = addr_compare(daddr, &u->daddr); if (cmp == 0) { /* Before taking a reference, check if this entry was - * deleted, unlink_from_pool() sets refcnt=-1 to make - * distinction between an unused entry (refcnt=0) and - * a freed one. + * deleted (refcnt=-1) */ - if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt)) + if (!atomic_add_unless(&u->refcnt, 1, -1)) u = NULL; return u; } @@ -366,137 +317,96 @@ static void inetpeer_free_rcu(struct rcu_head *head) kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); } -/* May be called with local BH enabled. */ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, struct inet_peer __rcu **stack[PEER_MAXDEPTH]) { - int do_free; - - do_free = 0; - - write_seqlock_bh(&base->lock); - /* Check the reference counter. It was artificially incremented by 1 - * in cleanup() function to prevent sudden disappearing. If we can - * atomically (because of lockless readers) take this last reference, - * it's safe to remove the node and free it later. - * We use refcnt=-1 to alert lockless readers this entry is deleted. - */ - if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { - struct inet_peer __rcu ***stackptr, ***delp; - if (lookup(&p->daddr, stack, base) != p) - BUG(); - delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty_rcu) { - *delp[0] = p->avl_right; - --stackptr; - } else { - /* look for a node to insert instead of p */ - struct inet_peer *t; - t = lookup_rightempty(p, base); - BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); - **--stackptr = t->avl_left; - /* t is removed, t->daddr > x->daddr for any - * x in p->avl_left subtree. - * Put t in the old place of p. */ - RCU_INIT_POINTER(*delp[0], t); - t->avl_left = p->avl_left; - t->avl_right = p->avl_right; - t->avl_height = p->avl_height; - BUG_ON(delp[1] != &p->avl_left); - delp[1] = &t->avl_left; /* was &p->avl_left */ - } - peer_avl_rebalance(stack, stackptr, base); - base->total--; - do_free = 1; + struct inet_peer __rcu ***stackptr, ***delp; + + if (lookup(&p->daddr, stack, base) != p) + BUG(); + delp = stackptr - 1; /* *delp[0] == p */ + if (p->avl_left == peer_avl_empty_rcu) { + *delp[0] = p->avl_right; + --stackptr; + } else { + /* look for a node to insert instead of p */ + struct inet_peer *t; + t = lookup_rightempty(p, base); + BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); + **--stackptr = t->avl_left; + /* t is removed, t->daddr > x->daddr for any + * x in p->avl_left subtree. + * Put t in the old place of p. */ + RCU_INIT_POINTER(*delp[0], t); + t->avl_left = p->avl_left; + t->avl_right = p->avl_right; + t->avl_height = p->avl_height; + BUG_ON(delp[1] != &p->avl_left); + delp[1] = &t->avl_left; /* was &p->avl_left */ } - write_sequnlock_bh(&base->lock); - - if (do_free) - call_rcu(&p->rcu, inetpeer_free_rcu); - else - /* The node is used again. Decrease the reference counter - * back. The loop "cleanup -> unlink_from_unused - * -> unlink_from_pool -> putpeer -> link_to_unused - * -> cleanup (for the same node)" - * doesn't really exist because the entry will have a - * recent deletion time and will not be cleaned again soon. - */ - inet_putpeer(p); + peer_avl_rebalance(stack, stackptr, base); + base->total--; + call_rcu(&p->rcu, inetpeer_free_rcu); } static struct inet_peer_base *family_to_base(int family) { - return (family == AF_INET ? &v4_peers : &v6_peers); + return family == AF_INET ? &v4_peers : &v6_peers; } -static struct inet_peer_base *peer_to_base(struct inet_peer *p) +/* perform garbage collect on all items stacked during a lookup */ +static int inet_peer_gc(struct inet_peer_base *base, + struct inet_peer __rcu **stack[PEER_MAXDEPTH], + struct inet_peer __rcu ***stackptr) { - return family_to_base(p->daddr.family); -} - -/* May be called with local BH enabled. */ -static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH]) -{ - struct inet_peer *p = NULL; - - /* Remove the first entry from the list of unused nodes. */ - spin_lock_bh(&unused_peers.lock); - if (!list_empty(&unused_peers.list)) { - __u32 delta; + struct inet_peer *p, *gchead = NULL; + __u32 delta, ttl; + int cnt = 0; - p = list_first_entry(&unused_peers.list, struct inet_peer, unused); + if (base->total >= inet_peer_threshold) + ttl = 0; /* be aggressive */ + else + ttl = inet_peer_maxttl + - (inet_peer_maxttl - inet_peer_minttl) / HZ * + base->total / inet_peer_threshold * HZ; + stackptr--; /* last stack slot is peer_avl_empty */ + while (stackptr > stack) { + stackptr--; + p = rcu_deref_locked(**stackptr, base); delta = (__u32)jiffies - p->dtime; - - if (delta < ttl) { - /* Do not prune fresh entries. */ - spin_unlock_bh(&unused_peers.lock); - return -1; + if (atomic_read(&p->refcnt) == 0 && delta >= ttl && + atomic_cmpxchg(&p->refcnt, 0, -1) == 0) { + p->gc_next = gchead; + gchead = p; } - - list_del_init(&p->unused); - - /* Grab an extra reference to prevent node disappearing - * before unlink_from_pool() call. */ - atomic_inc(&p->refcnt); } - spin_unlock_bh(&unused_peers.lock); - - if (p == NULL) - /* It means that the total number of USED entries has - * grown over inet_peer_threshold. It shouldn't really - * happen because of entry limits in route cache. */ - return -1; - - unlink_from_pool(p, peer_to_base(p), stack); - return 0; + while ((p = gchead) != NULL) { + gchead = p->gc_next; + cnt++; + unlink_from_pool(p, base, stack); + } + return cnt; } -/* Called with or without local BH being disabled. */ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) { struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; unsigned int sequence; - int invalidated, newrefcnt = 0; + int invalidated, gccnt = 0; - /* Look up for the address quickly, lockless. + /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock(); sequence = read_seqbegin(&base->lock); - p = lookup_rcu(daddr, base, &newrefcnt); + p = lookup_rcu(daddr, base); invalidated = read_seqretry(&base->lock, sequence); rcu_read_unlock(); - if (p) { -found: /* The existing node has been found. - * Remove the entry from unused list if it was there. - */ - if (newrefcnt == 1) - unlink_from_unused(p); + if (p) return p; - } /* If no writer did a change during our lookup, we can return early. */ if (!create && !invalidated) @@ -506,11 +416,17 @@ found: /* The existing node has been found. * At least, nodes should be hot in our cache. */ write_seqlock_bh(&base->lock); +relookup: p = lookup(daddr, stack, base); if (p != peer_avl_empty) { - newrefcnt = atomic_inc_return(&p->refcnt); + atomic_inc(&p->refcnt); write_sequnlock_bh(&base->lock); - goto found; + return p; + } + if (!gccnt) { + gccnt = inet_peer_gc(base, stack, stackptr); + if (gccnt && create) + goto relookup; } p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; if (p) { @@ -525,7 +441,6 @@ found: /* The existing node has been found. p->pmtu_expires = 0; p->pmtu_orig = 0; memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); - INIT_LIST_HEAD(&p->unused); /* Link the node. */ @@ -534,63 +449,14 @@ found: /* The existing node has been found. } write_sequnlock_bh(&base->lock); - if (base->total >= inet_peer_threshold) - /* Remove one less-recently-used entry. */ - cleanup_once(0, stack); - return p; } - -static int compute_total(void) -{ - return v4_peers.total + v6_peers.total; -} EXPORT_SYMBOL_GPL(inet_getpeer); -/* Called with local BH disabled. */ -static void peer_check_expire(unsigned long dummy) -{ - unsigned long now = jiffies; - int ttl, total; - struct inet_peer __rcu **stack[PEER_MAXDEPTH]; - - total = compute_total(); - if (total >= inet_peer_threshold) - ttl = inet_peer_minttl; - else - ttl = inet_peer_maxttl - - (inet_peer_maxttl - inet_peer_minttl) / HZ * - total / inet_peer_threshold * HZ; - while (!cleanup_once(ttl, stack)) { - if (jiffies != now) - break; - } - - /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime - * interval depending on the total number of entries (more entries, - * less interval). */ - total = compute_total(); - if (total >= inet_peer_threshold) - peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; - else - peer_periodic_timer.expires = jiffies - + inet_peer_gc_maxtime - - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * - total / inet_peer_threshold * HZ; - add_timer(&peer_periodic_timer); -} - void inet_putpeer(struct inet_peer *p) { - local_bh_disable(); - - if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) { - list_add_tail(&p->unused, &unused_peers.list); - p->dtime = (__u32)jiffies; - spin_unlock(&unused_peers.lock); - } - - local_bh_enable(); + p->dtime = (__u32)jiffies; + atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 30a7763c400e..aae2bd8cd924 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2544,7 +2544,8 @@ int __init ip_mr_init(void) goto add_proto_fail; } #endif - rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute); + rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, + NULL, ipmr_rtm_dumproute, NULL); return 0; #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv4/route.c b/net/ipv4/route.c index aa13ef105110..f24c3359e5d0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3303,7 +3303,7 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(ip_rt_max_size); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 26461492a847..92bb9434b338 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -316,6 +316,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; + treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 57d0752e239a..69fd7201129a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -398,20 +398,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "inet_peer_gc_mintime", - .data = &inet_peer_gc_mintime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "inet_peer_gc_maxtime", - .data = &inet_peer_gc_maxtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { .procname = "tcp_orphan_retries", .data = &sysctl_tcp_orphan_retries, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bef9f04c22ba..ea0d2183df4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -880,6 +880,11 @@ static void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; + } else { + /* ssthresh may have been reduced unnecessarily during. + * 3WHS. Restore it back to its initial default. + */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { @@ -887,10 +892,7 @@ static void tcp_init_metrics(struct sock *sk) tp->reordering = dst_metric(dst, RTAX_REORDERING); } - if (dst_metric(dst, RTAX_RTT) == 0) - goto reset; - - if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) + if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -916,19 +918,26 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); - if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) { reset: - /* Play conservative. If timestamps are not - * supported, TCP will fail to recalculate correct - * rtt, if initial rto is too small. FORGET ALL AND RESET! + if (tp->srtt == 0) { + /* RFC2988bis: We've failed to get a valid RTT sample from + * 3WHS. This is most likely due to retransmission, + * including spurious one. Reset the RTO back to 3secs + * from the more aggressive 1sec to avoid more spurious + * retransmission. */ - if (!tp->rx_opt.saw_tstamp && tp->srtt) { - tp->srtt = 0; - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; - } + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } - tp->snd_cwnd = tcp_init_cwnd(tp, dst); + /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC2988bis' more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. + */ + if (tp->total_retrans > 1) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -3112,12 +3121,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tcp_xmit_retransmit_queue(sk); } -static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) { tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; } +EXPORT_SYMBOL(tcp_valid_rtt_meas); /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Supersedes RFC1323) @@ -5806,12 +5816,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - /* tcp_ack considers this ACK as duplicate - * and does not calculate rtt. - * Force it here. - */ - tcp_ack_update_rtt(sk, 0, 0); - if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 708dc203b034..955b8e65b69e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -429,8 +429,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << - icsk->icsk_backoff; + inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : + TCP_TIMEOUT_INIT) << icsk->icsk_backoff; tcp_bound_rto(sk); skb = tcp_write_queue_head(sk); @@ -1384,6 +1384,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; if (tcp_v4_send_synack(sk, dst, req, (struct request_values *)&tmp_ext) || @@ -1458,6 +1459,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); + if (tcp_rsk(req)->snt_synack) + tcp_valid_rtt_meas(newsk, + tcp_time_stamp - tcp_rsk(req)->snt_synack); + newtp->total_retrans = req->retrans; #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ @@ -1855,7 +1860,7 @@ static int tcp_v4_init_sock(struct sock *sk) * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - tp->snd_cwnd = 2; + tp->snd_cwnd = TCP_INIT_CWND; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 80b1f80759ab..d2fe4e06b472 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -486,7 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - newtp->snd_cwnd = 2; + newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; newtp->bytes_acked = 0; @@ -720,6 +720,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } + if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) + tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; + else if (req->retrans) /* don't take RTT sample if retrans && ~TS */ + tcp_rsk(req)->snt_synack = 0; /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all |