summaryrefslogtreecommitdiff
path: root/net/ipv4/route.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--net/ipv4/route.c1329
1 files changed, 212 insertions, 1117 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d547f6fae20d..9add08869c75 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
-static int rt_chain_length_max __read_mostly = 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
/*
* Interface to generic destination cache.
@@ -145,14 +141,12 @@ static unsigned long expires_ljiffies;
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int ipv4_mtu(const struct dst_entry *dst);
-static void ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
-static int rt_garbage_collect(struct dst_ops *ops);
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
int how)
@@ -172,12 +166,10 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
- .gc = rt_garbage_collect,
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
.mtu = ipv4_mtu,
.cow_metrics = ipv4_cow_metrics,
- .destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
@@ -209,184 +201,30 @@ const __u8 ip_tos2prio[16] = {
};
EXPORT_SYMBOL(ip_tos2prio);
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- * as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- * they do so with atomic increments and with the
- * lock held.
- */
-
-struct rt_hash_bucket {
- struct rtable __rcu *chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
- defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ 256
-#else
-# if NR_CPUS >= 32
-# define RT_HASH_LOCK_SZ 4096
-# elif NR_CPUS >= 16
-# define RT_HASH_LOCK_SZ 2048
-# elif NR_CPUS >= 8
-# define RT_HASH_LOCK_SZ 1024
-# elif NR_CPUS >= 4
-# define RT_HASH_LOCK_SZ 512
-# else
-# define RT_HASH_LOCK_SZ 256
-# endif
-#endif
-
-static spinlock_t *rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
- int i;
-
- rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
- GFP_KERNEL);
- if (!rt_hash_locks)
- panic("IP: failed to allocate rt_hash_locks\n");
-
- for (i = 0; i < RT_HASH_LOCK_SZ; i++)
- spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket *rt_hash_table __read_mostly;
-static unsigned int rt_hash_mask __read_mostly;
-static unsigned int rt_hash_log __read_mostly;
-
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
- int genid)
-{
- return jhash_3words((__force u32)daddr, (__force u32)saddr,
- idx, genid)
- & rt_hash_mask;
-}
-
static inline int rt_genid(struct net *net)
{
return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
- struct seq_net_private p;
- int bucket;
- int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
- struct rt_cache_iter_state *st = seq->private;
- struct rtable *r = NULL;
-
- for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
- if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
- continue;
- rcu_read_lock_bh();
- r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
- while (r) {
- if (dev_net(r->dst.dev) == seq_file_net(seq) &&
- r->rt_genid == st->genid)
- return r;
- r = rcu_dereference_bh(r->dst.rt_next);
- }
- rcu_read_unlock_bh();
- }
- return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
- struct rtable *r)
-{
- struct rt_cache_iter_state *st = seq->private;
-
- r = rcu_dereference_bh(r->dst.rt_next);
- while (!r) {
- rcu_read_unlock_bh();
- do {
- if (--st->bucket < 0)
- return NULL;
- } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
- rcu_read_lock_bh();
- r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
- }
- return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
- struct rtable *r)
-{
- struct rt_cache_iter_state *st = seq->private;
- while ((r = __rt_cache_get_next(seq, r)) != NULL) {
- if (dev_net(r->dst.dev) != seq_file_net(seq))
- continue;
- if (r->rt_genid == st->genid)
- break;
- }
- return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct rtable *r = rt_cache_get_first(seq);
-
- if (r)
- while (pos && (r = rt_cache_get_next(seq, r)))
- --pos;
- return pos ? NULL : r;
-}
-
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct rt_cache_iter_state *st = seq->private;
if (*pos)
- return rt_cache_get_idx(seq, *pos - 1);
- st->genid = rt_genid(seq_file_net(seq));
+ return NULL;
return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct rtable *r;
-
- if (v == SEQ_START_TOKEN)
- r = rt_cache_get_first(seq);
- else
- r = rt_cache_get_next(seq, v);
++*pos;
- return r;
+ return NULL;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
- if (v && v != SEQ_START_TOKEN)
- rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -396,24 +234,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
"HHUptod\tSpecDst");
- else {
- struct rtable *r = v;
- int len;
-
- seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
- "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
- r->dst.dev ? r->dst.dev->name : "*",
- (__force u32)r->rt_dst,
- (__force u32)r->rt_gateway,
- r->rt_flags, atomic_read(&r->dst.__refcnt),
- r->dst.__use, 0, (__force u32)r->rt_src,
- dst_metric_advmss(&r->dst) + 40,
- dst_metric(&r->dst, RTAX_WINDOW), 0,
- r->rt_key_tos,
- -1, 0, 0, &len);
-
- seq_printf(seq, "%*s\n", 127 - len, "");
- }
return 0;
}
@@ -426,8 +246,7 @@ static const struct seq_operations rt_cache_seq_ops = {
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_net(inode, file, &rt_cache_seq_ops,
- sizeof(struct rt_cache_iter_state));
+ return seq_open(file, &rt_cache_seq_ops);
}
static const struct file_operations rt_cache_seq_fops = {
@@ -435,7 +254,7 @@ static const struct file_operations rt_cache_seq_fops = {
.open = rt_cache_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_net,
+ .release = seq_release,
};
@@ -625,263 +444,12 @@ static inline int ip_rt_proc_init(void)
}
#endif /* CONFIG_PROC_FS */
-static inline void rt_free(struct rtable *rt)
-{
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
- ip_rt_put(rt);
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
- /* Kill broadcast/multicast entries very aggresively, if they
- collide in hash table with more useful entries */
- return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
- rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
- return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->dst.expires;
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
- unsigned long age;
- int ret = 0;
-
- if (atomic_read(&rth->dst.__refcnt))
- goto out;
-
- age = jiffies - rth->dst.lastuse;
- if ((age <= tmo1 && !rt_fast_clean(rth)) ||
- (age <= tmo2 && rt_valuable(rth)))
- goto out;
- ret = 1;
-out: return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
- u32 score = jiffies - rt->dst.lastuse;
-
- score = ~score & ~(3<<30);
-
- if (rt_valuable(rt))
- score |= (1<<31);
-
- if (rt_is_output_route(rt) ||
- !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
- score |= (1<<30);
-
- return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
- return net->ipv4.current_rt_cache_rebuild_count <=
- net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct rtable *rt1,
- const struct rtable *rt2)
-{
- return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
- ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
- (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
-}
-
-static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
-{
- return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
- ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
- (rt1->rt_mark ^ rt2->rt_mark) |
- (rt1->rt_key_tos ^ rt2->rt_key_tos) |
- (rt1->rt_route_iif ^ rt2->rt_route_iif) |
- (rt1->rt_oif ^ rt2->rt_oif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
- return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
static inline int rt_is_expired(struct rtable *rth)
{
return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
- unsigned int i;
- struct rtable *rth, *next;
-
- for (i = 0; i <= rt_hash_mask; i++) {
- struct rtable __rcu **pprev;
- struct rtable *list;
-
- if (process_context && need_resched())
- cond_resched();
- rth = rcu_access_pointer(rt_hash_table[i].chain);
- if (!rth)
- continue;
-
- spin_lock_bh(rt_hash_lock_addr(i));
-
- list = NULL;
- pprev = &rt_hash_table[i].chain;
- rth = rcu_dereference_protected(*pprev,
- lockdep_is_held(rt_hash_lock_addr(i)));
-
- while (rth) {
- next = rcu_dereference_protected(rth->dst.rt_next,
- lockdep_is_held(rt_hash_lock_addr(i)));
-
- if (!net ||
- net_eq(dev_net(rth->dst.dev), net)) {
- rcu_assign_pointer(*pprev, next);
- rcu_assign_pointer(rth->dst.rt_next, list);
- list = rth;
- } else {
- pprev = &rth->dst.rt_next;
- }
- rth = next;
- }
-
- spin_unlock_bh(rt_hash_lock_addr(i));
-
- for (; list; list = next) {
- next = rcu_dereference_protected(list->dst.rt_next, 1);
- rt_free(list);
- }
- }
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- * rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
- const struct rtable *aux = head;
-
- while (aux != rth) {
- if (compare_hash_inputs(aux, rth))
- return 0;
- aux = rcu_dereference_protected(aux->dst.rt_next, 1);
- }
- return ONE;
-}
-
-static void rt_check_expire(void)
-{
- static unsigned int rover;
- unsigned int i = rover, goal;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long samples = 0;
- unsigned long sum = 0, sum2 = 0;
- unsigned long delta;
- u64 mult;
-
- delta = jiffies - expires_ljiffies;
- expires_ljiffies = jiffies;
- mult = ((u64)delta) << rt_hash_log;
- if (ip_rt_gc_timeout > 1)
- do_div(mult, ip_rt_gc_timeout);
- goal = (unsigned int)mult;
- if (goal > rt_hash_mask)
- goal = rt_hash_mask + 1;
- for (; goal > 0; goal--) {
- unsigned long tmo = ip_rt_gc_timeout;
- unsigned long length;
-
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
-
- if (need_resched())
- cond_resched();
-
- samples++;
-
- if (rcu_dereference_raw(*rthp) == NULL)
- continue;
- length = 0;
- spin_lock_bh(rt_hash_lock_addr(i));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
- prefetch(rth->dst.rt_next);
- if (rt_is_expired(rth) ||
- rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- continue;
- }
-
- /* We only count entries on a chain with equal
- * hash inputs once so that entries for
- * different QOS levels, and other non-hash
- * input attributes don't unfairly skew the
- * length computation
- */
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- length += has_noalias(rt_hash_table[i].chain, rth);
- }
- spin_unlock_bh(rt_hash_lock_addr(i));
- sum += length;
- sum2 += length*length;
- }
- if (samples) {
- unsigned long avg = sum / samples;
- unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
- rt_chain_length_max = max_t(unsigned long,
- ip_rt_gc_elasticity,
- (avg + 4*sd) >> FRACT_BITS);
- }
- rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
- rt_check_expire();
- schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
-/*
* Perturbation of rt_genid by a small quantity [1..256]
* Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
* many times (2^24) without giving recent rt_genid.
@@ -902,167 +470,6 @@ static void rt_cache_invalidate(struct net *net)
void rt_cache_flush(struct net *net, int delay)
{
rt_cache_invalidate(net);
- if (delay >= 0)
- rt_do_flush(net, !in_softirq());
-}
-
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
- rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
- net_warn_ratelimited("Route hash chain too long!\n");
- rt_cache_invalidate(net);
-}
-
-/*
- Short description of GC goals.
-
- We want to build algorithm, which will keep routing cache
- at some equilibrium point, when number of aged off entries
- is kept approximately equal to newly generated ones.
-
- Current expiration strength is variable "expire".
- We try to adjust it dynamically, so that if networking
- is idle expires is large enough to keep enough of warm entries,
- and when load increases it reduces to limit cache size.
- */
-
-static int rt_garbage_collect(struct dst_ops *ops)
-{
- static unsigned long expire = RT_GC_TIMEOUT;
- static unsigned long last_gc;
- static int rover;
- static int equilibrium;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long now = jiffies;
- int goal;
- int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
- /*
- * Garbage collection is pretty expensive,
- * do not make it too frequently.
- */
-
- RT_CACHE_STAT_INC(gc_total);
-
- if (now - last_gc < ip_rt_gc_min_interval &&
- entries < ip_rt_max_size) {
- RT_CACHE_STAT_INC(gc_ignored);
- goto out;
- }
-
- entries = dst_entries_get_slow(&ipv4_dst_ops);
- /* Calculate number of entries, which we want to expire now. */
- goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
- if (goal <= 0) {
- if (equilibrium < ipv4_dst_ops.gc_thresh)
- equilibrium = ipv4_dst_ops.gc_thresh;
- goal = entries - equilibrium;
- if (goal > 0) {
- equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- goal = entries - equilibrium;
- }
- } else {
- /* We are in dangerous area. Try to reduce cache really
- * aggressively.
- */
- goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- equilibrium = entries - goal;
- }
-
- if (now - last_gc >= ip_rt_gc_min_interval)
- last_gc = now;
-
- if (goal <= 0) {
- equilibrium += goal;
- goto work_done;
- }
-
- do {
- int i, k;
-
- for (i = rt_hash_mask, k = rover; i >= 0; i--) {
- unsigned long tmo = expire;
-
- k = (k + 1) & rt_hash_mask;
- rthp = &rt_hash_table[k].chain;
- spin_lock_bh(rt_hash_lock_addr(k));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
- if (!rt_is_expired(rth) &&
- !rt_may_expire(rth, tmo, expire)) {
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- continue;
- }
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- goal--;
- }
- spin_unlock_bh(rt_hash_lock_addr(k));
- if (goal <= 0)
- break;
- }
- rover = k;
-
- if (goal <= 0)
- goto work_done;
-
- /* Goal is not achieved. We stop process if:
-
- - if expire reduced to zero. Otherwise, expire is halfed.
- - if table is not full.
- - if we are called from interrupt.
- - jiffies check is just fallback/debug loop breaker.
- We will not spin here for long time in any case.
- */
-
- RT_CACHE_STAT_INC(gc_goal_miss);
-
- if (expire == 0)
- break;
-
- expire >>= 1;
-
- if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
- goto out;
- } while (!in_softirq() && time_before_eq(jiffies, now));
-
- if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
- goto out;
- if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
- goto out;
- net_warn_ratelimited("dst cache overflow\n");
- RT_CACHE_STAT_INC(gc_dst_overflow);
- return 1;
-
-work_done:
- expire += ip_rt_gc_min_interval;
- if (expire > ip_rt_gc_timeout ||
- dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
- dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
- expire = ip_rt_gc_timeout;
-out: return 0;
-}
-
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
-{
- int length = 0;
- const struct rtable *rth = head;
-
- while (rth) {
- length += has_noalias(head, rth);
- rth = rcu_dereference_protected(rth->dst.rt_next, 1);
- }
- return length >> FRACT_BITS;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1086,139 +493,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
}
-static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
- struct sk_buff *skb, int ifindex)
-{
- struct rtable *rth, *cand;
- struct rtable __rcu **rthp, **candp;
- unsigned long now;
- u32 min_score;
- int chain_length;
-
-restart:
- chain_length = 0;
- min_score = ~(u32)0;
- cand = NULL;
- candp = NULL;
- now = jiffies;
-
- if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
- /*
- * If we're not caching, just tell the caller we
- * were successful and don't touch the route. The
- * caller hold the sole reference to the cache entry, and
- * it will be released when the caller is done with it.
- * If we drop it here, the callers have no way to resolve routes
- * when we're not caching. Instead, just point *rp at rt, so
- * the caller gets a single use out of the route
- * Note that we do rt_free on this new route entry, so that
- * once its refcount hits zero, we are still able to reap it
- * (Thanks Alexey)
- * Note: To avoid expensive rcu stuff for this uncached dst,
- * we set DST_NOCACHE so that dst_release() can free dst without
- * waiting a grace period.
- */
-
- rt->dst.flags |= DST_NOCACHE;
- goto skip_hashing;
- }
-
- rthp = &rt_hash_table[hash].chain;
-
- spin_lock_bh(rt_hash_lock_addr(hash));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
- if (rt_is_expired(rth)) {
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- continue;
- }
- if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
- /* Put it first */
- *rthp = rth->dst.rt_next;
- /*
- * Since lookup is lockfree, the deletion
- * must be visible to another weakly ordered CPU before
- * the insertion at the start of the hash chain.
- */
- rcu_assign_pointer(rth->dst.rt_next,
- rt_hash_table[hash].chain);
- /*
- * Since lookup is lockfree, the update writes
- * must be ordered for consistency on SMP.
- */
- rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
- dst_use(&rth->dst, now);
- spin_unlock_bh(rt_hash_lock_addr(hash));
-
- rt_drop(rt);
- if (skb)
- skb_dst_set(skb, &rth->dst);
- return rth;
- }
-
- if (!atomic_read(&rth->dst.__refcnt)) {
- u32 score = rt_score(rth);
-
- if (score <= min_score) {
- cand = rth;
- candp = rthp;
- min_score = score;
- }
- }
-
- chain_length++;
-
- rthp = &rth->dst.rt_next;
- }
-
- if (cand) {
- /* ip_rt_gc_elasticity used to be average length of chain
- * length, when exceeded gc becomes really aggressive.
- *
- * The second limit is less certain. At the moment it allows
- * only 2 entries per bucket. We will see.
- */
- if (chain_length > ip_rt_gc_elasticity) {
- *candp = cand->dst.rt_next;
- rt_free(cand);
- }
- } else {
- if (chain_length > rt_chain_length_max &&
- slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
- struct net *net = dev_net(rt->dst.dev);
- int num = ++net->ipv4.current_rt_cache_rebuild_count;
- if (!rt_caching(net)) {
- pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
- rt->dst.dev->name, num);
- }
- rt_emergency_hash_rebuild(net);
- spin_unlock_bh(rt_hash_lock_addr(hash));
-
- hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
- ifindex, rt_genid(net));
- goto restart;
- }
- }
-
- rt->dst.rt_next = rt_hash_table[hash].chain;
-
- /*
- * Since lookup is lockfree, we must make sure
- * previous writes to rt are committed to memory
- * before making rt visible to other CPUS.
- */
- rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
- spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
- if (skb)
- skb_dst_set(skb, &rt->dst);
- return rt;
-}
-
/*
* Peer allocation may fail only in serious out-of-memory conditions. However
* we still can generate some output.
@@ -1255,26 +529,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
}
EXPORT_SYMBOL(__ip_select_ident);
-static void rt_del(unsigned int hash, struct rtable *rt)
-{
- struct rtable __rcu **rthp;
- struct rtable *aux;
-
- rthp = &rt_hash_table[hash].chain;
- spin_lock_bh(rt_hash_lock_addr(hash));
- ip_rt_put(rt);
- while ((aux = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
- if (aux == rt || rt_is_expired(aux)) {
- *rthp = aux->dst.rt_next;
- rt_free(aux);
- continue;
- }
- rthp = &aux->dst.rt_next;
- }
- spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
const struct iphdr *iph,
int oif, u8 tos,
@@ -1417,7 +671,8 @@ out_unlock:
return;
}
-static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+ bool kill_route)
{
__be32 new_gw = icmp_hdr(skb)->un.gateway;
__be32 old_gw = ip_hdr(skb)->saddr;
@@ -1472,8 +727,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
update_or_create_fnhe(nh, fl4->daddr, new_gw,
0, 0);
}
- rt->rt_gateway = new_gw;
- rt->rt_flags |= RTCF_REDIRECTED;
+ if (kill_route)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
neigh_release(n);
@@ -1504,7 +759,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
rt = (struct rtable *) dst;
ip_rt_build_flow_key(&fl4, sk, skb);
- __ip_do_redirect(rt, skb, &fl4);
+ __ip_do_redirect(rt, skb, &fl4, true);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1518,10 +773,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
ret = NULL;
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
rt->dst.expires) {
- unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
- rt->rt_oif,
- rt_genid(dev_net(dst->dev)));
- rt_del(hash, rt);
+ ip_rt_put(rt);
ret = NULL;
}
}
@@ -1597,7 +849,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
peer->rate_tokens == ip_rt_redirect_number)
net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
&ip_hdr(skb)->saddr, rt->rt_iif,
- &rt->rt_dst, &rt->rt_gateway);
+ &ip_hdr(skb)->daddr, &rt->rt_gateway);
#endif
}
out_put_peer:
@@ -1666,7 +918,7 @@ out: kfree_skb(skb);
return 0;
}
-static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
struct fib_result res;
@@ -1679,8 +931,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
jiffies + ip_rt_mtu_expires);
}
- rt->rt_pmtu = mtu;
- dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+ return mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -1690,7 +941,14 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct flowi4 fl4;
ip_rt_build_flow_key(&fl4, sk, skb);
- __ip_rt_update_pmtu(rt, &fl4, mtu);
+ mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
+
+ if (!rt->rt_pmtu) {
+ dst->obsolete = DST_OBSOLETE_KILL;
+ } else {
+ rt->rt_pmtu = mtu;
+ dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+ }
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -1736,7 +994,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
- __ip_do_redirect(rt, skb, &fl4);
+ __ip_do_redirect(rt, skb, &fl4, false);
ip_rt_put(rt);
}
}
@@ -1751,7 +1009,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
rt = __ip_route_output_key(sock_net(sk), &fl4);
if (!IS_ERR(rt)) {
- __ip_do_redirect(rt, skb, &fl4);
+ __ip_do_redirect(rt, skb, &fl4, false);
ip_rt_put(rt);
}
}
@@ -1761,22 +1019,19 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
struct rtable *rt = (struct rtable *) dst;
- if (rt_is_expired(rt))
+ /* All IPV4 dsts are created with ->obsolete set to the value
+ * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+ * into this function always.
+ *
+ * When a PMTU/redirect information update invalidates a
+ * route, this is indicated by setting obsolete to
+ * DST_OBSOLETE_KILL.
+ */
+ if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
return NULL;
return dst;
}
-static void ipv4_dst_destroy(struct dst_entry *dst)
-{
- struct rtable *rt = (struct rtable *) dst;
-
- if (rt->fi) {
- fib_info_put(rt->fi);
- rt->fi = NULL;
- }
-}
-
-
static void ipv4_link_failure(struct sk_buff *skb)
{
struct rtable *rt;
@@ -1832,8 +1087,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
else
- src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
- RT_SCOPE_UNIVERSE);
+ src = inet_select_addr(rt->dst.dev,
+ rt_nexthop(rt, iph->daddr),
+ RT_SCOPE_UNIVERSE);
rcu_read_unlock();
}
memcpy(addr, &src, 4);
@@ -1879,8 +1135,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
mtu = dst->dev->mtu;
if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
-
- if (rt->rt_gateway != rt->rt_dst && mtu > 576)
+ if (rt->rt_gateway && mtu > 576)
mtu = 576;
}
@@ -1890,58 +1145,91 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
return mtu;
}
-static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
- struct fib_info *fi)
-{
- if (fi->fib_metrics != (u32 *) dst_default_metrics) {
- rt->fi = fi;
- atomic_inc(&fi->fib_clntref);
- }
- dst_init_metrics(&rt->dst, fi->fib_metrics, true);
-}
-
-static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
+static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
struct fnhe_hash_bucket *hash = nh->nh_exceptions;
struct fib_nh_exception *fnhe;
u32 hval;
+ if (!hash)
+ return NULL;
+
hval = fnhe_hashfun(daddr);
-restart:
for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
fnhe = rcu_dereference(fnhe->fnhe_next)) {
- __be32 fnhe_daddr, gw;
- unsigned long expires;
- unsigned int seq;
- u32 pmtu;
-
- seq = read_seqbegin(&fnhe_seqlock);
- fnhe_daddr = fnhe->fnhe_daddr;
- gw = fnhe->fnhe_gw;
- pmtu = fnhe->fnhe_pmtu;
- expires = fnhe->fnhe_expires;
- if (read_seqretry(&fnhe_seqlock, seq))
- goto restart;
- if (daddr != fnhe_daddr)
- continue;
- if (pmtu) {
- unsigned long diff = expires - jiffies;
+ if (fnhe->fnhe_daddr == daddr)
+ return fnhe;
+ }
+ return NULL;
+}
- if (time_before(jiffies, expires)) {
- rt->rt_pmtu = pmtu;
- dst_set_expires(&rt->dst, diff);
- }
+static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+ __be32 daddr)
+{
+ __be32 fnhe_daddr, gw;
+ unsigned long expires;
+ unsigned int seq;
+ u32 pmtu;
+
+restart:
+ seq = read_seqbegin(&fnhe_seqlock);
+ fnhe_daddr = fnhe->fnhe_daddr;
+ gw = fnhe->fnhe_gw;
+ pmtu = fnhe->fnhe_pmtu;
+ expires = fnhe->fnhe_expires;
+ if (read_seqretry(&fnhe_seqlock, seq))
+ goto restart;
+
+ if (daddr != fnhe_daddr)
+ return;
+
+ if (pmtu) {
+ unsigned long diff = expires - jiffies;
+
+ if (time_before(jiffies, expires)) {
+ rt->rt_pmtu = pmtu;
+ dst_set_expires(&rt->dst, diff);
}
- if (gw)
- rt->rt_gateway = gw;
- fnhe->fnhe_stamp = jiffies;
- break;
}
+ if (gw) {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_gateway = gw;
+ }
+ fnhe->fnhe_stamp = jiffies;
}
-static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
+static inline void rt_release_rcu(struct rcu_head *head)
+{
+ struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+ dst_release(dst);
+}
+
+static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+{
+ struct rtable *orig, *prev, **p = &nh->nh_rth_output;
+
+ if (rt_is_input_route(rt))
+ p = &nh->nh_rth_input;
+
+ orig = *p;
+
+ prev = cmpxchg(p, orig, rt);
+ if (prev == orig) {
+ dst_clone(&rt->dst);
+ if (orig)
+ call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);
+ }
+}
+
+static bool rt_cache_valid(struct rtable *rt)
+{
+ return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK);
+}
+
+static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
const struct fib_result *res,
+ struct fib_nh_exception *fnhe,
struct fib_info *fi, u16 type, u32 itag)
{
if (fi) {
@@ -1949,12 +1237,14 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = nh->nh_gw;
- if (unlikely(nh->nh_exceptions))
- rt_bind_exception(rt, nh, fl4->daddr);
- rt_init_metrics(rt, fl4, fi);
+ if (unlikely(fnhe))
+ rt_bind_exception(rt, fnhe, daddr);
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
- rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+ rt->dst.tclassid = nh->nh_tclassid;
#endif
+ if (!(rt->dst.flags & DST_HOST))
+ rt_cache_route(nh, rt);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1966,10 +1256,10 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
- bool nopolicy, bool noxfrm)
+ bool nopolicy, bool noxfrm, bool will_cache)
{
- return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
- DST_HOST |
+ return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
}
@@ -1978,7 +1268,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, int our)
{
- unsigned int hash;
struct rtable *rth;
struct in_device *in_dev = __in_dev_get_rcu(dev);
u32 itag = 0;
@@ -2007,7 +1296,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto e_err;
}
rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
if (!rth)
goto e_nobufs;
@@ -2016,21 +1305,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
#endif
rth->dst.output = ip_rt_bug;
- rth->rt_key_dst = daddr;
- rth->rt_key_src = saddr;
rth->rt_genid = rt_genid(dev_net(dev));
rth->rt_flags = RTCF_MULTICAST;
rth->rt_type = RTN_MULTICAST;
- rth->rt_key_tos = tos;
- rth->rt_dst = daddr;
- rth->rt_src = saddr;
- rth->rt_route_iif = dev->ifindex;
+ rth->rt_is_input= 1;
rth->rt_iif = dev->ifindex;
- rth->rt_oif = 0;
- rth->rt_mark = skb->mark;
rth->rt_pmtu = 0;
- rth->rt_gateway = daddr;
- rth->fi = NULL;
+ rth->rt_gateway = 0;
if (our) {
rth->dst.input= ip_local_deliver;
rth->rt_flags |= RTCF_LOCAL;
@@ -2042,9 +1323,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
#endif
RT_CACHE_STAT_INC(in_slow_mc);
- hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
- rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
- return IS_ERR(rth) ? PTR_ERR(rth) : 0;
+ skb_dst_set(skb, &rth->dst);
+ return 0;
e_nobufs:
return -ENOBUFS;
@@ -2091,6 +1371,7 @@ static int __mkroute_input(struct sk_buff *skb,
int err;
struct in_device *out_dev;
unsigned int flags = 0;
+ bool do_cache;
u32 itag;
/* get a working reference to the output device */
@@ -2133,35 +1414,39 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
+ do_cache = false;
+ if (res->fi) {
+ if (!(flags & RTCF_DIRECTSRC) && !itag) {
+ rth = FIB_RES_NH(*res).nh_rth_input;
+ if (rt_cache_valid(rth)) {
+ dst_hold(&rth->dst);
+ goto out;
+ }
+ do_cache = true;
+ }
+ }
+
rth = rt_dst_alloc(out_dev->dev,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(out_dev, NOXFRM));
+ IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
- rth->rt_key_dst = daddr;
- rth->rt_key_src = saddr;
rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
rth->rt_flags = flags;
rth->rt_type = res->type;
- rth->rt_key_tos = tos;
- rth->rt_dst = daddr;
- rth->rt_src = saddr;
- rth->rt_route_iif = in_dev->dev->ifindex;
+ rth->rt_is_input = 1;
rth->rt_iif = in_dev->dev->ifindex;
- rth->rt_oif = 0;
- rth->rt_mark = skb->mark;
rth->rt_pmtu = 0;
- rth->rt_gateway = daddr;
- rth->fi = NULL;
+ rth->rt_gateway = 0;
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
- rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
-
+ rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
+out:
*result = rth;
err = 0;
cleanup:
@@ -2176,7 +1461,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
{
struct rtable *rth = NULL;
int err;
- unsigned int hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1)
@@ -2188,12 +1472,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
if (err)
return err;
- /* put it into the cache */
- hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
- rt_genid(dev_net(rth->dst.dev)));
- rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
- if (IS_ERR(rth))
- return PTR_ERR(rth);
+ skb_dst_set(skb, &rth->dst);
return 0;
}
@@ -2217,9 +1496,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
unsigned int flags = 0;
u32 itag = 0;
struct rtable *rth;
- unsigned int hash;
int err = -EINVAL;
struct net *net = dev_net(dev);
+ bool do_cache;
/* IP on this device is disabled. */
@@ -2233,6 +1512,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
+ res.fi = NULL;
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
@@ -2308,8 +1588,20 @@ brd_input:
RT_CACHE_STAT_INC(in_brd);
local_input:
+ do_cache = false;
+ if (res.fi) {
+ if (!(flags & RTCF_DIRECTSRC) && !itag) {
+ rth = FIB_RES_NH(res).nh_rth_input;
+ if (rt_cache_valid(rth)) {
+ dst_hold(&rth->dst);
+ goto set_and_out;
+ }
+ do_cache = true;
+ }
+ }
+
rth = rt_dst_alloc(net->loopback_dev,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
@@ -2319,31 +1611,23 @@ local_input:
rth->dst.tclassid = itag;
#endif
- rth->rt_key_dst = daddr;
- rth->rt_key_src = saddr;
rth->rt_genid = rt_genid(net);
rth->rt_flags = flags|RTCF_LOCAL;
rth->rt_type = res.type;
- rth->rt_key_tos = tos;
- rth->rt_dst = daddr;
- rth->rt_src = saddr;
- rth->rt_route_iif = dev->ifindex;
+ rth->rt_is_input = 1;
rth->rt_iif = dev->ifindex;
- rth->rt_oif = 0;
- rth->rt_mark = skb->mark;
rth->rt_pmtu = 0;
- rth->rt_gateway = daddr;
- rth->fi = NULL;
+ rth->rt_gateway = 0;
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
- hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
- rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
+ if (do_cache)
+ rt_cache_route(&FIB_RES_NH(res), rth);
+set_and_out:
+ skb_dst_set(skb, &rth->dst);
err = 0;
- if (IS_ERR(rth))
- err = PTR_ERR(rth);
goto out;
no_route:
@@ -2379,49 +1663,13 @@ martian_source_keep_err:
goto out;
}
-int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev, bool noref)
+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev)
{
- struct rtable *rth;
- unsigned int hash;
- int iif = dev->ifindex;
- struct net *net;
int res;
- net = dev_net(dev);
-
rcu_read_lock();
- if (!rt_caching(net))
- goto skip_cache;
-
- tos &= IPTOS_RT_MASK;
- hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->dst.rt_next)) {
- if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
- ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
- (rth->rt_route_iif ^ iif) |
- (rth->rt_key_tos ^ tos)) == 0 &&
- rth->rt_mark == skb->mark &&
- net_eq(dev_net(rth->dst.dev), net) &&
- !rt_is_expired(rth)) {
- if (noref) {
- dst_use_noref(&rth->dst, jiffies);
- skb_dst_set_noref(skb, &rth->dst);
- } else {
- dst_use(&rth->dst, jiffies);
- skb_dst_set(skb, &rth->dst);
- }
- RT_CACHE_STAT_INC(in_hit);
- rcu_read_unlock();
- return 0;
- }
- RT_CACHE_STAT_INC(in_hlist_search);
- }
-
-skip_cache:
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
hardware multicast filters :-( As result the host on multicasting
@@ -2459,17 +1707,16 @@ skip_cache:
rcu_read_unlock();
return res;
}
-EXPORT_SYMBOL(ip_route_input_common);
+EXPORT_SYMBOL(ip_route_input);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
- const struct flowi4 *fl4,
- __be32 orig_daddr, __be32 orig_saddr,
- int orig_oif, __u8 orig_rtos,
+ const struct flowi4 *fl4, int orig_oif,
struct net_device *dev_out,
unsigned int flags)
{
struct fib_info *fi = res->fi;
+ struct fib_nh_exception *fnhe;
struct in_device *in_dev;
u16 type = res->type;
struct rtable *rth;
@@ -2508,29 +1755,33 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
fi = NULL;
}
+ fnhe = NULL;
+ if (fi) {
+ fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
+ if (!fnhe) {
+ rth = FIB_RES_NH(*res).nh_rth_output;
+ if (rt_cache_valid(rth)) {
+ dst_hold(&rth->dst);
+ return rth;
+ }
+ }
+ }
rth = rt_dst_alloc(dev_out,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(in_dev, NOXFRM));
+ IN_DEV_CONF_GET(in_dev, NOXFRM),
+ fi && !fnhe);
if (!rth)
return ERR_PTR(-ENOBUFS);
rth->dst.output = ip_output;
- rth->rt_key_dst = orig_daddr;
- rth->rt_key_src = orig_saddr;
rth->rt_genid = rt_genid(dev_net(dev_out));
rth->rt_flags = flags;
rth->rt_type = type;
- rth->rt_key_tos = orig_rtos;
- rth->rt_dst = fl4->daddr;
- rth->rt_src = fl4->saddr;
- rth->rt_route_iif = 0;
+ rth->rt_is_input = 0;
rth->rt_iif = orig_oif ? : dev_out->ifindex;
- rth->rt_oif = orig_oif;
- rth->rt_mark = fl4->flowi4_mark;
rth->rt_pmtu = 0;
- rth->rt_gateway = fl4->daddr;
- rth->fi = NULL;
+ rth->rt_gateway = 0;
RT_CACHE_STAT_INC(out_slow_tot);
@@ -2553,36 +1804,28 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
#endif
}
- rt_set_nexthop(rth, fl4, res, fi, type, 0);
-
- if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
- rth->dst.flags |= DST_NOCACHE;
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
return rth;
}
/*
* Major route resolver routine.
- * called with rcu_read_lock();
*/
-static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
- __be32 orig_daddr;
- __be32 orig_saddr;
int orig_oif;
res.tclassid = 0;
res.fi = NULL;
res.table = NULL;
- orig_daddr = fl4->daddr;
- orig_saddr = fl4->saddr;
orig_oif = fl4->flowi4_oif;
fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -2744,59 +1987,12 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
make_route:
- rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
- tos, dev_out, flags);
- if (!IS_ERR(rth)) {
- unsigned int hash;
-
- hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
- rt_genid(dev_net(dev_out)));
- rth = rt_intern_hash(hash, rth, NULL, orig_oif);
- }
+ rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
out:
rcu_read_unlock();
return rth;
}
-
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
-{
- struct rtable *rth;
- unsigned int hash;
-
- if (!rt_caching(net))
- goto slow_output;
-
- hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
-
- rcu_read_lock_bh();
- for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference_bh(rth->dst.rt_next)) {
- if (rth->rt_key_dst == flp4->daddr &&
- rth->rt_key_src == flp4->saddr &&
- rt_is_output_route(rth) &&
- rth->rt_oif == flp4->flowi4_oif &&
- rth->rt_mark == flp4->flowi4_mark &&
- !((rth->rt_key_tos ^ flp4->flowi4_tos) &
- (IPTOS_RT_MASK | RTO_ONLINK)) &&
- net_eq(dev_net(rth->dst.dev), net) &&
- !rt_is_expired(rth)) {
- dst_use(&rth->dst, jiffies);
- RT_CACHE_STAT_INC(out_hit);
- rcu_read_unlock_bh();
- if (!flp4->saddr)
- flp4->saddr = rth->rt_src;
- if (!flp4->daddr)
- flp4->daddr = rth->rt_dst;
- return rth;
- }
- RT_CACHE_STAT_INC(out_hlist_search);
- }
- rcu_read_unlock_bh();
-
-slow_output:
- return ip_route_output_slow(net, flp4);
-}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -2830,7 +2026,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
static struct dst_ops ipv4_dst_blackhole_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
- .destroy = ipv4_dst_destroy,
.check = ipv4_blackhole_dst_check,
.mtu = ipv4_blackhole_mtu,
.default_advmss = ipv4_default_advmss,
@@ -2842,9 +2037,10 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
- struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
struct rtable *ort = (struct rtable *) dst_orig;
+ struct rtable *rt;
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
if (rt) {
struct dst_entry *new = &rt->dst;
@@ -2856,24 +2052,14 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
if (new->dev)
dev_hold(new->dev);
- rt->rt_key_dst = ort->rt_key_dst;
- rt->rt_key_src = ort->rt_key_src;
- rt->rt_key_tos = ort->rt_key_tos;
- rt->rt_route_iif = ort->rt_route_iif;
+ rt->rt_is_input = ort->rt_is_input;
rt->rt_iif = ort->rt_iif;
- rt->rt_oif = ort->rt_oif;
- rt->rt_mark = ort->rt_mark;
rt->rt_pmtu = ort->rt_pmtu;
rt->rt_genid = rt_genid(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
- rt->rt_dst = ort->rt_dst;
- rt->rt_src = ort->rt_src;
rt->rt_gateway = ort->rt_gateway;
- rt->fi = ort->fi;
- if (rt->fi)
- atomic_inc(&rt->fi->fib_clntref);
dst_free(new);
}
@@ -2900,9 +2086,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
-static int rt_fill_info(struct net *net,
- struct sk_buff *skb, u32 pid, u32 seq, int event,
- int nowait, unsigned int flags)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+ struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
+ u32 seq, int event, int nowait, unsigned int flags)
{
struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
@@ -2919,7 +2105,7 @@ static int rt_fill_info(struct net *net,
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = rt->rt_key_tos;
+ r->rtm_tos = fl4->flowi4_tos;
r->rtm_table = RT_TABLE_MAIN;
if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
goto nla_put_failure;
@@ -2930,11 +2116,11 @@ static int rt_fill_info(struct net *net,
if (rt->rt_flags & RTCF_NOTIFY)
r->rtm_flags |= RTM_F_NOTIFY;
- if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
+ if (nla_put_be32(skb, RTA_DST, dst))
goto nla_put_failure;
- if (rt->rt_key_src) {
+ if (src) {
r->rtm_src_len = 32;
- if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+ if (nla_put_be32(skb, RTA_SRC, src))
goto nla_put_failure;
}
if (rt->dst.dev &&
@@ -2946,11 +2132,11 @@ static int rt_fill_info(struct net *net,
goto nla_put_failure;
#endif
if (!rt_is_input_route(rt) &&
- rt->rt_src != rt->rt_key_src) {
- if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
+ fl4->saddr != src) {
+ if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
}
- if (rt->rt_dst != rt->rt_gateway &&
+ if (rt->rt_gateway &&
nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
goto nla_put_failure;
@@ -2960,8 +2146,8 @@ static int rt_fill_info(struct net *net,
if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- if (rt->rt_mark &&
- nla_put_be32(skb, RTA_MARK, rt->rt_mark))
+ if (fl4->flowi4_mark &&
+ nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
goto nla_put_failure;
error = rt->dst.error;
@@ -2974,29 +2160,8 @@ static int rt_fill_info(struct net *net,
}
if (rt_is_input_route(rt)) {
-#ifdef CONFIG_IP_MROUTE
- __be32 dst = rt->rt_dst;
-
- if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
- IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
- int err = ipmr_get_route(net, skb,
- rt->rt_src, rt->rt_dst,
- r, nowait);
- if (err <= 0) {
- if (!nowait) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- } else {
- if (err == -EMSGSIZE)
- goto nla_put_failure;
- error = err;
- }
- }
- } else
-#endif
- if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
- goto nla_put_failure;
+ if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+ goto nla_put_failure;
}
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
@@ -3015,6 +2180,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
struct rtable *rt = NULL;
+ struct flowi4 fl4;
__be32 dst = 0;
__be32 src = 0;
u32 iif;
@@ -3049,6 +2215,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = dst;
+ fl4.saddr = src;
+ fl4.flowi4_tos = rtm->rtm_tos;
+ fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+ fl4.flowi4_mark = mark;
+
if (iif) {
struct net_device *dev;
@@ -3069,13 +2242,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
if (err == 0 && rt->dst.error)
err = -rt->dst.error;
} else {
- struct flowi4 fl4 = {
- .daddr = dst,
- .saddr = src,
- .flowi4_tos = rtm->rtm_tos,
- .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
- .flowi4_mark = mark,
- };
rt = ip_route_output_key(net, &fl4);
err = 0;
@@ -3090,7 +2256,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+ err = rt_fill_info(net, dst, src, &fl4, skb,
+ NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
RTM_NEWROUTE, 0, 0);
if (err <= 0)
goto errout_free;
@@ -3106,43 +2273,6 @@ errout_free:
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct rtable *rt;
- int h, s_h;
- int idx, s_idx;
- struct net *net;
-
- net = sock_net(skb->sk);
-
- s_h = cb->args[0];
- if (s_h < 0)
- s_h = 0;
- s_idx = idx = cb->args[1];
- for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
- if (!rt_hash_table[h].chain)
- continue;
- rcu_read_lock_bh();
- for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
- rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
- if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
- continue;
- if (rt_is_expired(rt))
- continue;
- skb_dst_set_noref(skb, &rt->dst);
- if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- 1, NLM_F_MULTI) <= 0) {
- skb_dst_drop(skb);
- rcu_read_unlock_bh();
- goto done;
- }
- skb_dst_drop(skb);
- }
- rcu_read_unlock_bh();
- }
-
-done:
- cb->args[0] = h;
- cb->args[1] = idx;
return skb->len;
}
@@ -3376,22 +2506,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
-{
- ssize_t ret;
-
- if (!str)
- return 0;
-
- ret = kstrtoul(str, 0, &rhash_entries);
- if (ret)
- return 0;
-
- return 1;
-}
-__setup("rhash_entries=", set_rhash_entries);
-
int __init ip_rt_init(void)
{
int rc = 0;
@@ -3414,31 +2528,12 @@ int __init ip_rt_init(void)
if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
- rt_hash_table = (struct rt_hash_bucket *)
- alloc_large_system_hash("IP route cache",
- sizeof(struct rt_hash_bucket),
- rhash_entries,
- (totalram_pages >= 128 * 1024) ?
- 15 : 17,
- 0,
- &rt_hash_log,
- &rt_hash_mask,
- 0,
- rhash_entries ? 0 : 512 * 1024);
- memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
- rt_hash_lock_init();
-
- ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
- ip_rt_max_size = (rt_hash_mask + 1) * 16;
+ ipv4_dst_ops.gc_thresh = ~0;
+ ip_rt_max_size = INT_MAX;
devinet_init();
ip_fib_init();
- INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
- expires_ljiffies = jiffies;
- schedule_delayed_work(&expires_work,
- net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
if (ip_rt_proc_init())
pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM