From 89aef8921bfbac22f00e04f8450f6e447db13e42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 11:00:09 -0700 Subject: ipv4: Delete routing cache. The ipv4 routing cache is non-deterministic, performance wise, and is subject to reasonably easy to launch denial of service attacks. The routing cache works great for well behaved traffic, and the world was a much friendlier place when the tradeoffs that led to the routing cache's design were considered. What it boils down to is that the performance of the routing cache is a product of the traffic patterns seen by a system rather than being a product of the contents of the routing tables. The former of which is controllable by external entitites. Even for "well behaved" legitimate traffic, high volume sites can see hit rates in the routing cache of only ~%10. Signed-off-by: David S. Miller --- include/net/route.h | 1 - net/ipv4/fib_frontend.c | 5 - net/ipv4/route.c | 940 +----------------------------------------------- 3 files changed, 13 insertions(+), 933 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index ace3cb442519..5dcfeb621e06 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -109,7 +109,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); extern void rt_cache_flush(struct net *net, int how); -extern void rt_cache_flush_batch(struct net *net); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b83203658ee3..f277cf0e6321 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1072,11 +1072,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo rt_cache_flush(dev_net(dev), 0); break; case NETDEV_UNREGISTER_BATCH: - /* The batch unregister is only called on the first - * device in the list of devices being unregistered. - * Therefore we should not pass dev_net(dev) in here. - */ - rt_cache_flush_batch(NULL); break; } return NOTIFY_DONE; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d547f6fae20d..6d6146d31f22 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; -static int rt_chain_length_max __read_mostly = 20; - -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; /* * Interface to generic destination cache. @@ -152,7 +148,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt_garbage_collect(struct dst_ops *ops); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) @@ -172,7 +167,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), - .gc = rt_garbage_collect, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, @@ -209,184 +203,30 @@ const __u8 ip_tos2prio[16] = { }; EXPORT_SYMBOL(ip_tos2prio); -/* - * Route cache. - */ - -/* The locking scheme is rather straight forward: - * - * 1) Read-Copy Update protects the buckets of the central route hash. - * 2) Only writers remove entries, and they hold the lock - * as they look at rtable reference counts. - * 3) Only readers acquire references to rtable entries, - * they do so with atomic increments and with the - * lock held. - */ - -struct rt_hash_bucket { - struct rtable __rcu *chain; -}; - -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) -/* - * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks - * The size of this table is a power of two and depends on the number of CPUS. - * (on lockdep we have a quite big spinlock_t, so keep the size down there) - */ -#ifdef CONFIG_LOCKDEP -# define RT_HASH_LOCK_SZ 256 -#else -# if NR_CPUS >= 32 -# define RT_HASH_LOCK_SZ 4096 -# elif NR_CPUS >= 16 -# define RT_HASH_LOCK_SZ 2048 -# elif NR_CPUS >= 8 -# define RT_HASH_LOCK_SZ 1024 -# elif NR_CPUS >= 4 -# define RT_HASH_LOCK_SZ 512 -# else -# define RT_HASH_LOCK_SZ 256 -# endif -#endif - -static spinlock_t *rt_hash_locks; -# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] - -static __init void rt_hash_lock_init(void) -{ - int i; - - rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, - GFP_KERNEL); - if (!rt_hash_locks) - panic("IP: failed to allocate rt_hash_locks\n"); - - for (i = 0; i < RT_HASH_LOCK_SZ; i++) - spin_lock_init(&rt_hash_locks[i]); -} -#else -# define rt_hash_lock_addr(slot) NULL - -static inline void rt_hash_lock_init(void) -{ -} -#endif - -static struct rt_hash_bucket *rt_hash_table __read_mostly; -static unsigned int rt_hash_mask __read_mostly; -static unsigned int rt_hash_log __read_mostly; - static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, - int genid) -{ - return jhash_3words((__force u32)daddr, (__force u32)saddr, - idx, genid) - & rt_hash_mask; -} - static inline int rt_genid(struct net *net) { return atomic_read(&net->ipv4.rt_genid); } #ifdef CONFIG_PROC_FS -struct rt_cache_iter_state { - struct seq_net_private p; - int bucket; - int genid; -}; - -static struct rtable *rt_cache_get_first(struct seq_file *seq) -{ - struct rt_cache_iter_state *st = seq->private; - struct rtable *r = NULL; - - for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rcu_access_pointer(rt_hash_table[st->bucket].chain)) - continue; - rcu_read_lock_bh(); - r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); - while (r) { - if (dev_net(r->dst.dev) == seq_file_net(seq) && - r->rt_genid == st->genid) - return r; - r = rcu_dereference_bh(r->dst.rt_next); - } - rcu_read_unlock_bh(); - } - return r; -} - -static struct rtable *__rt_cache_get_next(struct seq_file *seq, - struct rtable *r) -{ - struct rt_cache_iter_state *st = seq->private; - - r = rcu_dereference_bh(r->dst.rt_next); - while (!r) { - rcu_read_unlock_bh(); - do { - if (--st->bucket < 0) - return NULL; - } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain)); - rcu_read_lock_bh(); - r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); - } - return r; -} - -static struct rtable *rt_cache_get_next(struct seq_file *seq, - struct rtable *r) -{ - struct rt_cache_iter_state *st = seq->private; - while ((r = __rt_cache_get_next(seq, r)) != NULL) { - if (dev_net(r->dst.dev) != seq_file_net(seq)) - continue; - if (r->rt_genid == st->genid) - break; - } - return r; -} - -static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) -{ - struct rtable *r = rt_cache_get_first(seq); - - if (r) - while (pos && (r = rt_cache_get_next(seq, r))) - --pos; - return pos ? NULL : r; -} - static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { - struct rt_cache_iter_state *st = seq->private; if (*pos) - return rt_cache_get_idx(seq, *pos - 1); - st->genid = rt_genid(seq_file_net(seq)); + return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct rtable *r; - - if (v == SEQ_START_TOKEN) - r = rt_cache_get_first(seq); - else - r = rt_cache_get_next(seq, v); ++*pos; - return r; + return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { - if (v && v != SEQ_START_TOKEN) - rcu_read_unlock_bh(); } static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -396,24 +236,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); - else { - struct rtable *r = v; - int len; - - seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" - "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->dst.dev ? r->dst.dev->name : "*", - (__force u32)r->rt_dst, - (__force u32)r->rt_gateway, - r->rt_flags, atomic_read(&r->dst.__refcnt), - r->dst.__use, 0, (__force u32)r->rt_src, - dst_metric_advmss(&r->dst) + 40, - dst_metric(&r->dst, RTAX_WINDOW), 0, - r->rt_key_tos, - -1, 0, 0, &len); - - seq_printf(seq, "%*s\n", 127 - len, ""); - } return 0; } @@ -426,8 +248,7 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &rt_cache_seq_ops, - sizeof(struct rt_cache_iter_state)); + return seq_open(file, &rt_cache_seq_ops); } static const struct file_operations rt_cache_seq_fops = { @@ -435,7 +256,7 @@ static const struct file_operations rt_cache_seq_fops = { .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_net, + .release = seq_release, }; @@ -625,262 +446,11 @@ static inline int ip_rt_proc_init(void) } #endif /* CONFIG_PROC_FS */ -static inline void rt_free(struct rtable *rt) -{ - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - -static inline void rt_drop(struct rtable *rt) -{ - ip_rt_put(rt); - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - -static inline int rt_fast_clean(struct rtable *rth) -{ - /* Kill broadcast/multicast entries very aggresively, if they - collide in hash table with more useful entries */ - return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rt_is_input_route(rth) && rth->dst.rt_next; -} - -static inline int rt_valuable(struct rtable *rth) -{ - return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->dst.expires; -} - -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) -{ - unsigned long age; - int ret = 0; - - if (atomic_read(&rth->dst.__refcnt)) - goto out; - - age = jiffies - rth->dst.lastuse; - if ((age <= tmo1 && !rt_fast_clean(rth)) || - (age <= tmo2 && rt_valuable(rth))) - goto out; - ret = 1; -out: return ret; -} - -/* Bits of score are: - * 31: very valuable - * 30: not quite useless - * 29..0: usage counter - */ -static inline u32 rt_score(struct rtable *rt) -{ - u32 score = jiffies - rt->dst.lastuse; - - score = ~score & ~(3<<30); - - if (rt_valuable(rt)) - score |= (1<<31); - - if (rt_is_output_route(rt) || - !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) - score |= (1<<30); - - return score; -} - -static inline bool rt_caching(const struct net *net) -{ - return net->ipv4.current_rt_cache_rebuild_count <= - net->ipv4.sysctl_rt_cache_rebuild_count; -} - -static inline bool compare_hash_inputs(const struct rtable *rt1, - const struct rtable *rt2) -{ - return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | - ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | - (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); -} - -static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) -{ - return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | - ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | - (rt1->rt_mark ^ rt2->rt_mark) | - (rt1->rt_key_tos ^ rt2->rt_key_tos) | - (rt1->rt_route_iif ^ rt2->rt_route_iif) | - (rt1->rt_oif ^ rt2->rt_oif)) == 0; -} - -static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) -{ - return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); -} - static inline int rt_is_expired(struct rtable *rth) { return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } -/* - * Perform a full scan of hash table and free all entries. - * Can be called by a softirq or a process. - * In the later case, we want to be reschedule if necessary - */ -static void rt_do_flush(struct net *net, int process_context) -{ - unsigned int i; - struct rtable *rth, *next; - - for (i = 0; i <= rt_hash_mask; i++) { - struct rtable __rcu **pprev; - struct rtable *list; - - if (process_context && need_resched()) - cond_resched(); - rth = rcu_access_pointer(rt_hash_table[i].chain); - if (!rth) - continue; - - spin_lock_bh(rt_hash_lock_addr(i)); - - list = NULL; - pprev = &rt_hash_table[i].chain; - rth = rcu_dereference_protected(*pprev, - lockdep_is_held(rt_hash_lock_addr(i))); - - while (rth) { - next = rcu_dereference_protected(rth->dst.rt_next, - lockdep_is_held(rt_hash_lock_addr(i))); - - if (!net || - net_eq(dev_net(rth->dst.dev), net)) { - rcu_assign_pointer(*pprev, next); - rcu_assign_pointer(rth->dst.rt_next, list); - list = rth; - } else { - pprev = &rth->dst.rt_next; - } - rth = next; - } - - spin_unlock_bh(rt_hash_lock_addr(i)); - - for (; list; list = next) { - next = rcu_dereference_protected(list->dst.rt_next, 1); - rt_free(list); - } - } -} - -/* - * While freeing expired entries, we compute average chain length - * and standard deviation, using fixed-point arithmetic. - * This to have an estimation of rt_chain_length_max - * rt_chain_length_max = max(elasticity, AVG + 4*SD) - * We use 3 bits for frational part, and 29 (or 61) for magnitude. - */ - -#define FRACT_BITS 3 -#define ONE (1UL << FRACT_BITS) - -/* - * Given a hash chain and an item in this hash chain, - * find if a previous entry has the same hash_inputs - * (but differs on tos, mark or oif) - * Returns 0 if an alias is found. - * Returns ONE if rth has no alias before itself. - */ -static int has_noalias(const struct rtable *head, const struct rtable *rth) -{ - const struct rtable *aux = head; - - while (aux != rth) { - if (compare_hash_inputs(aux, rth)) - return 0; - aux = rcu_dereference_protected(aux->dst.rt_next, 1); - } - return ONE; -} - -static void rt_check_expire(void) -{ - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (rcu_dereference_raw(*rthp) == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { - prefetch(rth->dst.rt_next); - if (rt_is_expired(rth) || - rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - - /* We only count entries on a chain with equal - * hash inputs once so that entries for - * different QOS levels, and other non-hash - * input attributes don't unfairly skew the - * length computation - */ - tmo >>= 1; - rthp = &rth->dst.rt_next; - length += has_noalias(rt_hash_table[i].chain, rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; -} - -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} - /* * Perturbation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() @@ -902,167 +472,6 @@ static void rt_cache_invalidate(struct net *net) void rt_cache_flush(struct net *net, int delay) { rt_cache_invalidate(net); - if (delay >= 0) - rt_do_flush(net, !in_softirq()); -} - -/* Flush previous cache invalidated entries from the cache */ -void rt_cache_flush_batch(struct net *net) -{ - rt_do_flush(net, !in_softirq()); -} - -static void rt_emergency_hash_rebuild(struct net *net) -{ - net_warn_ratelimited("Route hash chain too long!\n"); - rt_cache_invalidate(net); -} - -/* - Short description of GC goals. - - We want to build algorithm, which will keep routing cache - at some equilibrium point, when number of aged off entries - is kept approximately equal to newly generated ones. - - Current expiration strength is variable "expire". - We try to adjust it dynamically, so that if networking - is idle expires is large enough to keep enough of warm entries, - and when load increases it reduces to limit cache size. - */ - -static int rt_garbage_collect(struct dst_ops *ops) -{ - static unsigned long expire = RT_GC_TIMEOUT; - static unsigned long last_gc; - static int rover; - static int equilibrium; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long now = jiffies; - int goal; - int entries = dst_entries_get_fast(&ipv4_dst_ops); - - /* - * Garbage collection is pretty expensive, - * do not make it too frequently. - */ - - RT_CACHE_STAT_INC(gc_total); - - if (now - last_gc < ip_rt_gc_min_interval && - entries < ip_rt_max_size) { - RT_CACHE_STAT_INC(gc_ignored); - goto out; - } - - entries = dst_entries_get_slow(&ipv4_dst_ops); - /* Calculate number of entries, which we want to expire now. */ - goal = entries - (ip_rt_gc_elasticity << rt_hash_log); - if (goal <= 0) { - if (equilibrium < ipv4_dst_ops.gc_thresh) - equilibrium = ipv4_dst_ops.gc_thresh; - goal = entries - equilibrium; - if (goal > 0) { - equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); - goal = entries - equilibrium; - } - } else { - /* We are in dangerous area. Try to reduce cache really - * aggressively. - */ - goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); - equilibrium = entries - goal; - } - - if (now - last_gc >= ip_rt_gc_min_interval) - last_gc = now; - - if (goal <= 0) { - equilibrium += goal; - goto work_done; - } - - do { - int i, k; - - for (i = rt_hash_mask, k = rover; i >= 0; i--) { - unsigned long tmo = expire; - - k = (k + 1) & rt_hash_mask; - rthp = &rt_hash_table[k].chain; - spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { - if (!rt_is_expired(rth) && - !rt_may_expire(rth, tmo, expire)) { - tmo >>= 1; - rthp = &rth->dst.rt_next; - continue; - } - *rthp = rth->dst.rt_next; - rt_free(rth); - goal--; - } - spin_unlock_bh(rt_hash_lock_addr(k)); - if (goal <= 0) - break; - } - rover = k; - - if (goal <= 0) - goto work_done; - - /* Goal is not achieved. We stop process if: - - - if expire reduced to zero. Otherwise, expire is halfed. - - if table is not full. - - if we are called from interrupt. - - jiffies check is just fallback/debug loop breaker. - We will not spin here for long time in any case. - */ - - RT_CACHE_STAT_INC(gc_goal_miss); - - if (expire == 0) - break; - - expire >>= 1; - - if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - } while (!in_softirq() && time_before_eq(jiffies, now)); - - if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - net_warn_ratelimited("dst cache overflow\n"); - RT_CACHE_STAT_INC(gc_dst_overflow); - return 1; - -work_done: - expire += ip_rt_gc_min_interval; - if (expire > ip_rt_gc_timeout || - dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || - dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) - expire = ip_rt_gc_timeout; -out: return 0; -} - -/* - * Returns number of entries in a hash chain that have different hash_inputs - */ -static int slow_chain_length(const struct rtable *head) -{ - int length = 0; - const struct rtable *rth = head; - - while (rth) { - length += has_noalias(head, rth); - rth = rcu_dereference_protected(rth->dst.rt_next, 1); - } - return length >> FRACT_BITS; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -1086,139 +495,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } -static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, - struct sk_buff *skb, int ifindex) -{ - struct rtable *rth, *cand; - struct rtable __rcu **rthp, **candp; - unsigned long now; - u32 min_score; - int chain_length; - -restart: - chain_length = 0; - min_score = ~(u32)0; - cand = NULL; - candp = NULL; - now = jiffies; - - if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) { - /* - * If we're not caching, just tell the caller we - * were successful and don't touch the route. The - * caller hold the sole reference to the cache entry, and - * it will be released when the caller is done with it. - * If we drop it here, the callers have no way to resolve routes - * when we're not caching. Instead, just point *rp at rt, so - * the caller gets a single use out of the route - * Note that we do rt_free on this new route entry, so that - * once its refcount hits zero, we are still able to reap it - * (Thanks Alexey) - * Note: To avoid expensive rcu stuff for this uncached dst, - * we set DST_NOCACHE so that dst_release() can free dst without - * waiting a grace period. - */ - - rt->dst.flags |= DST_NOCACHE; - goto skip_hashing; - } - - rthp = &rt_hash_table[hash].chain; - - spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (compare_keys(rth, rt) && compare_netns(rth, rt)) { - /* Put it first */ - *rthp = rth->dst.rt_next; - /* - * Since lookup is lockfree, the deletion - * must be visible to another weakly ordered CPU before - * the insertion at the start of the hash chain. - */ - rcu_assign_pointer(rth->dst.rt_next, - rt_hash_table[hash].chain); - /* - * Since lookup is lockfree, the update writes - * must be ordered for consistency on SMP. - */ - rcu_assign_pointer(rt_hash_table[hash].chain, rth); - - dst_use(&rth->dst, now); - spin_unlock_bh(rt_hash_lock_addr(hash)); - - rt_drop(rt); - if (skb) - skb_dst_set(skb, &rth->dst); - return rth; - } - - if (!atomic_read(&rth->dst.__refcnt)) { - u32 score = rt_score(rth); - - if (score <= min_score) { - cand = rth; - candp = rthp; - min_score = score; - } - } - - chain_length++; - - rthp = &rth->dst.rt_next; - } - - if (cand) { - /* ip_rt_gc_elasticity used to be average length of chain - * length, when exceeded gc becomes really aggressive. - * - * The second limit is less certain. At the moment it allows - * only 2 entries per bucket. We will see. - */ - if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->dst.rt_next; - rt_free(cand); - } - } else { - if (chain_length > rt_chain_length_max && - slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { - struct net *net = dev_net(rt->dst.dev); - int num = ++net->ipv4.current_rt_cache_rebuild_count; - if (!rt_caching(net)) { - pr_warn("%s: %d rebuilds is over limit, route caching disabled\n", - rt->dst.dev->name, num); - } - rt_emergency_hash_rebuild(net); - spin_unlock_bh(rt_hash_lock_addr(hash)); - - hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, - ifindex, rt_genid(net)); - goto restart; - } - } - - rt->dst.rt_next = rt_hash_table[hash].chain; - - /* - * Since lookup is lockfree, we must make sure - * previous writes to rt are committed to memory - * before making rt visible to other CPUS. - */ - rcu_assign_pointer(rt_hash_table[hash].chain, rt); - - spin_unlock_bh(rt_hash_lock_addr(hash)); - -skip_hashing: - if (skb) - skb_dst_set(skb, &rt->dst); - return rt; -} - /* * Peer allocation may fail only in serious out-of-memory conditions. However * we still can generate some output. @@ -1255,26 +531,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) } EXPORT_SYMBOL(__ip_select_ident); -static void rt_del(unsigned int hash, struct rtable *rt) -{ - struct rtable __rcu **rthp; - struct rtable *aux; - - rthp = &rt_hash_table[hash].chain; - spin_lock_bh(rt_hash_lock_addr(hash)); - ip_rt_put(rt); - while ((aux = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { - if (aux == rt || rt_is_expired(aux)) { - *rthp = aux->dst.rt_next; - rt_free(aux); - continue; - } - rthp = &aux->dst.rt_next; - } - spin_unlock_bh(rt_hash_lock_addr(hash)); -} - static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, @@ -1518,10 +774,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || rt->dst.expires) { - unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, - rt->rt_oif, - rt_genid(dev_net(dst->dev))); - rt_del(hash, rt); + ip_rt_put(rt); ret = NULL; } } @@ -1969,7 +1222,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, bool nopolicy, bool noxfrm) { return dst_alloc(&ipv4_dst_ops, dev, 1, -1, - DST_HOST | + DST_HOST | DST_NOCACHE | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); } @@ -1978,7 +1231,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { - unsigned int hash; struct rtable *rth; struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; @@ -2042,9 +1294,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif RT_CACHE_STAT_INC(in_slow_mc); - hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - rth = rt_intern_hash(hash, rth, skb, dev->ifindex); - return IS_ERR(rth) ? PTR_ERR(rth) : 0; + skb_dst_set(skb, &rth->dst); + return 0; e_nobufs: return -ENOBUFS; @@ -2176,7 +1427,6 @@ static int ip_mkroute_input(struct sk_buff *skb, { struct rtable *rth = NULL; int err; - unsigned int hash; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) @@ -2188,12 +1438,7 @@ static int ip_mkroute_input(struct sk_buff *skb, if (err) return err; - /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl4->flowi4_iif, - rt_genid(dev_net(rth->dst.dev))); - rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); - if (IS_ERR(rth)) - return PTR_ERR(rth); + skb_dst_set(skb, &rth->dst); return 0; } @@ -2217,7 +1462,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, unsigned int flags = 0; u32 itag = 0; struct rtable *rth; - unsigned int hash; int err = -EINVAL; struct net *net = dev_net(dev); @@ -2339,11 +1583,8 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); - rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); + skb_dst_set(skb, &rth->dst); err = 0; - if (IS_ERR(rth)) - err = PTR_ERR(rth); goto out; no_route: @@ -2382,46 +1623,10 @@ martian_source_keep_err: int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, bool noref) { - struct rtable *rth; - unsigned int hash; - int iif = dev->ifindex; - struct net *net; int res; - net = dev_net(dev); - rcu_read_lock(); - if (!rt_caching(net)) - goto skip_cache; - - tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif, rt_genid(net)); - - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->dst.rt_next)) { - if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | - ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | - (rth->rt_route_iif ^ iif) | - (rth->rt_key_tos ^ tos)) == 0 && - rth->rt_mark == skb->mark && - net_eq(dev_net(rth->dst.dev), net) && - !rt_is_expired(rth)) { - if (noref) { - dst_use_noref(&rth->dst, jiffies); - skb_dst_set_noref(skb, &rth->dst); - } else { - dst_use(&rth->dst, jiffies); - skb_dst_set(skb, &rth->dst); - } - RT_CACHE_STAT_INC(in_hit); - rcu_read_unlock(); - return 0; - } - RT_CACHE_STAT_INC(in_hlist_search); - } - -skip_cache: /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2563,10 +1768,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, /* * Major route resolver routine. - * called with rcu_read_lock(); */ -static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2746,57 +1950,11 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) make_route: rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, tos, dev_out, flags); - if (!IS_ERR(rth)) { - unsigned int hash; - - hash = rt_hash(orig_daddr, orig_saddr, orig_oif, - rt_genid(dev_net(dev_out))); - rth = rt_intern_hash(hash, rth, NULL, orig_oif); - } out: rcu_read_unlock(); return rth; } - -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) -{ - struct rtable *rth; - unsigned int hash; - - if (!rt_caching(net)) - goto slow_output; - - hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); - - rcu_read_lock_bh(); - for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; - rth = rcu_dereference_bh(rth->dst.rt_next)) { - if (rth->rt_key_dst == flp4->daddr && - rth->rt_key_src == flp4->saddr && - rt_is_output_route(rth) && - rth->rt_oif == flp4->flowi4_oif && - rth->rt_mark == flp4->flowi4_mark && - !((rth->rt_key_tos ^ flp4->flowi4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK)) && - net_eq(dev_net(rth->dst.dev), net) && - !rt_is_expired(rth)) { - dst_use(&rth->dst, jiffies); - RT_CACHE_STAT_INC(out_hit); - rcu_read_unlock_bh(); - if (!flp4->saddr) - flp4->saddr = rth->rt_src; - if (!flp4->daddr) - flp4->daddr = rth->rt_dst; - return rth; - } - RT_CACHE_STAT_INC(out_hlist_search); - } - rcu_read_unlock_bh(); - -slow_output: - return ip_route_output_slow(net, flp4); -} EXPORT_SYMBOL_GPL(__ip_route_output_key); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) @@ -3106,43 +2264,6 @@ errout_free: int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct rtable *rt; - int h, s_h; - int idx, s_idx; - struct net *net; - - net = sock_net(skb->sk); - - s_h = cb->args[0]; - if (s_h < 0) - s_h = 0; - s_idx = idx = cb->args[1]; - for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { - if (!rt_hash_table[h].chain) - continue; - rcu_read_lock_bh(); - for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { - if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) - continue; - if (rt_is_expired(rt)) - continue; - skb_dst_set_noref(skb, &rt->dst); - if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - 1, NLM_F_MULTI) <= 0) { - skb_dst_drop(skb); - rcu_read_unlock_bh(); - goto done; - } - skb_dst_drop(skb); - } - rcu_read_unlock_bh(); - } - -done: - cb->args[0] = h; - cb->args[1] = idx; return skb->len; } @@ -3376,22 +2497,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ -static __initdata unsigned long rhash_entries; -static int __init set_rhash_entries(char *str) -{ - ssize_t ret; - - if (!str) - return 0; - - ret = kstrtoul(str, 0, &rhash_entries); - if (ret) - return 0; - - return 1; -} -__setup("rhash_entries=", set_rhash_entries); - int __init ip_rt_init(void) { int rc = 0; @@ -3414,31 +2519,12 @@ int __init ip_rt_init(void) if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); - rt_hash_table = (struct rt_hash_bucket *) - alloc_large_system_hash("IP route cache", - sizeof(struct rt_hash_bucket), - rhash_entries, - (totalram_pages >= 128 * 1024) ? - 15 : 17, - 0, - &rt_hash_log, - &rt_hash_mask, - 0, - rhash_entries ? 0 : 512 * 1024); - memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); - rt_hash_lock_init(); - - ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); - ip_rt_max_size = (rt_hash_mask + 1) * 16; + ipv4_dst_ops.gc_thresh = ~0; + ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM -- cgit v1.2.3-58-ga151 From 38a424e4657462fe9f8b76f01a0e879abde99ab4 Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:02:53 +0000 Subject: ipv4: Kill ip_route_input_noref(). The "noref" argument to ip_route_input_common() is now always ignored because we do not cache routes, and in that case we must always grab a reference to the resulting 'dst'. Signed-off-by: David S. Miller --- include/net/route.h | 16 ++-------------- net/ipv4/arp.c | 2 +- net/ipv4/ip_fragment.c | 4 ++-- net/ipv4/ip_input.c | 4 ++-- net/ipv4/route.c | 6 +++--- net/ipv4/xfrm4_input.c | 4 ++-- 6 files changed, 12 insertions(+), 24 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 5dcfeb621e06..5c86c4773b2b 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -160,20 +160,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 return ip_route_output_key(net, fl4); } -extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, bool noref); - -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) -{ - return ip_route_input_common(skb, dst, src, tos, devin, false); -} - -static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) -{ - return ip_route_input_common(skb, dst, src, tos, devin, true); -} +extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin); extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u32 mark, u8 protocol, int flow_flags); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 2e560f0c757d..c38293f38161 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { + ip_route_input(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8d07c973409c..7ad88e5e7110 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg) /* skb dst is stale, drop it, and perform route lookup again */ skb_dst_drop(head); iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev); + err = ip_route_input(head, iph->daddr, iph->saddr, + iph->tos, head->dev); if (err) goto out_rcu_unlock; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b27d4440f523..4ebc6feee250 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -336,8 +336,8 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + int err = ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6d6146d31f22..55eb4634ed60 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1620,8 +1620,8 @@ martian_source_keep_err: goto out; } -int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool noref) +int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) { int res; @@ -1664,7 +1664,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_unlock(); return res; } -EXPORT_SYMBOL(ip_route_input_common); +EXPORT_SYMBOL(ip_route_input); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216dc..58d23a572509 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) if (skb_dst(skb) == NULL) { const struct iphdr *iph = ip_hdr(skb); - if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) goto drop; } return dst_input(skb); -- cgit v1.2.3-58-ga151 From 1a00fee4ffb22312a0ac40045ecd6f222b55eb3d Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:02:56 +0000 Subject: ipv4: Remove rt_key_{src,dst,tos} from struct rtable. They are always used in contexts where they can be reconstituted, or where the finally resolved rt->rt_{src,dst} is semantically equivalent. Signed-off-by: David S. Miller --- include/net/route.h | 5 ----- net/ipv4/route.c | 39 +++++++++------------------------------ net/ipv4/xfrm4_policy.c | 3 --- 3 files changed, 9 insertions(+), 38 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 5c86c4773b2b..935fa59630b0 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -44,14 +44,9 @@ struct fib_info; struct rtable { struct dst_entry dst; - /* Lookup key. */ - __be32 rt_key_dst; - __be32 rt_key_src; - int rt_genid; unsigned int rt_flags; __u16 rt_type; - __u8 rt_key_tos; __be32 rt_dst; /* Path destination */ __be32 rt_src; /* Path source */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 55eb4634ed60..c89d690acdfa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1268,12 +1268,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->dst.output = ip_rt_bug; - rth->rt_key_dst = daddr; - rth->rt_key_src = saddr; rth->rt_genid = rt_genid(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; - rth->rt_key_tos = tos; rth->rt_dst = daddr; rth->rt_src = saddr; rth->rt_route_iif = dev->ifindex; @@ -1392,12 +1389,9 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - rth->rt_key_dst = daddr; - rth->rt_key_src = saddr; rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rth->rt_flags = flags; rth->rt_type = res->type; - rth->rt_key_tos = tos; rth->rt_dst = daddr; rth->rt_src = saddr; rth->rt_route_iif = in_dev->dev->ifindex; @@ -1563,12 +1557,9 @@ local_input: rth->dst.tclassid = itag; #endif - rth->rt_key_dst = daddr; - rth->rt_key_src = saddr; rth->rt_genid = rt_genid(net); rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; - rth->rt_key_tos = tos; rth->rt_dst = daddr; rth->rt_src = saddr; rth->rt_route_iif = dev->ifindex; @@ -1668,9 +1659,7 @@ EXPORT_SYMBOL(ip_route_input); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, - const struct flowi4 *fl4, - __be32 orig_daddr, __be32 orig_saddr, - int orig_oif, __u8 orig_rtos, + const struct flowi4 *fl4, int orig_oif, struct net_device *dev_out, unsigned int flags) { @@ -1721,12 +1710,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->dst.output = ip_output; - rth->rt_key_dst = orig_daddr; - rth->rt_key_src = orig_saddr; rth->rt_genid = rt_genid(dev_net(dev_out)); rth->rt_flags = flags; rth->rt_type = type; - rth->rt_key_tos = orig_rtos; rth->rt_dst = fl4->daddr; rth->rt_src = fl4->saddr; rth->rt_route_iif = 0; @@ -1777,16 +1763,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) unsigned int flags = 0; struct fib_result res; struct rtable *rth; - __be32 orig_daddr; - __be32 orig_saddr; int orig_oif; res.tclassid = 0; res.fi = NULL; res.table = NULL; - orig_daddr = fl4->daddr; - orig_saddr = fl4->saddr; orig_oif = fl4->flowi4_oif; fl4->flowi4_iif = net->loopback_dev->ifindex; @@ -1948,8 +1930,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) make_route: - rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, - tos, dev_out, flags); + rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); out: rcu_read_unlock(); @@ -2014,9 +1995,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or if (new->dev) dev_hold(new->dev); - rt->rt_key_dst = ort->rt_key_dst; - rt->rt_key_src = ort->rt_key_src; - rt->rt_key_tos = ort->rt_key_tos; rt->rt_route_iif = ort->rt_route_iif; rt->rt_iif = ort->rt_iif; rt->rt_oif = ort->rt_oif; @@ -2058,7 +2036,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, +static int rt_fill_info(struct net *net, __be32 src, u8 tos, struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { @@ -2077,7 +2055,7 @@ static int rt_fill_info(struct net *net, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->rt_key_tos; + r->rtm_tos = tos; r->rtm_table = RT_TABLE_MAIN; if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) goto nla_put_failure; @@ -2090,9 +2068,9 @@ static int rt_fill_info(struct net *net, if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) goto nla_put_failure; - if (rt->rt_key_src) { + if (src) { r->rtm_src_len = 32; - if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src)) + if (nla_put_be32(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && @@ -2104,7 +2082,7 @@ static int rt_fill_info(struct net *net, goto nla_put_failure; #endif if (!rt_is_input_route(rt) && - rt->rt_src != rt->rt_key_src) { + rt->rt_src != src) { if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) goto nla_put_failure; } @@ -2248,7 +2226,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + err = rt_fill_info(net, src, rtm->rtm_tos, skb, + NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index fcf7678bc009..2a8d5cfc340f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -79,9 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, struct rtable *rt = (struct rtable *)xdst->route; const struct flowi4 *fl4 = &fl->u.ip4; - xdst->u.rt.rt_key_dst = fl4->daddr; - xdst->u.rt.rt_key_src = fl4->saddr; - xdst->u.rt.rt_key_tos = fl4->flowi4_tos; xdst->u.rt.rt_route_iif = fl4->flowi4_iif; xdst->u.rt.rt_iif = fl4->flowi4_iif; xdst->u.rt.rt_oif = fl4->flowi4_oif; -- cgit v1.2.3-58-ga151 From d6c0a4f609847d6e65658913f9ccbcb1c137cff3 Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:02:59 +0000 Subject: ipv4: Kill 'rt_src' from 'struct rtable' Signed-off-by: David S. Miller --- include/net/route.h | 1 - net/ipv4/route.c | 34 +++++++++++++++------------------- net/ipv4/xfrm4_policy.c | 1 - 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 935fa59630b0..85d1093e42de 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -49,7 +49,6 @@ struct rtable { __u16 rt_type; __be32 rt_dst; /* Path destination */ - __be32 rt_src; /* Path source */ int rt_route_iif; int rt_iif; int rt_oif; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c89d690acdfa..fc1199dc23e7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1272,7 +1272,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; rth->rt_dst = daddr; - rth->rt_src = saddr; rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; @@ -1393,7 +1392,6 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_flags = flags; rth->rt_type = res->type; rth->rt_dst = daddr; - rth->rt_src = saddr; rth->rt_route_iif = in_dev->dev->ifindex; rth->rt_iif = in_dev->dev->ifindex; rth->rt_oif = 0; @@ -1561,7 +1559,6 @@ local_input: rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; rth->rt_dst = daddr; - rth->rt_src = saddr; rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; @@ -1714,7 +1711,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_flags = flags; rth->rt_type = type; rth->rt_dst = fl4->daddr; - rth->rt_src = fl4->saddr; rth->rt_route_iif = 0; rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_oif = orig_oif; @@ -2005,7 +2001,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_dst = ort->rt_dst; - rt->rt_src = ort->rt_src; rt->rt_gateway = ort->rt_gateway; rt->fi = ort->fi; if (rt->fi) @@ -2036,7 +2031,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, __be32 src, u8 tos, +static int rt_fill_info(struct net *net, __be32 src, struct flowi4 *fl4, struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { @@ -2055,7 +2050,7 @@ static int rt_fill_info(struct net *net, __be32 src, u8 tos, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = tos; + r->rtm_tos = fl4->flowi4_tos; r->rtm_table = RT_TABLE_MAIN; if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) goto nla_put_failure; @@ -2082,11 +2077,11 @@ static int rt_fill_info(struct net *net, __be32 src, u8 tos, goto nla_put_failure; #endif if (!rt_is_input_route(rt) && - rt->rt_src != src) { - if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) + fl4->saddr != src) { + if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_dst != rt->rt_gateway && + if (fl4->daddr != rt->rt_gateway && nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; @@ -2116,7 +2111,7 @@ static int rt_fill_info(struct net *net, __be32 src, u8 tos, if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, - rt->rt_src, rt->rt_dst, + fl4->saddr, fl4->daddr, r, nowait); if (err <= 0) { if (!nowait) { @@ -2151,6 +2146,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; + struct flowi4 fl4; __be32 dst = 0; __be32 src = 0; u32 iif; @@ -2185,6 +2181,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dst; + fl4.saddr = src; + fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; + fl4.flowi4_mark = mark; + if (iif) { struct net_device *dev; @@ -2205,13 +2208,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { - struct flowi4 fl4 = { - .daddr = dst, - .saddr = src, - .flowi4_tos = rtm->rtm_tos, - .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, - .flowi4_mark = mark, - }; rt = ip_route_output_key(net, &fl4); err = 0; @@ -2226,7 +2222,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, src, rtm->rtm_tos, skb, + err = rt_fill_info(net, src, &fl4, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 2a8d5cfc340f..00d49e415113 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -92,7 +92,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; - xdst->u.rt.rt_src = rt->rt_src; xdst->u.rt.rt_dst = rt->rt_dst; xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; -- cgit v1.2.3-58-ga151 From b48698895de86e07b685f8e4b8db0f1cd5a97e9a Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:03:01 +0000 Subject: ipv4: Remove 'rt_mark' from 'struct rtable' Signed-off-by: David S. Miller --- include/net/route.h | 1 - net/ipv4/ipmr.c | 2 +- net/ipv4/route.c | 9 ++------- net/ipv4/xfrm4_policy.c | 1 - 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 85d1093e42de..757fe40b6cea 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -52,7 +52,6 @@ struct rtable { int rt_route_iif; int rt_iif; int rt_oif; - __u32 rt_mark; /* Info on neighbour */ __be32 rt_gateway; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5716c6b808d6..eee3bf6676fe 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1797,7 +1797,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) .flowi4_tos = RT_TOS(iph->tos), .flowi4_oif = rt->rt_oif, .flowi4_iif = rt->rt_iif, - .flowi4_mark = rt->rt_mark, + .flowi4_mark = skb->mark, }; struct mr_table *mrt; int err; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fc1199dc23e7..264617c98c25 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1275,7 +1275,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; - rth->rt_mark = skb->mark; rth->rt_pmtu = 0; rth->rt_gateway = daddr; rth->fi = NULL; @@ -1395,7 +1394,6 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_route_iif = in_dev->dev->ifindex; rth->rt_iif = in_dev->dev->ifindex; rth->rt_oif = 0; - rth->rt_mark = skb->mark; rth->rt_pmtu = 0; rth->rt_gateway = daddr; rth->fi = NULL; @@ -1562,7 +1560,6 @@ local_input: rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; - rth->rt_mark = skb->mark; rth->rt_pmtu = 0; rth->rt_gateway = daddr; rth->fi = NULL; @@ -1714,7 +1711,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_route_iif = 0; rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_oif = orig_oif; - rth->rt_mark = fl4->flowi4_mark; rth->rt_pmtu = 0; rth->rt_gateway = fl4->daddr; rth->fi = NULL; @@ -1994,7 +1990,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_route_iif = ort->rt_route_iif; rt->rt_iif = ort->rt_iif; rt->rt_oif = ort->rt_oif; - rt->rt_mark = ort->rt_mark; rt->rt_pmtu = ort->rt_pmtu; rt->rt_genid = rt_genid(net); @@ -2091,8 +2086,8 @@ static int rt_fill_info(struct net *net, __be32 src, struct flowi4 *fl4, if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (rt->rt_mark && - nla_put_be32(skb, RTA_MARK, rt->rt_mark)) + if (fl4->flowi4_mark && + nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; error = rt->dst.error; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 00d49e415113..f73ba8210bd3 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -82,7 +82,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_route_iif = fl4->flowi4_iif; xdst->u.rt.rt_iif = fl4->flowi4_iif; xdst->u.rt.rt_oif = fl4->flowi4_oif; - xdst->u.rt.rt_mark = fl4->flowi4_mark; xdst->u.dst.dev = dev; dev_hold(dev); -- cgit v1.2.3-58-ga151 From f1ce3062c53809d862d8a04e7a0566c3cc4e0bda Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Jul 2012 10:10:17 -0700 Subject: ipv4: Remove 'rt_dst' from 'struct rtable' Signed-off-by: David S. Miller --- include/net/route.h | 1 - net/ipv4/route.c | 45 +++++++++------------------------------------ net/ipv4/xfrm4_policy.c | 1 - 3 files changed, 9 insertions(+), 38 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 757fe40b6cea..6d111bceb160 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -48,7 +48,6 @@ struct rtable { unsigned int rt_flags; __u16 rt_type; - __be32 rt_dst; /* Path destination */ int rt_route_iif; int rt_iif; int rt_oif; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 264617c98c25..85d103fee88e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -850,7 +850,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) peer->rate_tokens == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, rt->rt_iif, - &rt->rt_dst, &rt->rt_gateway); + &ip_hdr(skb)->daddr, &rt->rt_gateway); #endif } out_put_peer: @@ -1132,8 +1132,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = dst->dev->mtu; if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { - - if (rt->rt_gateway != rt->rt_dst && mtu > 576) + if (rt->rt_gateway != 0 && mtu > 576) mtu = 576; } @@ -1271,7 +1270,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_genid = rt_genid(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; - rth->rt_dst = daddr; rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; @@ -1390,7 +1388,6 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rth->rt_flags = flags; rth->rt_type = res->type; - rth->rt_dst = daddr; rth->rt_route_iif = in_dev->dev->ifindex; rth->rt_iif = in_dev->dev->ifindex; rth->rt_oif = 0; @@ -1556,7 +1553,6 @@ local_input: rth->rt_genid = rt_genid(net); rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; - rth->rt_dst = daddr; rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; rth->rt_oif = 0; @@ -1707,7 +1703,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_genid = rt_genid(dev_net(dev_out)); rth->rt_flags = flags; rth->rt_type = type; - rth->rt_dst = fl4->daddr; rth->rt_route_iif = 0; rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_oif = orig_oif; @@ -1995,7 +1990,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_dst = ort->rt_dst; rt->rt_gateway = ort->rt_gateway; rt->fi = ort->fi; if (rt->fi) @@ -2026,9 +2020,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, __be32 src, struct flowi4 *fl4, - struct sk_buff *skb, u32 pid, u32 seq, int event, - int nowait, unsigned int flags) +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, + struct flowi4 *fl4, struct sk_buff *skb, u32 pid, + u32 seq, int event, int nowait, unsigned int flags) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; @@ -2056,7 +2050,7 @@ static int rt_fill_info(struct net *net, __be32 src, struct flowi4 *fl4, if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; - if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) + if (nla_put_be32(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; @@ -2100,29 +2094,8 @@ static int rt_fill_info(struct net *net, __be32 src, struct flowi4 *fl4, } if (rt_is_input_route(rt)) { -#ifdef CONFIG_IP_MROUTE - __be32 dst = rt->rt_dst; - - if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { - int err = ipmr_get_route(net, skb, - fl4->saddr, fl4->daddr, - r, nowait); - if (err <= 0) { - if (!nowait) { - if (err == 0) - return 0; - goto nla_put_failure; - } else { - if (err == -EMSGSIZE) - goto nla_put_failure; - error = err; - } - } - } else -#endif - if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) - goto nla_put_failure; + if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) + goto nla_put_failure; } if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) @@ -2217,7 +2190,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, src, &fl4, skb, + err = rt_fill_info(net, dst, src, &fl4, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index f73ba8210bd3..6074b694f118 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -91,7 +91,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; - xdst->u.rt.rt_dst = rt->rt_dst; xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; -- cgit v1.2.3-58-ga151 From f8126f1d5136be1ca1a3536d43ad7a710b5620f8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 13 Jul 2012 05:03:45 -0700 Subject: ipv4: Adjust semantics of rt->rt_gateway. In order to allow prefixed routes, we have to adjust how rt_gateway is set and interpreted. The new interpretation is: 1) rt_gateway == 0, destination is on-link, nexthop is iph->daddr 2) rt_gateway != 0, destination requires a nexthop gateway Abstract the fetching of the proper nexthop value using a new inline helper, rt_nexthop(), as suggested by Joe Perches. Signed-off-by: David S. Miller Tested-by: Vijay Subramanian --- include/net/route.h | 7 +++++++ net/ipv4/arp.c | 3 +-- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/netfilter/ipt_MASQUERADE.c | 5 +++-- net/ipv4/route.c | 17 +++++++++-------- 8 files changed, 25 insertions(+), 17 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 6d111bceb160..3c1eeab9749b 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -70,6 +70,13 @@ static inline bool rt_is_output_route(const struct rtable *rt) return rt->rt_route_iif == 0; } +static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) +{ + if (rt->rt_gateway) + return rt->rt_gateway; + return daddr; +} + struct ip_rt_acct { __u32 o_bytes; __u32 o_packets; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c38293f38161..a0124eb7dbea 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -475,8 +475,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) return 1; } - paddr = skb_rtable(skb)->rt_gateway; - + paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr); if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) return 0; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c7a4de05ca04..0a290d719bc7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -389,7 +389,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gateway) goto route_err; return &rt->dst; @@ -422,7 +422,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gateway) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 42c44b1403c9..b062a98574f2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -766,7 +766,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (skb->protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); - dst = rt->rt_gateway; + dst = rt_nexthop(rt, old_iph->daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c528f841ca4b..4494015f7e32 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2c2c35bace76..99af1f0cc658 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -487,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) dev->stats.tx_fifo_errors++; goto tx_error; } - dst = rt->rt_gateway; + dst = rt_nexthop(rt, old_iph->daddr); } rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 2f210c79dc87..cbb6a1a6f6f7 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -52,7 +52,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) struct nf_nat_ipv4_range newrange; const struct nf_nat_ipv4_multi_range_compat *mr; const struct rtable *rt; - __be32 newsrc; + __be32 newsrc, nh; NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); @@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) mr = par->targinfo; rt = skb_rtable(skb); - newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); + nh = rt_nexthop(rt, ip_hdr(skb)->daddr); + newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", par->out->name); return NF_DROP; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 85d103fee88e..d1d579638092 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1085,8 +1085,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else - src = inet_select_addr(rt->dst.dev, rt->rt_gateway, - RT_SCOPE_UNIVERSE); + src = inet_select_addr(rt->dst.dev, + rt_nexthop(rt, iph->daddr), + RT_SCOPE_UNIVERSE); rcu_read_unlock(); } memcpy(addr, &src, 4); @@ -1132,7 +1133,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = dst->dev->mtu; if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { - if (rt->rt_gateway != 0 && mtu > 576) + if (rt->rt_gateway && mtu > 576) mtu = 576; } @@ -1274,7 +1275,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = dev->ifindex; rth->rt_oif = 0; rth->rt_pmtu = 0; - rth->rt_gateway = daddr; + rth->rt_gateway = 0; rth->fi = NULL; if (our) { rth->dst.input= ip_local_deliver; @@ -1392,7 +1393,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = in_dev->dev->ifindex; rth->rt_oif = 0; rth->rt_pmtu = 0; - rth->rt_gateway = daddr; + rth->rt_gateway = 0; rth->fi = NULL; rth->dst.input = ip_forward; @@ -1557,7 +1558,7 @@ local_input: rth->rt_iif = dev->ifindex; rth->rt_oif = 0; rth->rt_pmtu = 0; - rth->rt_gateway = daddr; + rth->rt_gateway = 0; rth->fi = NULL; if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1707,7 +1708,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_oif = orig_oif; rth->rt_pmtu = 0; - rth->rt_gateway = fl4->daddr; + rth->rt_gateway = 0; rth->fi = NULL; RT_CACHE_STAT_INC(out_slow_tot); @@ -2070,7 +2071,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (fl4->daddr != rt->rt_gateway && + if (rt->rt_gateway && nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; -- cgit v1.2.3-58-ga151 From f5b0a8743601a4477419171f5046bd07d1c080a0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 19 Jul 2012 12:31:33 -0700 Subject: net: Document dst->obsolete better. Add a big comment explaining how the field works, and use defines instead of magic constants for the values assigned to it. Suggested by Joe Perches. Signed-off-by: David S. Miller --- include/net/dst.h | 14 +++++++++++++- net/core/dst.c | 4 ++-- net/decnet/dn_route.c | 4 ++-- net/ipv4/route.c | 5 +++-- net/ipv6/route.c | 4 ++-- net/sctp/transport.c | 2 +- net/xfrm/xfrm_policy.c | 23 ++++++++++++----------- 7 files changed, 35 insertions(+), 21 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 51610468c63d..0df661a0c295 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -65,7 +65,19 @@ struct dst_entry { unsigned short pending_confirm; short error; + + /* A non-zero value of dst->obsolete forces by-hand validation + * of the route entry. Positive values are set by the generic + * dst layer to indicate that the entry has been forcefully + * destroyed. + * + * Negative values are used by the implementation layer code to + * force invocation of the dst_ops->check() method. + */ short obsolete; +#define DST_OBSOLETE_NONE 0 +#define DST_OBSOLETE_DEAD 2 +#define DST_OBSOLETE_FORCE_CHK -1 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ #ifdef CONFIG_IP_ROUTE_CLASSID @@ -359,7 +371,7 @@ extern struct dst_entry *dst_destroy(struct dst_entry *dst); static inline void dst_free(struct dst_entry *dst) { - if (dst->obsolete > 1) + if (dst->obsolete > 0) return; if (!atomic_read(&dst->__refcnt)) { dst = dst_destroy(dst); diff --git a/net/core/dst.c b/net/core/dst.c index 07bacff84aa4..069d51d29414 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -94,7 +94,7 @@ loop: * But we do not have state "obsoleted, but * referenced by parent", so it is right. */ - if (dst->obsolete > 1) + if (dst->obsolete > 0) continue; ___dst_free(dst); @@ -202,7 +202,7 @@ static void ___dst_free(struct dst_entry *dst) */ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) dst->input = dst->output = dst_discard; - dst->obsolete = 2; + dst->obsolete = DST_OBSOLETE_DEAD; } void __dst_free(struct dst_entry *dst) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 47de90d8fe94..23cc11dd4e40 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1176,7 +1176,7 @@ make_route: if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - rt = dst_alloc(&dn_dst_ops, dev_out, 1, 0, DST_HOST); + rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST); if (rt == NULL) goto e_nobufs; @@ -1444,7 +1444,7 @@ static int dn_route_input_slow(struct sk_buff *skb) } make_route: - rt = dst_alloc(&dn_dst_ops, out_dev, 0, 0, DST_HOST); + rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST); if (rt == NULL) goto e_nobufs; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d1d579638092..50d2498c9284 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1221,7 +1221,7 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, static struct rtable *rt_dst_alloc(struct net_device *dev, bool nopolicy, bool noxfrm) { - return dst_alloc(&ipv4_dst_ops, dev, 1, -1, + return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, DST_HOST | DST_NOCACHE | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); @@ -1969,9 +1969,10 @@ static struct dst_ops ipv4_dst_blackhole_ops = { struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *rt; + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (rt) { struct dst_entry *new = &rt->dst; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 84f6564dd372..cf02cb97bbdd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -281,7 +281,7 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, struct fib6_table *table) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 0, 0, flags); + 0, DST_OBSOLETE_NONE, flags); if (rt) { struct dst_entry *dst = &rt->dst; @@ -985,7 +985,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; struct dst_entry *new = NULL; - rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0); + rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); if (rt) { new = &rt->dst; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a6b7ee9ce28a..ec3a12b9b802 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -216,7 +216,7 @@ void sctp_transport_set_owner(struct sctp_transport *transport, void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ - if (!transport->dst || transport->dst->obsolete > 1) { + if (!transport->dst || transport->dst->obsolete) { dst_release(transport->dst); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 65bd1ca51517..c5a5165a5927 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1350,7 +1350,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) default: BUG(); } - xdst = dst_alloc(dst_ops, NULL, 0, 0, 0); + xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { struct dst_entry *dst = &xdst->u.dst; @@ -1477,7 +1477,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->xfrm = xfrm[i]; xdst->xfrm_genid = xfrm[i]->genid; - dst1->obsolete = -1; + dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_HOST; dst1->lastuse = now; @@ -2219,12 +2219,13 @@ EXPORT_SYMBOL(__xfrm_route_forward); static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete - * to "-1" to force all XFRM destinations to get validated by - * dst_ops->check on every use. We do this because when a - * normal route referenced by an XFRM dst is obsoleted we do - * not go looking around for all parent referencing XFRM dsts - * so that we can invalidate them. It is just too much work. - * Instead we make the checks here on every use. For example: + * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to + * get validated by dst_ops->check on every use. We do this + * because when a normal route referenced by an XFRM dst is + * obsoleted we do not go looking around for all parent + * referencing XFRM dsts so that we can invalidate them. It + * is just too much work. Instead we make the checks here on + * every use. For example: * * XFRM dst A --> IPv4 dst X * @@ -2234,9 +2235,9 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) * stale_bundle() check. * * When a policy's bundle is pruned, we dst_free() the XFRM - * dst which causes it's ->obsolete field to be set to a - * positive non-zero integer. If an XFRM dst has been pruned - * like this, we want to force a new route lookup. + * dst which causes it's ->obsolete field to be set to + * DST_OBSOLETE_DEAD. If an XFRM dst has been pruned like + * this, we want to force a new route lookup. */ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst; -- cgit v1.2.3-58-ga151 From ceb3320610d6f15ff20dd4c042b36473d77de76f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 11:31:28 -0700 Subject: ipv4: Kill routes during PMTU/redirect updates. Mark them obsolete so there will be a re-lookup to fetch the FIB nexthop exception info. Signed-off-by: David S. Miller --- include/net/dst.h | 1 + net/ipv4/route.c | 41 +++++++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 0df661a0c295..baf597890064 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -78,6 +78,7 @@ struct dst_entry { #define DST_OBSOLETE_NONE 0 #define DST_OBSOLETE_DEAD 2 #define DST_OBSOLETE_FORCE_CHK -1 +#define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ #ifdef CONFIG_IP_ROUTE_CLASSID diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 50d2498c9284..d52f7699c2fa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -673,7 +673,8 @@ out_unlock: return; } -static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4) +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, + bool kill_route) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; @@ -728,8 +729,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, 0); } - rt->rt_gateway = new_gw; - rt->rt_flags |= RTCF_REDIRECTED; + if (kill_route) + rt->dst.obsolete = DST_OBSOLETE_KILL; call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); @@ -760,7 +761,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf rt = (struct rtable *) dst; ip_rt_build_flow_key(&fl4, sk, skb); - __ip_do_redirect(rt, skb, &fl4); + __ip_do_redirect(rt, skb, &fl4, true); } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) @@ -919,7 +920,7 @@ out: kfree_skb(skb); return 0; } -static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct fib_result res; @@ -932,8 +933,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) update_or_create_fnhe(nh, fl4->daddr, 0, mtu, jiffies + ip_rt_mtu_expires); } - rt->rt_pmtu = mtu; - dst_set_expires(&rt->dst, ip_rt_mtu_expires); + return mtu; } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, @@ -943,7 +943,14 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); - __ip_rt_update_pmtu(rt, &fl4, mtu); + mtu = __ip_rt_update_pmtu(rt, &fl4, mtu); + + if (!rt->rt_pmtu) { + dst->obsolete = DST_OBSOLETE_KILL; + } else { + rt->rt_pmtu = mtu; + dst_set_expires(&rt->dst, ip_rt_mtu_expires); + } } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, @@ -989,7 +996,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { - __ip_do_redirect(rt, skb, &fl4); + __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } @@ -1004,7 +1011,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { - __ip_do_redirect(rt, skb, &fl4); + __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } @@ -1014,7 +1021,15 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; - if (rt_is_expired(rt)) + /* All IPV4 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + * + * When a PMTU/redirect information update invalidates a + * route, this is indicated by setting obsolete to + * DST_OBSOLETE_KILL. + */ + if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt)) return NULL; return dst; } @@ -1186,8 +1201,10 @@ restart: dst_set_expires(&rt->dst, diff); } } - if (gw) + if (gw) { + rt->rt_flags |= RTCF_REDIRECTED; rt->rt_gateway = gw; + } fnhe->fnhe_stamp = jiffies; break; } -- cgit v1.2.3-58-ga151 From f2bb4bedf35d5167a073dcdddf16543f351ef3ae Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 12:20:47 -0700 Subject: ipv4: Cache output routes in fib_info nexthops. If we have an output route that lacks nexthop exceptions, we can cache it in the FIB info nexthop. Such routes will have DST_HOST cleared because such routes refer to a family of destinations, rather than just one. The sequence of the handling of exceptions during route lookup is adjusted to make the logic work properly. Before we allocate the route, we lookup the exception. Then we know if we will cache this route or not, and therefore whether DST_HOST should be set on the allocated route. Then we use DST_HOST to key off whether we should store the resulting route, during rt_set_nexthop(), in the FIB nexthop cache. With help from Eric Dumazet. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 + net/ipv4/fib_semantics.c | 2 + net/ipv4/route.c | 140 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 101 insertions(+), 43 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 2daf096dfc60..fb62c590360e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -46,6 +46,7 @@ struct fib_config { }; struct fib_info; +struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; @@ -80,6 +81,7 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; + struct rtable *nh_rth_output; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2b57d768240d..83d0f42b619a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -171,6 +171,8 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); + if (nexthop_nh->nh_rth_output) + dst_release(&nexthop_nh->nh_rth_output->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d52f7699c2fa..8a0260010ea1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1158,8 +1158,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu; } -static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, - struct fib_info *fi) +static void rt_init_metrics(struct rtable *rt, struct fib_info *fi) { if (fi->fib_metrics != (u32 *) dst_default_metrics) { rt->fi = fi; @@ -1168,50 +1167,83 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, dst_init_metrics(&rt->dst, fi->fib_metrics, true); } -static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr) +static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) { struct fnhe_hash_bucket *hash = nh->nh_exceptions; struct fib_nh_exception *fnhe; u32 hval; + if (!hash) + return NULL; + hval = fnhe_hashfun(daddr); -restart: for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { - __be32 fnhe_daddr, gw; - unsigned long expires; - unsigned int seq; - u32 pmtu; - - seq = read_seqbegin(&fnhe_seqlock); - fnhe_daddr = fnhe->fnhe_daddr; - gw = fnhe->fnhe_gw; - pmtu = fnhe->fnhe_pmtu; - expires = fnhe->fnhe_expires; - if (read_seqretry(&fnhe_seqlock, seq)) - goto restart; - if (daddr != fnhe_daddr) - continue; - if (pmtu) { - unsigned long diff = expires - jiffies; + if (fnhe->fnhe_daddr == daddr) + return fnhe; + } + return NULL; +} - if (time_before(jiffies, expires)) { - rt->rt_pmtu = pmtu; - dst_set_expires(&rt->dst, diff); - } - } - if (gw) { - rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = gw; +static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, + __be32 daddr) +{ + __be32 fnhe_daddr, gw; + unsigned long expires; + unsigned int seq; + u32 pmtu; + +restart: + seq = read_seqbegin(&fnhe_seqlock); + fnhe_daddr = fnhe->fnhe_daddr; + gw = fnhe->fnhe_gw; + pmtu = fnhe->fnhe_pmtu; + expires = fnhe->fnhe_expires; + if (read_seqretry(&fnhe_seqlock, seq)) + goto restart; + + if (daddr != fnhe_daddr) + return; + + if (pmtu) { + unsigned long diff = expires - jiffies; + + if (time_before(jiffies, expires)) { + rt->rt_pmtu = pmtu; + dst_set_expires(&rt->dst, diff); } - fnhe->fnhe_stamp = jiffies; - break; + } + if (gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = gw; + } + fnhe->fnhe_stamp = jiffies; +} + +static inline void rt_release_rcu(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_release(dst); +} + +static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) +{ + struct rtable *orig, *prev, **p = &nh->nh_rth_output; + + orig = *p; + + prev = cmpxchg(p, orig, rt); + if (prev == orig) { + dst_clone(&rt->dst); + if (orig) + call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu); } } -static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, +static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, + struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag) { if (fi) { @@ -1219,12 +1251,15 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) rt->rt_gateway = nh->nh_gw; - if (unlikely(nh->nh_exceptions)) - rt_bind_exception(rt, nh, fl4->daddr); - rt_init_metrics(rt, fl4, fi); + if (unlikely(fnhe)) + rt_bind_exception(rt, fnhe, daddr); + rt_init_metrics(rt, fi); #ifdef CONFIG_IP_ROUTE_CLASSID - rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = nh->nh_tclassid; #endif + if (!(rt->dst.flags & DST_HOST) && + rt_is_output_route(rt)) + rt_cache_route(nh, rt); } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -1236,10 +1271,10 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, } static struct rtable *rt_dst_alloc(struct net_device *dev, - bool nopolicy, bool noxfrm) + bool nopolicy, bool noxfrm, bool will_cache) { return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - DST_HOST | DST_NOCACHE | + (will_cache ? 0 : DST_HOST) | DST_NOCACHE | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); } @@ -1276,7 +1311,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_err; } rth = rt_dst_alloc(dev_net(dev)->loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -1349,6 +1384,7 @@ static int __mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { + struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; @@ -1395,9 +1431,13 @@ static int __mkroute_input(struct sk_buff *skb, } } + fnhe = NULL; + if (res->fi) + fnhe = find_exception(&FIB_RES_NH(*res), daddr); + rth = rt_dst_alloc(out_dev->dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM)); + IN_DEV_CONF_GET(out_dev, NOXFRM), false); if (!rth) { err = -ENOBUFS; goto cleanup; @@ -1416,7 +1456,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.input = ip_forward; rth->dst.output = ip_output; - rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); *result = rth; err = 0; @@ -1558,7 +1598,7 @@ brd_input: local_input: rth = rt_dst_alloc(net->loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -1672,6 +1712,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, unsigned int flags) { struct fib_info *fi = res->fi; + struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; @@ -1710,9 +1751,22 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fi = NULL; } + fnhe = NULL; + if (fi) { + fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); + if (!fnhe) { + rth = FIB_RES_NH(*res).nh_rth_output; + if (rth && + rth->dst.obsolete == DST_OBSOLETE_FORCE_CHK) { + dst_use(&rth->dst, jiffies); + return rth; + } + } + } rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(in_dev, NOXFRM)); + IN_DEV_CONF_GET(in_dev, NOXFRM), + fi && !fnhe); if (!rth) return ERR_PTR(-ENOBUFS); @@ -1749,7 +1803,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, #endif } - rt_set_nexthop(rth, fl4, res, fi, type, 0); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE) rth->dst.flags |= DST_NOCACHE; -- cgit v1.2.3-58-ga151 From d2d68ba9fe8b38eb03124b3176a013bb8aa2b5e5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 12:58:50 -0700 Subject: ipv4: Cache input routes in fib_info nexthops. Caching input routes is slightly simpler than output routes, since we don't need to be concerned with nexthop exceptions. (locally destined, and routed packets, never trigger PMTU events or redirects that will be processed by us). However, we have to elide caching for the DIRECTSRC and non-zero itag cases. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + net/ipv4/fib_semantics.c | 2 ++ net/ipv4/route.c | 55 +++++++++++++++++++++++++++++++++++++----------- 3 files changed, 46 insertions(+), 12 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index fb62c590360e..e69c3a47153d 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -82,6 +82,7 @@ struct fib_nh { __be32 nh_saddr; int nh_saddr_genid; struct rtable *nh_rth_output; + struct rtable *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 83d0f42b619a..e55171f184f9 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -173,6 +173,8 @@ static void free_fib_info_rcu(struct rcu_head *head) free_nh_exceptions(nexthop_nh); if (nexthop_nh->nh_rth_output) dst_release(&nexthop_nh->nh_rth_output->dst); + if (nexthop_nh->nh_rth_input) + dst_release(&nexthop_nh->nh_rth_input->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8a0260010ea1..97cca8a03d94 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1231,6 +1231,9 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p = &nh->nh_rth_output; + if (rt_is_input_route(rt)) + p = &nh->nh_rth_input; + orig = *p; prev = cmpxchg(p, orig, rt); @@ -1241,6 +1244,11 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) } } +static bool rt_cache_valid(struct rtable *rt) +{ + return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK); +} + static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, @@ -1257,8 +1265,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - if (!(rt->dst.flags & DST_HOST) && - rt_is_output_route(rt)) + if (!(rt->dst.flags & DST_HOST)) rt_cache_route(nh, rt); } @@ -1384,11 +1391,11 @@ static int __mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { - struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; unsigned int flags = 0; + bool do_cache; u32 itag; /* get a working reference to the output device */ @@ -1431,13 +1438,21 @@ static int __mkroute_input(struct sk_buff *skb, } } - fnhe = NULL; - if (res->fi) - fnhe = find_exception(&FIB_RES_NH(*res), daddr); + do_cache = false; + if (res->fi) { + if (!(flags & RTCF_DIRECTSRC) && !itag) { + rth = FIB_RES_NH(*res).nh_rth_input; + if (rt_cache_valid(rth)) { + dst_use(&rth->dst, jiffies); + goto out; + } + do_cache = true; + } + } rth = rt_dst_alloc(out_dev->dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM), false); + IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { err = -ENOBUFS; goto cleanup; @@ -1456,8 +1471,8 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.input = ip_forward; rth->dst.output = ip_output; - rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - + rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); +out: *result = rth; err = 0; cleanup: @@ -1509,6 +1524,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct rtable *rth; int err = -EINVAL; struct net *net = dev_net(dev); + bool do_cache; /* IP on this device is disabled. */ @@ -1522,6 +1538,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; + res.fi = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1597,8 +1614,20 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: + do_cache = false; + if (res.fi) { + if (!(flags & RTCF_DIRECTSRC) && !itag) { + rth = FIB_RES_NH(res).nh_rth_input; + if (rt_cache_valid(rth)) { + dst_use(&rth->dst, jiffies); + goto set_and_out; + } + do_cache = true; + } + } + rth = rt_dst_alloc(net->loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; @@ -1622,6 +1651,9 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } + if (do_cache) + rt_cache_route(&FIB_RES_NH(res), rth); +set_and_out: skb_dst_set(skb, &rth->dst); err = 0; goto out; @@ -1756,8 +1788,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); if (!fnhe) { rth = FIB_RES_NH(*res).nh_rth_output; - if (rth && - rth->dst.obsolete == DST_OBSOLETE_FORCE_CHK) { + if (rt_cache_valid(rth)) { dst_use(&rth->dst, jiffies); return rth; } -- cgit v1.2.3-58-ga151 From ba3f7f04ef2b19aace38f855aedd17fe43035d50 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:02:46 -0700 Subject: ipv4: Kill FLOWI_FLAG_RT_NOCACHE and associated code. Signed-off-by: David S. Miller --- include/net/flow.h | 1 - include/net/inet_connection_sock.h | 3 +-- net/dccp/ipv4.c | 2 +- net/ipv4/inet_connection_sock.c | 5 +---- net/ipv4/route.c | 3 --- net/ipv4/tcp_ipv4.c | 4 ++-- 6 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index ce9cb7656b47..e1dd5082ec7e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -21,7 +21,6 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_CAN_SLEEP 0x02 -#define FLOWI_FLAG_RT_NOCACHE 0x04 __u32 flowic_secid; }; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 2cf44b4ed2e6..5ee66f517b4f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -250,8 +250,7 @@ extern int inet_csk_get_port(struct sock *sk, unsigned short snum); extern struct dst_entry* inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req, - bool nocache); + const struct request_sock *req); extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk, struct sock *newsk, const struct request_sock *req); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ab4f44c9bb21..25428d0c50c9 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -508,7 +508,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, struct dst_entry *dst; struct flowi4 fl4; - dst = inet_csk_route_req(sk, &fl4, req, false); + dst = inet_csk_route_req(sk, &fl4, req); if (dst == NULL) goto out; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0a290d719bc7..db0cf17c00f7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -368,8 +368,7 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req, - bool nocache) + const struct request_sock *req) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); @@ -377,8 +376,6 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct net *net = sock_net(sk); int flags = inet_sk_flowi_flags(sk); - if (nocache) - flags |= FLOWI_FLAG_RT_NOCACHE; flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 97cca8a03d94..7e1c0ed0ef70 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1836,9 +1836,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE) - rth->dst.flags |= DST_NOCACHE; - return rth; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1d8b75a58981..59110caeb074 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -824,7 +824,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, rvp); @@ -1378,7 +1378,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && + (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && fl4.daddr == saddr) { if (!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); -- cgit v1.2.3-58-ga151 From 93ac53410a82a4f1bf2baf9d65d95cc29f2774ca Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:09:39 -0700 Subject: ipv4: Dirty less cache lines in route caching paths. Don't bother incrementing dst->__use and setting dst->lastuse, they are completely pointless and just slow things down. Signed-off-by: David S. Miller --- net/ipv4/route.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7e1c0ed0ef70..b8707779b85d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1443,7 +1443,7 @@ static int __mkroute_input(struct sk_buff *skb, if (!(flags & RTCF_DIRECTSRC) && !itag) { rth = FIB_RES_NH(*res).nh_rth_input; if (rt_cache_valid(rth)) { - dst_use(&rth->dst, jiffies); + dst_hold(&rth->dst); goto out; } do_cache = true; @@ -1619,7 +1619,7 @@ local_input: if (!(flags & RTCF_DIRECTSRC) && !itag) { rth = FIB_RES_NH(res).nh_rth_input; if (rt_cache_valid(rth)) { - dst_use(&rth->dst, jiffies); + dst_hold(&rth->dst); goto set_and_out; } do_cache = true; @@ -1789,7 +1789,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, if (!fnhe) { rth = FIB_RES_NH(*res).nh_rth_output; if (rt_cache_valid(rth)) { - dst_use(&rth->dst, jiffies); + dst_hold(&rth->dst); return rth; } } -- cgit v1.2.3-58-ga151 From 4fd551d7bed93af60af61c5a324b8f5dff37953a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:39:44 -0700 Subject: ipv4: Kill rt->rt_oif Never actually used. It was being set on output routes to the original OIF specified in the flow key used for the lookup. Adjust the only user, ipmr_rt_fib_lookup(), for greater correctness of the flowi4_oif and flowi4_iif values, thanks to feedback from Julian Anastasov. Signed-off-by: David S. Miller --- include/net/route.h | 1 - net/ipv4/ipmr.c | 7 +++++-- net/ipv4/route.c | 5 ----- net/ipv4/xfrm4_policy.c | 1 - 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 3c1eeab9749b..e789a92fd602 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -50,7 +50,6 @@ struct rtable { int rt_route_iif; int rt_iif; - int rt_oif; /* Info on neighbour */ __be32 rt_gateway; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index eee3bf6676fe..8eec8f4a0536 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1795,8 +1795,11 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = RT_TOS(iph->tos), - .flowi4_oif = rt->rt_oif, - .flowi4_iif = rt->rt_iif, + .flowi4_oif = (rt_is_output_route(rt) ? + skb->dev->ifindex : 0), + .flowi4_iif = (rt_is_output_route(rt) ? + net->loopback_dev->ifindex : + skb->dev->ifindex), .flowi4_mark = skb->mark, }; struct mr_table *mrt; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b8707779b85d..a280b6ac8eb2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1332,7 +1332,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_type = RTN_MULTICAST; rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; - rth->rt_oif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; rth->fi = NULL; @@ -1463,7 +1462,6 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_type = res->type; rth->rt_route_iif = in_dev->dev->ifindex; rth->rt_iif = in_dev->dev->ifindex; - rth->rt_oif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; rth->fi = NULL; @@ -1642,7 +1640,6 @@ local_input: rth->rt_type = res.type; rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; - rth->rt_oif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; rth->fi = NULL; @@ -1808,7 +1805,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_type = type; rth->rt_route_iif = 0; rth->rt_iif = orig_oif ? : dev_out->ifindex; - rth->rt_oif = orig_oif; rth->rt_pmtu = 0; rth->rt_gateway = 0; rth->fi = NULL; @@ -2085,7 +2081,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_route_iif = ort->rt_route_iif; rt->rt_iif = ort->rt_iif; - rt->rt_oif = ort->rt_oif; rt->rt_pmtu = ort->rt_pmtu; rt->rt_genid = rt_genid(net); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6074b694f118..3c99b4cdd290 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -81,7 +81,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_route_iif = fl4->flowi4_iif; xdst->u.rt.rt_iif = fl4->flowi4_iif; - xdst->u.rt.rt_oif = fl4->flowi4_oif; xdst->u.dst.dev = dev; dev_hold(dev); -- cgit v1.2.3-58-ga151 From 9917e1e8762745191eba5a3bf2040278cbddbee1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:44:26 -0700 Subject: ipv4: Turn rt->rt_route_iif into rt->rt_is_input. That is this value's only use, as a boolean to indicate whether a route is an input route or not. So implement it that way, using a u16 gap present in the struct already. Signed-off-by: David S. Miller --- include/net/route.h | 6 +++--- net/ipv4/route.c | 10 +++++----- net/ipv4/xfrm4_policy.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index e789a92fd602..4bafe0bfe829 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -47,8 +47,8 @@ struct rtable { int rt_genid; unsigned int rt_flags; __u16 rt_type; + __u16 rt_is_input; - int rt_route_iif; int rt_iif; /* Info on neighbour */ @@ -61,12 +61,12 @@ struct rtable { static inline bool rt_is_input_route(const struct rtable *rt) { - return rt->rt_route_iif != 0; + return rt->rt_is_input != 0; } static inline bool rt_is_output_route(const struct rtable *rt) { - return rt->rt_route_iif == 0; + return rt->rt_is_input == 0; } static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a280b6ac8eb2..fac4c4acdbac 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1330,7 +1330,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_genid = rt_genid(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; - rth->rt_route_iif = dev->ifindex; + rth->rt_is_input= 1; rth->rt_iif = dev->ifindex; rth->rt_pmtu = 0; rth->rt_gateway = 0; @@ -1460,7 +1460,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rth->rt_flags = flags; rth->rt_type = res->type; - rth->rt_route_iif = in_dev->dev->ifindex; + rth->rt_is_input = 1; rth->rt_iif = in_dev->dev->ifindex; rth->rt_pmtu = 0; rth->rt_gateway = 0; @@ -1638,7 +1638,7 @@ local_input: rth->rt_genid = rt_genid(net); rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; - rth->rt_route_iif = dev->ifindex; + rth->rt_is_input = 1; rth->rt_iif = dev->ifindex; rth->rt_pmtu = 0; rth->rt_gateway = 0; @@ -1803,7 +1803,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_genid = rt_genid(dev_net(dev_out)); rth->rt_flags = flags; rth->rt_type = type; - rth->rt_route_iif = 0; + rth->rt_is_input = 0; rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_pmtu = 0; rth->rt_gateway = 0; @@ -2079,7 +2079,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or if (new->dev) dev_hold(new->dev); - rt->rt_route_iif = ort->rt_route_iif; + rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 3c99b4cdd290..c6281847f16a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -79,7 +79,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, struct rtable *rt = (struct rtable *)xdst->route; const struct flowi4 *fl4 = &fl->u.ip4; - xdst->u.rt.rt_route_iif = fl4->flowi4_iif; xdst->u.rt.rt_iif = fl4->flowi4_iif; xdst->u.dst.dev = dev; @@ -87,6 +86,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ + xdst->u.rt.rt_is_input = rt->rt_is_input; xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; -- cgit v1.2.3-58-ga151 From 2860583fe840d972573363dfa190b2149a604534 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:55:59 -0700 Subject: ipv4: Kill rt->fi It's not really needed. We only grabbed a reference to the fib_info for the sake of fib_info local metrics. However, fib_info objects are freed using RCU, as are therefore their private metrics (if any). We would have triggered a route cache flush if we eliminated a reference to a fib_info object in the routing tables. Therefore, any existing cached routes will first check and see that they have been invalidated before an errant reference to these metric values would occur. Signed-off-by: David S. Miller --- include/net/route.h | 1 - net/ipv4/route.c | 32 +------------------------------- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 4bafe0bfe829..60d611dc5cee 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -56,7 +56,6 @@ struct rtable { /* Miscellaneous cached information */ u32 rt_pmtu; - struct fib_info *fi; /* for client ref to shared metrics */ }; static inline bool rt_is_input_route(const struct rtable *rt) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fac4c4acdbac..9add08869c75 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -141,7 +141,6 @@ static int ip_rt_min_advmss __read_mostly = 256; static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); static unsigned int ipv4_mtu(const struct dst_entry *dst); -static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, @@ -171,7 +170,6 @@ static struct dst_ops ipv4_dst_ops = { .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, - .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, @@ -1034,17 +1032,6 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) return dst; } -static void ipv4_dst_destroy(struct dst_entry *dst) -{ - struct rtable *rt = (struct rtable *) dst; - - if (rt->fi) { - fib_info_put(rt->fi); - rt->fi = NULL; - } -} - - static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; @@ -1158,15 +1145,6 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu; } -static void rt_init_metrics(struct rtable *rt, struct fib_info *fi) -{ - if (fi->fib_metrics != (u32 *) dst_default_metrics) { - rt->fi = fi; - atomic_inc(&fi->fib_clntref); - } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); -} - static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) { struct fnhe_hash_bucket *hash = nh->nh_exceptions; @@ -1261,7 +1239,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; if (unlikely(fnhe)) rt_bind_exception(rt, fnhe, daddr); - rt_init_metrics(rt, fi); + dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -1334,7 +1312,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = dev->ifindex; rth->rt_pmtu = 0; rth->rt_gateway = 0; - rth->fi = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1464,7 +1441,6 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = in_dev->dev->ifindex; rth->rt_pmtu = 0; rth->rt_gateway = 0; - rth->fi = NULL; rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1642,7 +1618,6 @@ local_input: rth->rt_iif = dev->ifindex; rth->rt_pmtu = 0; rth->rt_gateway = 0; - rth->fi = NULL; if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; @@ -1807,7 +1782,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : dev_out->ifindex; rth->rt_pmtu = 0; rth->rt_gateway = 0; - rth->fi = NULL; RT_CACHE_STAT_INC(out_slow_tot); @@ -2052,7 +2026,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), - .destroy = ipv4_dst_destroy, .check = ipv4_blackhole_dst_check, .mtu = ipv4_blackhole_mtu, .default_advmss = ipv4_default_advmss, @@ -2087,9 +2060,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; - rt->fi = ort->fi; - if (rt->fi) - atomic_inc(&rt->fi->fib_clntref); dst_free(new); } -- cgit v1.2.3-58-ga151