summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2011-01-26 20:51:05 -0800
committerDavid S. Miller <davem@davemloft.net>2011-01-26 20:51:05 -0800
commit62fa8a846d7de4b299232e330c74b7783539df76 (patch)
treee401dbdbf4b11cbd27bdc3a47d9dc8b512173c9f /include
parentb4e69ac670d71b5748dc81e536b2cb103489badd (diff)
net: Implement read-only protection and COW'ing of metrics.
Routing metrics are now copy-on-write. Initially a route entry points it's metrics at a read-only location. If a routing table entry exists, it will point there. Else it will point at the all zero metric place-holder called 'dst_default_metrics'. The writeability state of the metrics is stored in the low bits of the metrics pointer, we have two bits left to spare if we want to store more states. For the initial implementation, COW is implemented simply via kmalloc. However future enhancements will change this to place the writable metrics somewhere else, in order to increase sharing. Very likely this "somewhere else" will be the inetpeer cache. Note also that this means that metrics updates may transiently fail if we cannot COW the metrics successfully. But even by itself, this patch should decrease memory usage and increase cache locality especially for routing workloads. In those cases the read-only metric copies stay in place and never get written to. TCP workloads where metrics get updated, and those rare cases where PMTU triggers occur, will take a very slight performance hit. But that hit will be alleviated when the long-term writable metrics move to a more sharable location. Since the metrics storage went from a u32 array of RTAX_MAX entries to what is essentially a pointer, some retooling of the dst_entry layout was necessary. Most importantly, we need to preserve the alignment of the reference count so that it doesn't share cache lines with the read-mostly state, as per Eric Dumazet's alignment assertion checks. The only non-trivial bit here is the move of the 'flags' member into the writeable cacheline. This is OK since we are always accessing the flags around the same moment when we made a modification to the reference count. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/net/dst.h114
-rw-r--r--include/net/dst_ops.h1
-rw-r--r--include/net/route.h2
3 files changed, 80 insertions, 37 deletions
diff --git a/include/net/dst.h b/include/net/dst.h
index be5a0d4c491d..94a8c234ea2a 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -40,24 +40,10 @@ struct dst_entry {
struct rcu_head rcu_head;
struct dst_entry *child;
struct net_device *dev;
- short error;
- short obsolete;
- int flags;
-#define DST_HOST 0x0001
-#define DST_NOXFRM 0x0002
-#define DST_NOPOLICY 0x0004
-#define DST_NOHASH 0x0008
-#define DST_NOCACHE 0x0010
+ struct dst_ops *ops;
+ unsigned long _metrics;
unsigned long expires;
-
- unsigned short header_len; /* more space at head required */
- unsigned short trailer_len; /* space to reserve at tail */
-
- unsigned int rate_tokens;
- unsigned long rate_last; /* rate limiting for ICMP */
-
struct dst_entry *path;
-
struct neighbour *neighbour;
struct hh_cache *hh;
#ifdef CONFIG_XFRM
@@ -68,17 +54,16 @@ struct dst_entry {
int (*input)(struct sk_buff*);
int (*output)(struct sk_buff*);
- struct dst_ops *ops;
-
- u32 _metrics[RTAX_MAX];
-
+ short error;
+ short obsolete;
+ unsigned short header_len; /* more space at head required */
+ unsigned short trailer_len; /* space to reserve at tail */
#ifdef CONFIG_IP_ROUTE_CLASSID
__u32 tclassid;
#else
__u32 __pad2;
#endif
-
/*
* Align __refcnt to a 64 bytes alignment
* (L1_CACHE_SIZE would be too much)
@@ -93,6 +78,14 @@ struct dst_entry {
atomic_t __refcnt; /* client references */
int __use;
unsigned long lastuse;
+ unsigned long rate_last; /* rate limiting for ICMP */
+ unsigned int rate_tokens;
+ int flags;
+#define DST_HOST 0x0001
+#define DST_NOXFRM 0x0002
+#define DST_NOPOLICY 0x0004
+#define DST_NOHASH 0x0008
+#define DST_NOCACHE 0x0010
union {
struct dst_entry *next;
struct rtable __rcu *rt_next;
@@ -103,10 +96,69 @@ struct dst_entry {
#ifdef __KERNEL__
+extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);
+
+#define DST_METRICS_READ_ONLY 0x1UL
+#define __DST_METRICS_PTR(Y) \
+ ((u32 *)((Y) & ~DST_METRICS_READ_ONLY))
+#define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics)
+
+static inline bool dst_metrics_read_only(const struct dst_entry *dst)
+{
+ return dst->_metrics & DST_METRICS_READ_ONLY;
+}
+
+extern void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);
+
+static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
+{
+ unsigned long val = dst->_metrics;
+ if (!(val & DST_METRICS_READ_ONLY))
+ __dst_destroy_metrics_generic(dst, val);
+}
+
+static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
+{
+ unsigned long p = dst->_metrics;
+
+ if (p & DST_METRICS_READ_ONLY)
+ return dst->ops->cow_metrics(dst, p);
+ return __DST_METRICS_PTR(p);
+}
+
+/* This may only be invoked before the entry has reached global
+ * visibility.
+ */
+static inline void dst_init_metrics(struct dst_entry *dst,
+ const u32 *src_metrics,
+ bool read_only)
+{
+ dst->_metrics = ((unsigned long) src_metrics) |
+ (read_only ? DST_METRICS_READ_ONLY : 0);
+}
+
+static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
+{
+ u32 *dst_metrics = dst_metrics_write_ptr(dest);
+
+ if (dst_metrics) {
+ u32 *src_metrics = DST_METRICS_PTR(src);
+
+ memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32));
+ }
+}
+
+static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
+{
+ return DST_METRICS_PTR(dst);
+}
+
static inline u32
dst_metric_raw(const struct dst_entry *dst, const int metric)
{
- return dst->_metrics[metric-1];
+ u32 *p = DST_METRICS_PTR(dst);
+
+ return p[metric-1];
}
static inline u32
@@ -131,22 +183,10 @@ dst_metric_advmss(const struct dst_entry *dst)
static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
{
- dst->_metrics[metric-1] = val;
-}
-
-static inline void dst_import_metrics(struct dst_entry *dst, const u32 *src_metrics)
-{
- memcpy(dst->_metrics, src_metrics, RTAX_MAX * sizeof(u32));
-}
+ u32 *p = dst_metrics_write_ptr(dst);
-static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
-{
- dst_import_metrics(dest, src->_metrics);
-}
-
-static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
-{
- return dst->_metrics;
+ if (p)
+ p[metric-1] = val;
}
static inline u32
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 21a320b8708e..dc0746328947 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -18,6 +18,7 @@ struct dst_ops {
struct dst_entry * (*check)(struct dst_entry *, __u32 cookie);
unsigned int (*default_advmss)(const struct dst_entry *);
unsigned int (*default_mtu)(const struct dst_entry *);
+ u32 * (*cow_metrics)(struct dst_entry *, unsigned long);
void (*destroy)(struct dst_entry *);
void (*ifdown)(struct dst_entry *,
struct net_device *dev, int how);
diff --git a/include/net/route.h b/include/net/route.h
index 93e10c453f6b..5677cbf0c6e6 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -49,6 +49,7 @@
struct fib_nh;
struct inet_peer;
+struct fib_info;
struct rtable {
struct dst_entry dst;
@@ -69,6 +70,7 @@ struct rtable {
/* Miscellaneous cached information */
__be32 rt_spec_dst; /* RFC1122 specific destination */
struct inet_peer *peer; /* long-living peer info */
+ struct fib_info *fi; /* for client ref to shared metrics */
};
static inline bool rt_is_input_route(struct rtable *rt)