From c7066f70d9610df0b9406cc635fc09e86136e714 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 14 Jan 2011 13:36:42 +0100 Subject: netfilter: fix Kconfig dependencies Fix dependencies of netfilter realm match: it depends on NET_CLS_ROUTE, which itself depends on NET_SCHED; this dependency is missing from netfilter. Since matching on realms is also useful without having NET_SCHED enabled and the option really only controls whether the tclassid member is included in route and dst entries, rename the config option to IP_ROUTE_CLASSID and move it outside of traffic scheduling context to get rid of the NET_SCHED dependeny. Reported-by: Vladis Kletnieks Signed-off-by: Patrick McHardy --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index a5bd72646d65..6baba836ad8b 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -72,7 +72,7 @@ struct dst_entry { u32 metrics[RTAX_MAX]; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID __u32 tclassid; #else __u32 __pad2; -- cgit From 62fa8a846d7de4b299232e330c74b7783539df76 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 26 Jan 2011 20:51:05 -0800 Subject: net: Implement read-only protection and COW'ing of metrics. Routing metrics are now copy-on-write. Initially a route entry points it's metrics at a read-only location. If a routing table entry exists, it will point there. Else it will point at the all zero metric place-holder called 'dst_default_metrics'. The writeability state of the metrics is stored in the low bits of the metrics pointer, we have two bits left to spare if we want to store more states. For the initial implementation, COW is implemented simply via kmalloc. However future enhancements will change this to place the writable metrics somewhere else, in order to increase sharing. Very likely this "somewhere else" will be the inetpeer cache. Note also that this means that metrics updates may transiently fail if we cannot COW the metrics successfully. But even by itself, this patch should decrease memory usage and increase cache locality especially for routing workloads. In those cases the read-only metric copies stay in place and never get written to. TCP workloads where metrics get updated, and those rare cases where PMTU triggers occur, will take a very slight performance hit. But that hit will be alleviated when the long-term writable metrics move to a more sharable location. Since the metrics storage went from a u32 array of RTAX_MAX entries to what is essentially a pointer, some retooling of the dst_entry layout was necessary. Most importantly, we need to preserve the alignment of the reference count so that it doesn't share cache lines with the read-mostly state, as per Eric Dumazet's alignment assertion checks. The only non-trivial bit here is the move of the 'flags' member into the writeable cacheline. This is OK since we are always accessing the flags around the same moment when we made a modification to the reference count. Signed-off-by: David S. Miller --- include/net/dst.h | 114 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 37 deletions(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index be5a0d4c491d..94a8c234ea2a 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -40,24 +40,10 @@ struct dst_entry { struct rcu_head rcu_head; struct dst_entry *child; struct net_device *dev; - short error; - short obsolete; - int flags; -#define DST_HOST 0x0001 -#define DST_NOXFRM 0x0002 -#define DST_NOPOLICY 0x0004 -#define DST_NOHASH 0x0008 -#define DST_NOCACHE 0x0010 + struct dst_ops *ops; + unsigned long _metrics; unsigned long expires; - - unsigned short header_len; /* more space at head required */ - unsigned short trailer_len; /* space to reserve at tail */ - - unsigned int rate_tokens; - unsigned long rate_last; /* rate limiting for ICMP */ - struct dst_entry *path; - struct neighbour *neighbour; struct hh_cache *hh; #ifdef CONFIG_XFRM @@ -68,17 +54,16 @@ struct dst_entry { int (*input)(struct sk_buff*); int (*output)(struct sk_buff*); - struct dst_ops *ops; - - u32 _metrics[RTAX_MAX]; - + short error; + short obsolete; + unsigned short header_len; /* more space at head required */ + unsigned short trailer_len; /* space to reserve at tail */ #ifdef CONFIG_IP_ROUTE_CLASSID __u32 tclassid; #else __u32 __pad2; #endif - /* * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) @@ -93,6 +78,14 @@ struct dst_entry { atomic_t __refcnt; /* client references */ int __use; unsigned long lastuse; + unsigned long rate_last; /* rate limiting for ICMP */ + unsigned int rate_tokens; + int flags; +#define DST_HOST 0x0001 +#define DST_NOXFRM 0x0002 +#define DST_NOPOLICY 0x0004 +#define DST_NOHASH 0x0008 +#define DST_NOCACHE 0x0010 union { struct dst_entry *next; struct rtable __rcu *rt_next; @@ -103,10 +96,69 @@ struct dst_entry { #ifdef __KERNEL__ +extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); + +#define DST_METRICS_READ_ONLY 0x1UL +#define __DST_METRICS_PTR(Y) \ + ((u32 *)((Y) & ~DST_METRICS_READ_ONLY)) +#define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics) + +static inline bool dst_metrics_read_only(const struct dst_entry *dst) +{ + return dst->_metrics & DST_METRICS_READ_ONLY; +} + +extern void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old); + +static inline void dst_destroy_metrics_generic(struct dst_entry *dst) +{ + unsigned long val = dst->_metrics; + if (!(val & DST_METRICS_READ_ONLY)) + __dst_destroy_metrics_generic(dst, val); +} + +static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst) +{ + unsigned long p = dst->_metrics; + + if (p & DST_METRICS_READ_ONLY) + return dst->ops->cow_metrics(dst, p); + return __DST_METRICS_PTR(p); +} + +/* This may only be invoked before the entry has reached global + * visibility. + */ +static inline void dst_init_metrics(struct dst_entry *dst, + const u32 *src_metrics, + bool read_only) +{ + dst->_metrics = ((unsigned long) src_metrics) | + (read_only ? DST_METRICS_READ_ONLY : 0); +} + +static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) +{ + u32 *dst_metrics = dst_metrics_write_ptr(dest); + + if (dst_metrics) { + u32 *src_metrics = DST_METRICS_PTR(src); + + memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32)); + } +} + +static inline u32 *dst_metrics_ptr(struct dst_entry *dst) +{ + return DST_METRICS_PTR(dst); +} + static inline u32 dst_metric_raw(const struct dst_entry *dst, const int metric) { - return dst->_metrics[metric-1]; + u32 *p = DST_METRICS_PTR(dst); + + return p[metric-1]; } static inline u32 @@ -131,22 +183,10 @@ dst_metric_advmss(const struct dst_entry *dst) static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) { - dst->_metrics[metric-1] = val; -} - -static inline void dst_import_metrics(struct dst_entry *dst, const u32 *src_metrics) -{ - memcpy(dst->_metrics, src_metrics, RTAX_MAX * sizeof(u32)); -} + u32 *p = dst_metrics_write_ptr(dst); -static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) -{ - dst_import_metrics(dest, src->_metrics); -} - -static inline u32 *dst_metrics_ptr(struct dst_entry *dst) -{ - return dst->_metrics; + if (p) + p[metric-1] = val; } static inline u32 -- cgit From 725d1e1b457dc2bbebb337677e73efe7c6d14da5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 28 Jan 2011 14:05:05 -0800 Subject: ipv4: Attach FIB info to dst_default_metrics when possible If there are no explicit metrics attached to a route, hook fi->fib_info up to dst_default_metrics. Signed-off-by: David S. Miller --- include/net/dst.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index 94a8c234ea2a..484f80b69ada 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -97,6 +97,7 @@ struct dst_entry { #ifdef __KERNEL__ extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); +extern const u32 dst_default_metrics[RTAX_MAX]; #define DST_METRICS_READ_ONLY 0x1UL #define __DST_METRICS_PTR(Y) \ -- cgit From 92d8682926342d2b6aa5b2ecc02221e00e1573a0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 4 Feb 2011 15:55:25 -0800 Subject: inetpeer: Move ICMP rate limiting state into inet_peer entries. Like metrics, the ICMP rate limiting bits are cached state about a destination. So move it into the inet_peer entries. If an inet_peer cannot be bound (the reason is memory allocation failure or similar), the policy is to allow. Signed-off-by: David S. Miller --- include/net/dst.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index 484f80b69ada..e550195d4f86 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -78,8 +78,6 @@ struct dst_entry { atomic_t __refcnt; /* client references */ int __use; unsigned long lastuse; - unsigned long rate_last; /* rate limiting for ICMP */ - unsigned int rate_tokens; int flags; #define DST_HOST 0x0001 #define DST_NOXFRM 0x0002 -- cgit From e7b66bdc02592f5573ade667e4d68ac6e7b0f9e1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Feb 2011 15:33:22 -0800 Subject: net: Remove bogus barrier() in dst_allfrag(). I simply missed this one when modifying the other dst metric interfaces earlier. Signed-off-by: David S. Miller --- include/net/dst.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index e550195d4f86..e01855de21e8 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -220,8 +220,6 @@ static inline u32 dst_allfrag(const struct dst_entry *dst) { int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG); - /* Yes, _exactly_. This is paranoia. */ - barrier(); return ret; } -- cgit From 3c7bd1a14071b99d6535b710bc998ae5d3abbb66 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 16 Feb 2011 14:08:44 -0800 Subject: net: Add initial_ref arg to dst_alloc(). This allows avoiding multiple writes to the initial __refcnt. The most simplest cases of wanting an initial reference of "1" in ipv4 and ipv6 have been converted, the rest have been left along and kept at the existing "0". Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index e01855de21e8..23b564d3e110 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -352,7 +352,7 @@ static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb) } extern int dst_discard(struct sk_buff *skb); -extern void * dst_alloc(struct dst_ops * ops); +extern void *dst_alloc(struct dst_ops * ops, int initial_ref); extern void __dst_free(struct dst_entry * dst); extern struct dst_entry *dst_destroy(struct dst_entry * dst); -- cgit From dee9f4bceb5fd9dbfcc1567148fccdbf16d6a38a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 22 Feb 2011 18:44:31 -0800 Subject: net: Make flow cache paths use a const struct flowi. Signed-off-by: David S. Miller --- include/net/dst.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index 23b564d3e110..4fedffd7c56f 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -428,20 +428,22 @@ enum { struct flowi; #ifndef CONFIG_XFRM static inline int xfrm_lookup(struct net *net, struct dst_entry **dst_p, - struct flowi *fl, struct sock *sk, int flags) + const struct flowi *fl, struct sock *sk, + int flags) { return 0; } static inline int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, - struct flowi *fl, struct sock *sk, int flags) + const struct flowi *fl, struct sock *sk, + int flags) { return 0; } #else extern int xfrm_lookup(struct net *net, struct dst_entry **dst_p, - struct flowi *fl, struct sock *sk, int flags); + const struct flowi *fl, struct sock *sk, int flags); extern int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, - struct flowi *fl, struct sock *sk, int flags); + const struct flowi *fl, struct sock *sk, int flags); #endif #endif -- cgit From 80c0bc9e37adfc892af82cb6aa8cace79f8a96cb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 1 Mar 2011 14:36:37 -0800 Subject: xfrm: Kill XFRM_LOOKUP_WAIT flag. This can be determined from the flow flags instead. Signed-off-by: David S. Miller --- include/net/dst.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index 4fedffd7c56f..15d67c8d3ce8 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -421,8 +421,7 @@ extern void dst_init(void); /* Flags for xfrm_lookup flags argument. */ enum { - XFRM_LOOKUP_WAIT = 1 << 0, - XFRM_LOOKUP_ICMP = 1 << 1, + XFRM_LOOKUP_ICMP = 1 << 0, }; struct flowi; -- cgit From 2774c131b1d19920b4587db1cfbd6f0750ad1f15 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 1 Mar 2011 14:59:04 -0800 Subject: xfrm: Handle blackhole route creation via afinfo. That way we don't have to potentially do this in every xfrm_lookup() caller. Signed-off-by: David S. Miller --- include/net/dst.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index 15d67c8d3ce8..8948452132ad 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -432,17 +432,9 @@ static inline int xfrm_lookup(struct net *net, struct dst_entry **dst_p, { return 0; } -static inline int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, - const struct flowi *fl, struct sock *sk, - int flags) -{ - return 0; -} #else extern int xfrm_lookup(struct net *net, struct dst_entry **dst_p, const struct flowi *fl, struct sock *sk, int flags); -extern int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, - const struct flowi *fl, struct sock *sk, int flags); #endif #endif -- cgit From 452edd598f60522c11f7f88fdbab27eb36509d1a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 2 Mar 2011 13:27:41 -0800 Subject: xfrm: Return dst directly from xfrm_lookup() Instead of on the stack. Signed-off-by: David S. Miller --- include/net/dst.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/net/dst.h') diff --git a/include/net/dst.h b/include/net/dst.h index 8948452132ad..2a46cbaef92d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -426,15 +426,17 @@ enum { struct flowi; #ifndef CONFIG_XFRM -static inline int xfrm_lookup(struct net *net, struct dst_entry **dst_p, - const struct flowi *fl, struct sock *sk, - int flags) +static inline struct dst_entry *xfrm_lookup(struct net *net, + struct dst_entry *dst_orig, + const struct flowi *fl, struct sock *sk, + int flags) { - return 0; + return dst_orig; } #else -extern int xfrm_lookup(struct net *net, struct dst_entry **dst_p, - const struct flowi *fl, struct sock *sk, int flags); +extern struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, struct sock *sk, + int flags); #endif #endif -- cgit