From e0f802fbcaa3bffe4728e37a8fa1279b5d554173 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Tue, 17 Jun 2014 11:25:37 +0300 Subject: tcp: move ir_mark initialization to tcp_openreq_init ir_mark initialization is done for both TCP v4 and v6, move it in the common tcp_openreq_init function. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77cccda1ad0c..180336d47df6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1311,14 +1311,13 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); + tcp_openreq_init(req, &tmp_opt, skb, sk); ireq = inet_rsk(req); ireq->ir_loc_addr = daddr; ireq->ir_rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(skb); - ireq->ir_mark = inet_request_mark(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; -- cgit From 7200135bc1e61f1437dc326ae2ef2f310c50b4eb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 16 Jun 2014 13:01:52 +0200 Subject: netfilter: kill ulog targets This has been marked as deprecated for quite some time and the NFLOG target replacement has been also available since 2006. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 19 -- net/ipv4/netfilter/ipt_ULOG.c | 498 ------------------------------------------ 2 files changed, 517 deletions(-) delete mode 100644 net/ipv4/netfilter/ipt_ULOG.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a26ce035e3fa..730faacbc5f8 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -159,25 +159,6 @@ config IP_NF_TARGET_SYNPROXY To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_ULOG - tristate "ULOG target support (obsolete)" - default m if NETFILTER_ADVANCED=n - ---help--- - - This option enables the old IPv4-only "ipt_ULOG" implementation - which has been obsoleted by the new "nfnetlink_log" code (see - CONFIG_NETFILTER_NETLINK_LOG). - - This option adds a `ULOG' target, which allows you to create rules in - any iptables table. The packet is passed to a userspace logging - daemon using netlink multicast sockets; unlike the LOG target - which can only be viewed through syslog. - - The appropriate userspace logging daemon (ulogd) may be obtained from - - - To compile it as a module, choose M here. If unsure, say N. - # NAT + specific targets: nf_conntrack config NF_NAT_IPV4 tristate "IPv4 NAT" diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c deleted file mode 100644 index 9cb993cd224b..000000000000 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * netfilter module for userspace packet logging daemons - * - * (C) 2000-2004 by Harald Welte - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * (C) 2005-2007 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This module accepts two parameters: - * - * nlbufsiz: - * The parameter specifies how big the buffer for each netlink multicast - * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will - * get accumulated in the kernel until they are sent to userspace. It is - * NOT possible to allocate more than 128kB, and it is strongly discouraged, - * because atomically allocating 128kB inside the network rx softirq is not - * reliable. Please also keep in mind that this buffer size is allocated for - * each nlgroup you are using, so the total kernel memory usage increases - * by that factor. - * - * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since - * nlbufsiz is used with alloc_skb, which adds another - * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead. - * - * flushtimeout: - * Specify, after how many hundredths of a second the queue should be - * flushed even if it is not full yet. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); - -#define ULOG_NL_EVENT 111 /* Harald's favorite number */ -#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ - -static unsigned int nlbufsiz = NLMSG_GOODSIZE; -module_param(nlbufsiz, uint, 0400); -MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); - -static unsigned int flushtimeout = 10; -module_param(flushtimeout, uint, 0600); -MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); - -static bool nflog = true; -module_param(nflog, bool, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - -/* global data structures */ - -typedef struct { - unsigned int qlen; /* number of nlmsgs' in the skb */ - struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ - struct sk_buff *skb; /* the pre-allocated skb */ - struct timer_list timer; /* the timer function */ -} ulog_buff_t; - -static int ulog_net_id __read_mostly; -struct ulog_net { - unsigned int nlgroup[ULOG_MAXNLGROUPS]; - ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; - struct sock *nflognl; - spinlock_t lock; -}; - -static struct ulog_net *ulog_pernet(struct net *net) -{ - return net_generic(net, ulog_net_id); -} - -/* send one ulog_buff_t to userspace */ -static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum) -{ - ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum]; - - pr_debug("ulog_send: timer is deleting\n"); - del_timer(&ub->timer); - - if (!ub->skb) { - pr_debug("ulog_send: nothing to send\n"); - return; - } - - /* last nlmsg needs NLMSG_DONE */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_type = NLMSG_DONE; - - NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; - pr_debug("throwing %d packets to netlink group %u\n", - ub->qlen, nlgroupnum + 1); - netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1, - GFP_ATOMIC); - - ub->qlen = 0; - ub->skb = NULL; - ub->lastnlh = NULL; -} - - -/* timer function to flush queue in flushtimeout time */ -static void ulog_timer(unsigned long data) -{ - unsigned int groupnum = *((unsigned int *)data); - struct ulog_net *ulog = container_of((void *)data, - struct ulog_net, - nlgroup[groupnum]); - pr_debug("timer function called, calling ulog_send\n"); - - /* lock to protect against somebody modifying our structure - * from ipt_ulog_target at the same time */ - spin_lock_bh(&ulog->lock); - ulog_send(ulog, groupnum); - spin_unlock_bh(&ulog->lock); -} - -static struct sk_buff *ulog_alloc_skb(unsigned int size) -{ - struct sk_buff *skb; - unsigned int n; - - /* alloc skb which should be big enough for a whole - * multipart message. WARNING: has to be <= 131000 - * due to slab allocator restrictions */ - - n = max(size, nlbufsiz); - skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); - if (!skb) { - if (n > size) { - /* try to allocate only as much as we need for - * current packet */ - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - pr_debug("cannot even allocate %ub\n", size); - } - } - - return skb; -} - -static void ipt_ulog_packet(struct net *net, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct ipt_ulog_info *loginfo, - const char *prefix) -{ - ulog_buff_t *ub; - ulog_packet_msg_t *pm; - size_t size, copy_len; - struct nlmsghdr *nlh; - struct timeval tv; - struct ulog_net *ulog = ulog_pernet(net); - - /* ffs == find first bit set, necessary because userspace - * is already shifting groupnumber, but we need unshifted. - * ffs() returns [1..32], we need [0..31] */ - unsigned int groupnum = ffs(loginfo->nl_group) - 1; - - /* calculate the size of the skb needed */ - if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len) - copy_len = skb->len; - else - copy_len = loginfo->copy_range; - - size = nlmsg_total_size(sizeof(*pm) + copy_len); - - ub = &ulog->ulog_buffers[groupnum]; - - spin_lock_bh(&ulog->lock); - - if (!ub->skb) { - if (!(ub->skb = ulog_alloc_skb(size))) - goto alloc_failure; - } else if (ub->qlen >= loginfo->qthreshold || - size > skb_tailroom(ub->skb)) { - /* either the queue len is too high or we don't have - * enough room in nlskb left. send it to userspace. */ - - ulog_send(ulog, groupnum); - - if (!(ub->skb = ulog_alloc_skb(size))) - goto alloc_failure; - } - - pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); - - nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, - sizeof(*pm)+copy_len, 0); - if (!nlh) { - pr_debug("error during nlmsg_put\n"); - goto out_unlock; - } - ub->qlen++; - - pm = nlmsg_data(nlh); - memset(pm, 0, sizeof(*pm)); - - /* We might not have a timestamp, get one */ - if (skb->tstamp.tv64 == 0) - __net_timestamp((struct sk_buff *)skb); - - /* copy hook, prefix, timestamp, payload, etc. */ - pm->data_len = copy_len; - tv = ktime_to_timeval(skb->tstamp); - put_unaligned(tv.tv_sec, &pm->timestamp_sec); - put_unaligned(tv.tv_usec, &pm->timestamp_usec); - put_unaligned(skb->mark, &pm->mark); - pm->hook = hooknum; - if (prefix != NULL) { - strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1); - pm->prefix[sizeof(pm->prefix) - 1] = '\0'; - } - else if (loginfo->prefix[0] != '\0') - strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); - - if (in && in->hard_header_len > 0 && - skb->mac_header != skb->network_header && - in->hard_header_len <= ULOG_MAC_LEN) { - memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); - pm->mac_len = in->hard_header_len; - } else - pm->mac_len = 0; - - if (in) - strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); - - if (out) - strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); - - /* copy_len <= skb->len, so can't fail. */ - if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) - BUG(); - - /* check if we are building multi-part messages */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; - - ub->lastnlh = nlh; - - /* if timer isn't already running, start it */ - if (!timer_pending(&ub->timer)) { - ub->timer.expires = jiffies + flushtimeout * HZ / 100; - add_timer(&ub->timer); - } - - /* if threshold is reached, send message to userspace */ - if (ub->qlen >= loginfo->qthreshold) { - if (loginfo->qthreshold > 1) - nlh->nlmsg_type = NLMSG_DONE; - ulog_send(ulog, groupnum); - } -out_unlock: - spin_unlock_bh(&ulog->lock); - - return; - -alloc_failure: - pr_debug("Error building netlink message\n"); - spin_unlock_bh(&ulog->lock); -} - -static unsigned int -ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - - ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out, - par->targinfo, NULL); - return XT_CONTINUE; -} - -static void ipt_logfn(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li, - const char *prefix) -{ - struct ipt_ulog_info loginfo; - - if (!li || li->type != NF_LOG_TYPE_ULOG) { - loginfo.nl_group = ULOG_DEFAULT_NLGROUP; - loginfo.copy_range = 0; - loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; - loginfo.prefix[0] = '\0'; - } else { - loginfo.nl_group = li->u.ulog.group; - loginfo.copy_range = li->u.ulog.copy_len; - loginfo.qthreshold = li->u.ulog.qthreshold; - strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); - } - - ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); -} - -static int ulog_tg_check(const struct xt_tgchk_param *par) -{ - const struct ipt_ulog_info *loginfo = par->targinfo; - - if (!par->net->xt.ulog_warn_deprecated) { - pr_info("ULOG is deprecated and it will be removed soon, " - "use NFLOG instead\n"); - par->net->xt.ulog_warn_deprecated = true; - } - - if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { - pr_debug("prefix not null-terminated\n"); - return -EINVAL; - } - if (loginfo->qthreshold > ULOG_MAX_QLEN) { - pr_debug("queue threshold %Zu > MAX_QLEN\n", - loginfo->qthreshold); - return -EINVAL; - } - return 0; -} - -#ifdef CONFIG_COMPAT -struct compat_ipt_ulog_info { - compat_uint_t nl_group; - compat_size_t copy_range; - compat_size_t qthreshold; - char prefix[ULOG_PREFIX_LEN]; -}; - -static void ulog_tg_compat_from_user(void *dst, const void *src) -{ - const struct compat_ipt_ulog_info *cl = src; - struct ipt_ulog_info l = { - .nl_group = cl->nl_group, - .copy_range = cl->copy_range, - .qthreshold = cl->qthreshold, - }; - - memcpy(l.prefix, cl->prefix, sizeof(l.prefix)); - memcpy(dst, &l, sizeof(l)); -} - -static int ulog_tg_compat_to_user(void __user *dst, const void *src) -{ - const struct ipt_ulog_info *l = src; - struct compat_ipt_ulog_info cl = { - .nl_group = l->nl_group, - .copy_range = l->copy_range, - .qthreshold = l->qthreshold, - }; - - memcpy(cl.prefix, l->prefix, sizeof(cl.prefix)); - return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0; -} -#endif /* CONFIG_COMPAT */ - -static struct xt_target ulog_tg_reg __read_mostly = { - .name = "ULOG", - .family = NFPROTO_IPV4, - .target = ulog_tg, - .targetsize = sizeof(struct ipt_ulog_info), - .checkentry = ulog_tg_check, -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_ipt_ulog_info), - .compat_from_user = ulog_tg_compat_from_user, - .compat_to_user = ulog_tg_compat_to_user, -#endif - .me = THIS_MODULE, -}; - -static struct nf_logger ipt_ulog_logger __read_mostly = { - .name = "ipt_ULOG", - .logfn = ipt_logfn, - .me = THIS_MODULE, -}; - -static int __net_init ulog_tg_net_init(struct net *net) -{ - int i; - struct ulog_net *ulog = ulog_pernet(net); - struct netlink_kernel_cfg cfg = { - .groups = ULOG_MAXNLGROUPS, - }; - - spin_lock_init(&ulog->lock); - /* initialize ulog_buffers */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ulog->nlgroup[i] = i; - setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, - (unsigned long)&ulog->nlgroup[i]); - } - - ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); - if (!ulog->nflognl) - return -ENOMEM; - - if (nflog) - nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger); - - return 0; -} - -static void __net_exit ulog_tg_net_exit(struct net *net) -{ - ulog_buff_t *ub; - int i; - struct ulog_net *ulog = ulog_pernet(net); - - if (nflog) - nf_log_unset(net, &ipt_ulog_logger); - - netlink_kernel_release(ulog->nflognl); - - /* remove pending timers and free allocated skb's */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ub = &ulog->ulog_buffers[i]; - pr_debug("timer is deleting\n"); - del_timer(&ub->timer); - - if (ub->skb) { - kfree_skb(ub->skb); - ub->skb = NULL; - } - } -} - -static struct pernet_operations ulog_tg_net_ops = { - .init = ulog_tg_net_init, - .exit = ulog_tg_net_exit, - .id = &ulog_net_id, - .size = sizeof(struct ulog_net), -}; - -static int __init ulog_tg_init(void) -{ - int ret; - pr_debug("init module\n"); - - if (nlbufsiz > 128*1024) { - pr_warn("Netlink buffer has to be <= 128kB\n"); - return -EINVAL; - } - - ret = register_pernet_subsys(&ulog_tg_net_ops); - if (ret) - goto out_pernet; - - ret = xt_register_target(&ulog_tg_reg); - if (ret < 0) - goto out_target; - - if (nflog) - nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); - - return 0; - -out_target: - unregister_pernet_subsys(&ulog_tg_net_ops); -out_pernet: - return ret; -} - -static void __exit ulog_tg_exit(void) -{ - pr_debug("cleanup_module\n"); - if (nflog) - nf_log_unregister(&ipt_ulog_logger); - xt_unregister_target(&ulog_tg_reg); - unregister_pernet_subsys(&ulog_tg_net_ops); -} - -module_init(ulog_tg_init); -module_exit(ulog_tg_exit); -- cgit From 1990e4f883a6383b04ef37f32bdaa02dc0dbae8d Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 9 May 2014 23:43:42 +0200 Subject: vti: Simplify error handling in module init and exit The error handling in the module init and exit functions can be shortened to safe us some code. 1/ Remove the code duplications in the init function, jump straight to the existing cleanup code by adding some labels. Also give the error message some more value by telling the reason why loading the module has failed. Furthermore fix the "IPSec" typo -- it should be "IPsec" instead. 2/ Remove the error handling in the exit function as the only legitimate reason xfrm4_protocol_deregister() might fail is inet_del_protocol() returning -1. That, in turn, means some other protocol handler had been registered for this very protocol in the meantime. But that essentially means we haven't been handling that protocol any more, anyway. What it definitely means not is that we "can't deregister tunnel". Therefore just get rid of that bogus warning. It's plain wrong. Signed-off-by: Mathias Krause Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 54 +++++++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 33 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index b8960f3527f3..e453cb724a95 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -534,40 +534,28 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = { static int __init vti_init(void) { + const char *msg; int err; - pr_info("IPv4 over IPSec tunneling driver\n"); + pr_info("IPv4 over IPsec tunneling driver\n"); + msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) - return err; - err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); - if (err < 0) { - unregister_pernet_device(&vti_net_ops); - pr_info("vti init: can't register tunnel\n"); - - return err; - } + goto pernet_dev_failed; + msg = "tunnel protocols"; + err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); - if (err < 0) { - xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); - unregister_pernet_device(&vti_net_ops); - pr_info("vti init: can't register tunnel\n"); - - return err; - } - + if (err < 0) + goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); - if (err < 0) { - xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); - xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); - unregister_pernet_device(&vti_net_ops); - pr_info("vti init: can't register tunnel\n"); - - return err; - } + if (err < 0) + goto xfrm_proto_comp_failed; + msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) goto rtnl_link_failed; @@ -576,23 +564,23 @@ static int __init vti_init(void) rtnl_link_failed: xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); +xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); +pernet_dev_failed: + pr_err("vti init: failed to register %s\n", msg); return err; } static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) - pr_info("vti close: can't deregister tunnel\n"); - if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH)) - pr_info("vti close: can't deregister tunnel\n"); - if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP)) - pr_info("vti close: can't deregister tunnel\n"); - - + xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); } -- cgit From 83e96d443b372611adf19e4171d41deb1d8760cf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 19 Jun 2014 20:47:14 +0200 Subject: netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files The plain text logging is currently embedded into the xt_LOG target. In order to be able to use the plain text logging from nft_log, as a first step, this patch moves the family specific code to the following files and Kconfig symbols: 1) net/ipv4/netfilter/nf_log_ip.c: CONFIG_NF_LOG_IPV4 2) net/ipv6/netfilter/nf_log_ip6.c: CONFIG_NF_LOG_IPV6 3) net/netfilter/nf_log_common.c: CONFIG_NF_LOG_COMMON These new modules will be required by xt_LOG and nft_log. This patch is based on original patch from Arturo Borrero Gonzalez. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 5 + net/ipv4/netfilter/Makefile | 3 + net/ipv4/netfilter/nf_log_ipv4.c | 385 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 393 insertions(+) create mode 100644 net/ipv4/netfilter/nf_log_ipv4.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 730faacbc5f8..0c980293d225 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -159,6 +159,11 @@ config IP_NF_TARGET_SYNPROXY To compile it as a module, choose M here. If unsure, say N. +config NF_LOG_IPV4 + tristate "IPv4 packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + # NAT + specific targets: nf_conntrack config NF_NAT_IPV4 tristate "IPv4 NAT" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 90b82405331e..730e0c1ead90 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -19,6 +19,9 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o # defrag obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o +# logging +obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o + # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c new file mode 100644 index 000000000000..7e69a401a29e --- /dev/null +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -0,0 +1,385 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* One level of recursion won't kill us */ +static void dump_ipv4_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int iphoff) +{ + struct iphdr _iph; + const struct iphdr *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + nf_log_buf_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + nf_log_buf_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + nf_log_buf_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & XT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + const unsigned char *op; + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + nf_log_buf_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4)) + return; + break; + case IPPROTO_ICMP: { + struct icmphdr _icmph; + const struct icmphdr *ich; + static const size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + nf_log_buf_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + nf_log_buf_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + nf_log_buf_add(m, "["); + dump_ipv4_packet(m, info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) { + nf_log_buf_add(m, "MTU=%u ", + ntohs(ich->un.frag.mtu)); + } + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + nf_log_buf_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 10 "PROTO=ESP " */ + nf_log_buf_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && !iphoff) + nf_log_dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void dump_ipv4_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + nf_log_buf_add(m, ":%02x", *p); + } + nf_log_buf_add(m, " "); +} + +void nf_log_ip_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, + out, loginfo, prefix); + + if (in != NULL) + dump_ipv4_mac_header(m, loginfo, skb); + + dump_ipv4_packet(m, loginfo, skb, 0); + + nf_log_buf_close(m); +} +EXPORT_SYMBOL_GPL(nf_log_ip_packet); + +static struct nf_logger nf_ip_logger __read_mostly = { + .name = "nf_log_ipv4", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_ipv4_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); + return 0; +} + +static void __net_exit nf_log_ipv4_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_ip_logger); +} + +static struct pernet_operations nf_log_ipv4_net_ops = { + .init = nf_log_ipv4_net_init, + .exit = nf_log_ipv4_net_exit, +}; + +static int __init nf_log_ipv4_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_ipv4_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + return 0; +} + +static void __exit nf_log_ipv4_exit(void) +{ + unregister_pernet_subsys(&nf_log_ipv4_net_ops); + nf_log_unregister(&nf_ip_logger); +} + +module_init(nf_log_ipv4_init); +module_exit(nf_log_ipv4_exit); + +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); +MODULE_LICENSE("GPL"); -- cgit From fab4085f4e248b8a80bb1dadbbacb2bacd8017c3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jun 2014 19:38:25 +0200 Subject: netfilter: log: nf_log_packet() as real unified interface Before this patch, the nf_loginfo parameter specified the logging configuration in case the specified default logger was loaded. This patch updates the semantics of the nf_loginfo parameter in nf_log_packet() which now indicates the logger that you explicitly want to use. Thus, nf_log_packet() is exposed as an unified interface which internally routes the log message to the corresponding logger type by family. The module dependencies are expressed by the new nf_logger_find_get() and nf_logger_put() functions which bump the logger module refcount. Thus, you can not remove logger modules that are used by rules anymore. Another important effect of this change is that the family specific module is only loaded when required. Therefore, xt_LOG and nft_log will just trigger the autoload of the nf_log_{ip,ip6} modules according to the family. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_log_ipv4.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 7e69a401a29e..078bdca1b607 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -306,12 +306,12 @@ fallback: nf_log_buf_add(m, " "); } -void nf_log_ip_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) +static void nf_log_ip_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) { struct nf_log_buf *m; @@ -334,7 +334,6 @@ void nf_log_ip_packet(struct net *net, u_int8_t pf, nf_log_buf_close(m); } -EXPORT_SYMBOL_GPL(nf_log_ip_packet); static struct nf_logger nf_ip_logger __read_mostly = { .name = "nf_log_ipv4", @@ -383,3 +382,4 @@ module_exit(nf_log_ipv4_exit); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(AF_INET, 0); -- cgit From 35b9395104d51f4b85847fa72a1bf4136d36c56e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 20 Jun 2014 20:36:23 +0200 Subject: netfilter: add generic ARP packet logger This adds the generic plain text packet loggger for ARP packets. It is based on the ebt_log code. Nevertheless, the output has been modified to make it consistent with the original xt_LOG output. This is an example output: IN=wlan0 OUT= ARP HTYPE=1 PTYPE=0x0800 OPCODE=2 MACSRC=00:ab:12:34:55:63 IPSRC=192.168.10.1 MACDST=80:09:12:70:4f:50 IPDST=192.168.10.150 This patch enables packet logging from ARP chains, eg. nft add rule arp filter input log prefix "input: " Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 5 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nf_log_arp.c | 149 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+) create mode 100644 net/ipv4/netfilter/nf_log_arp.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 0c980293d225..12e6057daea1 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -159,6 +159,11 @@ config IP_NF_TARGET_SYNPROXY To compile it as a module, choose M here. If unsure, say N. +config NF_LOG_ARP + tristate "ARP packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_INET + config NF_LOG_IPV4 tristate "IPv4 packet logging" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 730e0c1ead90..245db9df3337 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o # logging +obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o # NAT helpers (nf_conntrack) diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c new file mode 100644 index 000000000000..ccfc78db12ee --- /dev/null +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -0,0 +1,149 @@ +/* + * (C) 2014 by Pablo Neira Ayuso + * + * Based on code from ebt_log from: + * + * Bart De Schuymer + * Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +struct arppayload { + unsigned char mac_src[ETH_ALEN]; + unsigned char ip_src[4]; + unsigned char mac_dst[ETH_ALEN]; + unsigned char ip_dst[4]; +}; + +static void dump_arp_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int nhoff) +{ + const struct arphdr *ah; + struct arphdr _arph; + const struct arppayload *ap; + struct arppayload _arpp; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); + + /* If it's for Ethernet and the lengths are OK, then log the ARP + * payload. + */ + if (ah->ar_hrd != htons(1) || + ah->ar_hln != ETH_ALEN || + ah->ar_pln != sizeof(__be32)) + return; + + ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); + if (ap == NULL) { + nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]", + skb->len - sizeof(_arph)); + return; + } + nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", + ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); +} + +void nf_log_arp_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, + prefix); + dump_arp_packet(m, loginfo, skb, 0); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_arp_logger __read_mostly = { + .name = "nf_log_arp", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_arp_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_arp_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); + return 0; +} + +static void __net_exit nf_log_arp_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_arp_logger); +} + +static struct pernet_operations nf_log_arp_net_ops = { + .init = nf_log_arp_net_init, + .exit = nf_log_arp_net_exit, +}; + +static int __init nf_log_arp_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_arp_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_ARP, &nf_arp_logger); + return 0; +} + +static void __exit nf_log_arp_exit(void) +{ + unregister_pernet_subsys(&nf_log_arp_net_ops); + nf_log_unregister(&nf_arp_logger); +} + +module_init(nf_log_arp_init); +module_exit(nf_log_arp_exit); + +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_DESCRIPTION("Netfilter ARP packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(3, 0); -- cgit From 57b47553f65e12e2e4f1608168374b0e651de843 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:50 +0300 Subject: tcp: cookie_v4_init_sequence: skb should be const Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c86624b36a62..c0c75688896e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); -__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, + __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); -- cgit From aa27fc501850030fb5d1ee705feb836ee6a21f2a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:51 +0300 Subject: tcp: tcp_v[46]_conn_request: fix snt_synack initialization Commit 016818d07 (tcp: TCP Fast Open Server - take SYNACK RTT after completing 3WHS) changes the code to only take a snt_synack timestamp when a SYNACK transmit or retransmit succeeds. This behaviour is later broken by commit 843f4a55e (tcp: use tcp_v4_send_synack on first SYN-ACK), as snt_synack is now updated even if tcp_v4_send_synack fails. Also, commit 3a19ce0ee (tcp: IPv6 support for fastopen server) misses the required IPv6 updates for 016818d07. This patch makes sure that snt_synack is updated only when the SYNACK trasnmit/retransmit succeeds, for both IPv4 and IPv6. Cc: Cardwell Cc: Daniel Lee Cc: Yuchung Cheng Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 180336d47df6..145f6402c560 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1370,7 +1370,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_openreq_init_rwin(req, sk, dst); fastopen = !want_cookie && tcp_try_fastopen(sk, skb, req, &foc, dst); @@ -1380,7 +1379,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (err || want_cookie) goto drop_and_free; - tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_rsk(req)->listener = NULL; inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } -- cgit From 16bea70aa7302b6f3bf3502d5a0efb4ea2ce4712 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:53 +0300 Subject: tcp: add init_req method to tcp_request_sock_ops Move the specific IPv4/IPv6 intializations to a new method in tcp_request_sock_ops in preparation for unifying tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 145f6402c560..f86a86b30d20 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1237,6 +1237,17 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) #endif +static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, + struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ir_loc_addr = ip_hdr(skb)->daddr; + ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; + ireq->opt = tcp_v4_save_options(skb); +} + struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), @@ -1247,26 +1258,26 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -#ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_reqsk_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, -}; #endif + .init_req = tcp_v4_init_req, +}; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tmp_opt; struct request_sock *req; - struct inet_request_sock *ireq; struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = NULL; __be32 saddr = ip_hdr(skb)->saddr; - __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; bool want_cookie = false, fastopen; struct flowi4 fl4; struct tcp_fastopen_cookie foc = { .len = -1 }; + const struct tcp_request_sock_ops *af_ops; int err; /* Never answer to SYNs send to broadcast or multicast */ @@ -1298,9 +1309,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (!req) goto drop; -#ifdef CONFIG_TCP_MD5SIG - tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; -#endif + af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; @@ -1313,11 +1322,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); - ireq = inet_rsk(req); - ireq->ir_loc_addr = daddr; - ireq->ir_rmt_addr = saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; - ireq->opt = tcp_v4_save_options(skb); + af_ops->init_req(req, sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; -- cgit From fb7b37a7f3d6f7b7ba05ee526fee96810d5b92a8 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:54 +0300 Subject: tcp: add init_cookie_seq method to tcp_request_sock_ops Move the specific IPv4/IPv6 cookie sequence initialization to a new method in tcp_request_sock_ops in preparation for unifying tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f86a86b30d20..8c69e44c287b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1264,6 +1264,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .calc_md5_hash = tcp_v4_md5_hash_skb, #endif .init_req = tcp_v4_init_req, +#ifdef CONFIG_SYN_COOKIES + .cookie_init_seq = cookie_v4_init_sequence, +#endif }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1331,7 +1334,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) TCP_ECN_create_request(req, skb, sock_net(sk)); if (want_cookie) { - isn = cookie_v4_init_sequence(sk, skb, &req->mss); + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { /* VJ's idea. We save last timestamp seen -- cgit From d94e0417ad8d96d7d96b69335338ad942eaeecf1 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:55 +0300 Subject: tcp: add route_req method to tcp_request_sock_ops Create wrappers with same signature for the IPv4/IPv6 request routing calls and use these wrappers (via route_req method from tcp_request_sock_ops) in tcp_v4_conn_request and tcp_v6_conn_request with the purpose of unifying the two functions in a later patch. We can later drop the wrapper functions and modify inet_csk_route_req and inet6_cks_route_req to use the same signature. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8c69e44c287b..54fbbd8b4fcd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1248,6 +1248,22 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, ireq->opt = tcp_v4_save_options(skb); } +static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, + const struct request_sock *req, + bool *strict) +{ + struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); + + if (strict) { + if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) + *strict = true; + else + *strict = false; + } + + return dst; +} + struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), @@ -1267,6 +1283,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v4_init_sequence, #endif + .route_req = tcp_v4_route_req, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1346,11 +1363,13 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ - if (tmp_opt.saw_tstamp && - tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && - fl4.daddr == saddr) { - if (!tcp_peer_is_proven(req, dst, true)) { + if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { + bool strict; + + dst = af_ops->route_req(sk, (struct flowi *)&fl4, req, + &strict); + if (dst && strict && + !tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -1374,8 +1393,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v4_init_sequence(skb); } - if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) - goto drop_and_free; + if (!dst) { + dst = af_ops->route_req(sk, (struct flowi *)&fl4, req, NULL); + if (!dst) + goto drop_and_free; + } tcp_rsk(req)->snt_isn = isn; tcp_openreq_init_rwin(req, sk, dst); -- cgit From 936b8bdb53f90840e658904530f9db8d02ac804b Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:57 +0300 Subject: tcp: add init_seq method to tcp_request_sock_ops More work in preparation of unifying tcp_v4_conn_request and tcp_v6_conn_request: indirect the init sequence calls via the tcp_request_sock_ops. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 54fbbd8b4fcd..43478265006a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -99,7 +99,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) +static __u32 tcp_v4_init_sequence(const struct sk_buff *skb) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, @@ -1284,6 +1284,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .cookie_init_seq = cookie_v4_init_sequence, #endif .route_req = tcp_v4_route_req, + .init_seq = tcp_v4_init_sequence, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1391,7 +1392,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_release; } - isn = tcp_v4_init_sequence(skb); + isn = af_ops->init_seq(skb); } if (!dst) { dst = af_ops->route_req(sk, (struct flowi *)&fl4, req, NULL); -- cgit From d6274bd8d6ea84b7b54cc1c3fde6bcb6143b104f Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:58 +0300 Subject: tcp: add send_synack method to tcp_request_sock_ops Create a new tcp_request_sock_ops method to unify the IPv4/IPv6 signature for tcp_v[46]_send_synack. This allows us to later unify tcp_v4_rtx_synack with tcp_v6_rtx_synack and tcp_v4_conn_request with tcp_v4_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 43478265006a..b5945ac50876 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -814,6 +814,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * socket. */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc) @@ -846,7 +847,8 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); + const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; + int res = af_ops->send_synack(sk, NULL, NULL, req, 0, NULL); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); @@ -1285,6 +1287,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { #endif .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, + .send_synack = tcp_v4_send_synack, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1404,8 +1407,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_openreq_init_rwin(req, sk, dst); fastopen = !want_cookie && tcp_try_fastopen(sk, skb, req, &foc, dst); - err = tcp_v4_send_synack(sk, dst, req, - skb_get_queue_mapping(skb), &foc); + err = af_ops->send_synack(sk, dst, NULL, req, + skb_get_queue_mapping(skb), &foc); if (!fastopen) { if (err || want_cookie) goto drop_and_free; -- cgit From 5db92c994982ed826cf38f38d58bd09bc326aef6 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:59 +0300 Subject: tcp: unify tcp_v4_rtx_synack and tcp_v6_rtx_synack Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 14 +------------- net/ipv4/tcp_output.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b5945ac50876..597dd9d75210 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -845,18 +845,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, return err; } -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) -{ - const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; - int res = af_ops->send_synack(sk, NULL, NULL, req, 0, NULL); - - if (!res) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - } - return res; -} - /* * IPv4 request_sock destructor. */ @@ -1269,7 +1257,7 @@ static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_v4_rtx_synack, + .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d92bce0ea24e..f8f2a944a1ce 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3299,3 +3299,18 @@ void tcp_send_probe0(struct sock *sk) TCP_RTO_MAX); } } + +int tcp_rtx_synack(struct sock *sk, struct request_sock *req) +{ + const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; + struct flowi fl; + int res; + + res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + if (!res) { + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } + return res; +} +EXPORT_SYMBOL(tcp_rtx_synack); -- cgit From 2aec4a297b21f3690486bbf8f7d5d29281ba6a48 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:10:00 +0300 Subject: tcp: add mss_clamp to tcp_request_sock_ops Add mss_clamp member to tcp_request_sock_ops so that we can later unify tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 597dd9d75210..499d440539ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1265,6 +1265,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { }; static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { + .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_reqsk_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, @@ -1324,7 +1325,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = TCP_MSS_DEFAULT; + tmp_opt.mss_clamp = af_ops->mss_clamp; tmp_opt.user_mss = tp->rx_opt.user_mss; tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); -- cgit From 695da14eb0af21129187ed3810e329b21262e45f Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:10:01 +0300 Subject: tcp: add queue_add_hash to tcp_request_sock_ops Add queue_add_hash member to tcp_request_sock_ops so that we can later unify tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 499d440539ad..845c39de97ab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1277,6 +1277,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, .send_synack = tcp_v4_send_synack, + .queue_hash_add = inet_csk_reqsk_queue_hash_add, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1403,7 +1404,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; tcp_rsk(req)->listener = NULL; - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } return 0; -- cgit From 1fb6f159fd21c640a28eb65fbd62ce8c9f6a777e Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:10:02 +0300 Subject: tcp: add tcp_conn_request Create tcp_conn_request and remove most of the code from tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 128 +------------------------------------------- 2 files changed, 150 insertions(+), 126 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b5c23756965a..97e48d60c4e8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5877,3 +5877,151 @@ discard: return 0; } EXPORT_SYMBOL(tcp_rcv_state_process); + +static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + if (family == AF_INET) + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), + &ireq->ir_rmt_addr, port); + else + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), + &ireq->ir_v6_rmt_addr, port); +} + +int tcp_conn_request(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tmp_opt; + struct request_sock *req; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; + __u32 isn = TCP_SKB_CB(skb)->when; + bool want_cookie = false, fastopen; + struct flowi fl; + struct tcp_fastopen_cookie foc = { .len = -1 }; + int err; + + + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { + want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } + + + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + req = inet_reqsk_alloc(rsk_ops); + if (!req) + goto drop; + + tcp_rsk(req)->af_specific = af_ops; + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = af_ops->mss_clamp; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); + + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb, sk); + + af_ops->init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, skb, sock_net(sk)); + + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + } else if (!isn) { + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { + bool strict; + + dst = af_ops->route_req(sk, &fl, req, &strict); + if (dst && strict && + !tcp_peer_is_proven(req, dst, true)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst, false)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + pr_drop_req(req, ntohs(tcp_hdr(skb)->source), + rsk_ops->family); + goto drop_and_release; + } + + isn = af_ops->init_seq(skb); + } + if (!dst) { + dst = af_ops->route_req(sk, &fl, req, NULL); + if (!dst) + goto drop_and_free; + } + + tcp_rsk(req)->snt_isn = isn; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = af_ops->send_synack(sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } + + return 0; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); +drop: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + return 0; +} +EXPORT_SYMBOL(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 845c39de97ab..5dfebd2f2e38 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1282,137 +1282,13 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_options_received tmp_opt; - struct request_sock *req; - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = NULL; - __be32 saddr = ip_hdr(skb)->saddr; - __u32 isn = TCP_SKB_CB(skb)->when; - bool want_cookie = false, fastopen; - struct flowi4 fl4; - struct tcp_fastopen_cookie foc = { .len = -1 }; - const struct tcp_request_sock_ops *af_ops; - int err; - /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - if ((sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); - if (!want_cookie) - goto drop; - } - - /* Accept backlog is full. If we have already queued enough - * of warm entries in syn queue, drop request. It is better than - * clogging syn queue with openreqs with exponentially increasing - * timeout. - */ - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); - goto drop; - } - - req = inet_reqsk_alloc(&tcp_request_sock_ops); - if (!req) - goto drop; - - af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; - - tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = af_ops->mss_clamp; - tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); - - if (want_cookie && !tmp_opt.saw_tstamp) - tcp_clear_options(&tmp_opt); - - tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb, sk); - - af_ops->init_req(req, sk, skb); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; + return tcp_conn_request(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, skb); - if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); - - if (want_cookie) { - isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { - bool strict; - - dst = af_ops->route_req(sk, (struct flowi *)&fl4, req, - &strict); - if (dst && strict && - !tcp_peer_is_proven(req, dst, true)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } - /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false)) { - /* Without syncookies last quarter of - * backlog is filled with destinations, - * proven to be alive. - * It means that we continue to communicate - * to destinations, already remembered - * to the moment of synflood. - */ - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), - &saddr, ntohs(tcp_hdr(skb)->source)); - goto drop_and_release; - } - - isn = af_ops->init_seq(skb); - } - if (!dst) { - dst = af_ops->route_req(sk, (struct flowi *)&fl4, req, NULL); - if (!dst) - goto drop_and_free; - } - - tcp_rsk(req)->snt_isn = isn; - tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(sk, dst, NULL, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - - tcp_rsk(req)->listener = NULL; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } - - return 0; - -drop_and_release: - dst_release(dst); -drop_and_free: - reqsk_free(req); drop: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; -- cgit From c1878869c0c8e0def3df5397155f369442ce4e06 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 28 Jun 2014 18:39:01 +0200 Subject: netfilter: fix several Kconfig problems in NF_LOG_* warning: (NETFILTER_XT_TARGET_LOG) selects NF_LOG_IPV6 which has unmet direct dependencies (NET && INET && IPV6 && NETFILTER && IP6_NF_IPTABLES && NETFILTER_ADVANCED) warning: (NF_LOG_IPV4 && NF_LOG_IPV6) selects NF_LOG_COMMON which has unmet direct dependencies (NET && INET && NETFILTER && NF_CONNTRACK) Fixes: 83e96d4 ("netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files") Reported-by: Fengguang Wu Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 12e6057daea1..fb173126f03d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,6 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. +config NF_LOG_ARP + tristate "ARP packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + +config NF_LOG_IPV4 + tristate "IPv4 packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + config NF_TABLES_IPV4 depends on NF_TABLES tristate "IPv4 nf_tables support" @@ -159,16 +169,6 @@ config IP_NF_TARGET_SYNPROXY To compile it as a module, choose M here. If unsure, say N. -config NF_LOG_ARP - tristate "ARP packet logging" - default m if NETFILTER_ADVANCED=n - select NF_LOG_INET - -config NF_LOG_IPV4 - tristate "IPv4 packet logging" - default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON - # NAT + specific targets: nf_conntrack config NF_NAT_IPV4 tristate "IPv4 NAT" -- cgit From 1759389e8af46d724220785bf710b7bdbebdfa48 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Sat, 28 Jun 2014 14:12:44 +0200 Subject: xfrm4: Remove duplicate semicolon 3328715e6c1fc (xfrm4: Add IPsec protocol multiplexer) adds a duplicate semicolon after the return-statement. Although it has no negative impact, the second semicolon should be removed. Cc: Steffen Klassert Cc: Herbert Xu Signed-off-by: Christoph Paasch Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index a2ce0101eaac..dccefa9d84cf 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -124,7 +124,7 @@ static int xfrm4_ah_rcv(struct sk_buff *skb) for_each_protocol_rcu(ah4_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) - return ret;; + return ret; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); -- cgit From 4135ab8208048a1586385fae1298726d7dacfc26 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Sat, 28 Jun 2014 21:20:54 +0300 Subject: tcp: tcp_conn_request: fix build error when IPv6 is disabled Fixes build error introduced by commit 1fb6f159fd21c64 (tcp: add tcp_conn_request): net/ipv4/tcp_input.c: In function 'pr_drop_req': net/ipv4/tcp_input.c:5889:130: error: 'struct sock_common' has no member named 'skc_v6_daddr' Reported-by: kbuild test robot Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 97e48d60c4e8..bb684967f7a7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5885,9 +5885,11 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) if (family == AF_INET) LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), &ireq->ir_rmt_addr, port); - else +#if IS_ENABLED(CONFIG_IPV6) + else if (family == AF_INET6) LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), &ireq->ir_v6_rmt_addr, port); +#endif } int tcp_conn_request(struct request_sock_ops *rsk_ops, -- cgit From 24de3d377539e384621c5b8f8f8d8d01852dddc8 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Mon, 30 Jun 2014 09:19:32 +0800 Subject: netfilter: use IS_ENABLED() macro replace: #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) with #if IS_ENABLED(CONFIG_NF_CT_NETLINK) replace: #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) with #if !IS_ENABLED(CONFIG_NF_NAT) replace: #if !defined(CONFIG_NF_CONNTRACK) && !defined(CONFIG_NF_CONNTRACK_MODULE) with #if !IS_ENABLED(CONFIG_NF_CONNTRACK) And add missing: IS_ENABLED(CONFIG_NF_CT_NETLINK) in net/ipv{4,6}/netfilter/nf_nat_l3proto_ipv{4,6}.c Signed-off-by: Duan Jiong Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 4 ++-- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 4 ++-- net/ipv4/netfilter/nf_defrag_ipv4.c | 8 ++++---- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 4 ++++ net/ipv4/netfilter/nf_nat_proto_gre.c | 2 +- net/ipv4/netfilter/nf_nat_proto_icmp.c | 2 +- 6 files changed, 14 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8127dc802865..4ce44c4bc57b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -314,7 +314,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include #include @@ -388,7 +388,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .invert_tuple = ipv4_invert_tuple, .print_tuple = ipv4_print_tuple, .get_l4proto = ipv4_get_l4proto, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv4_tuple_to_nlattr, .nlattr_tuple_size = ipv4_nlattr_tuple_size, .nlattr_to_tuple = ipv4_nlattr_to_tuple, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a338dad41b7d..b91b2641adda 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -226,7 +226,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include #include @@ -408,7 +408,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .error = icmp_error, .destroy = NULL, .me = NULL, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, .nlattr_to_tuple = icmp_nlattr_to_tuple, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index b8f6381c7d0b..76bd1aef257f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -17,7 +17,7 @@ #include #include #include -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif #include @@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, { u16 zone = NF_CT_DEFAULT_ZONE; -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif @@ -74,8 +74,8 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, inet->nodefrag) return NF_ACCEPT; -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#if !IS_ENABLED(CONFIG_NF_NAT) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index d8b2e14efddc..14f5ccd06337 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -154,6 +154,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, htons(oldlen), htons(datalen), 1); } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { @@ -169,6 +170,7 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], return 0; } +#endif static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .l3proto = NFPROTO_IPV4, @@ -177,7 +179,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .manip_pkt = nf_nat_ipv4_manip_pkt, .csum_update = nf_nat_ipv4_csum_update, .csum_recalc = nf_nat_ipv4_csum_recalc, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_ipv4_nlattr_to_range, +#endif #ifdef CONFIG_XFRM .decode_session = nf_nat_ipv4_decode_session, #endif diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 690d890111bb..9414923f1e15 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -124,7 +124,7 @@ static const struct nf_nat_l4proto gre = { .manip_pkt = gre_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = gre_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index eb303471bcf6..4557b4ab8342 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -77,7 +77,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = { .manip_pkt = icmp_manip_pkt, .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; -- cgit From 9fe516ba3fb29b6f6a752ffd93342fdee500ec01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Jun 2014 08:36:16 -0700 Subject: inet: move ipv6only in sock_common When an UDP application switches from AF_INET to AF_INET6 sockets, we have a small performance degradation for IPv4 communications because of extra cache line misses to access ipv6only information. This can also be noticed for TCP listeners, as ipv6_only_sock() is also used from __inet_lookup_listener()->compute_score() This is magnified when SO_REUSEPORT is used. Move ipv6only into struct sock_common so that it is available at no extra cost in lookups. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e68e0d4af6c9..1649988bd1b6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -298,7 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = np->flow_label >> 12; - tw->tw_ipv6only = np->ipv6only; + tw->tw_ipv6only = sk->sk_ipv6only; } #endif -- cgit From 86c6a2c75ab97fe31844985169e26aea335432f9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 30 Jun 2014 15:09:49 -0400 Subject: tcp: switch snt_synack back to measuring transmit time of first SYNACK Always store in snt_synack the time at which the server received the first client SYN and attempted to send the first SYNACK. Recent commit aa27fc501 ("tcp: tcp_v[46]_conn_request: fix snt_synack initialization") resolved an inconsistency between IPv4 and IPv6 in the initialization of snt_synack. This commit brings back the idea from 843f4a55e (tcp: use tcp_v4_send_synack on first SYN-ACK), which was going for the original behavior of snt_synack from the commit where it was added in 9ad7c049f0f79 ("tcp: RFC2988bis + taking RTT sample from 3WHS for the passive open side") in v3.1. In addition to being simpler (and probably a tiny bit faster), unconditionally storing the time of the first SYNACK attempt has been useful because it allows calculating a performance metric quantifying how long it took to establish a passive TCP connection. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Cc: Octavian Purdila Cc: Jerry Chu Acked-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5dfebd2f2e38..52d0f6a1ec2c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -838,8 +838,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, ireq->ir_rmt_addr, ireq->opt); err = net_xmit_eval(err); - if (!tcp_rsk(req)->snt_synack && !err) - tcp_rsk(req)->snt_synack = tcp_time_stamp; } return err; -- cgit From b73c3d0e4f0e1961e15bec18720e48aabebe2109 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:17 -0700 Subject: net: Save TX flow hash in sock and set in skbuf on xmit For a connected socket we can precompute the flow hash for setting in skb->hash on output. This is a performance advantage over calculating the skb->hash for every packet on the connection. The computation is done using the common hash algorithm to be consistent with computations done for packets of the connection in other states where thers is no socket (e.g. time-wait, syn-recv, syn-cookies). This patch adds sk_txhash to the sock structure. inet_set_txhash and ip6_set_txhash functions are added which are called from points in TCP and UDP where socket moves to established state. skb_set_hash_from_sk is a function which sets skb->hash from the sock txhash value. This is called in UDP and TCP transmit path when transmitting within the context of a socket. Tested: ran super_netperf with 200 TCP_RR streams over a vxlan interface (in this case skb_get_hash called on every TX packet to create a UDP source port). Before fix: 95.02% CPU utilization 154/256/505 90/95/99% latencies 1.13042e+06 tps Time in functions: 0.28% skb_flow_dissect 0.21% __skb_get_hash After fix: 94.95% CPU utilization 156/254/485 90/95/99% latencies 1.15447e+06 Neither __skb_get_hash nor skb_flow_dissect appear in perf Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/datagram.c | 1 + net/ipv4/tcp_ipv4.c | 3 +++ net/ipv4/tcp_output.c | 1 + 3 files changed, 5 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index a3095fdefbed..90c0e8386116 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; + inet_set_txhash(sk); inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 52d0f6a1ec2c..1edc739b9da5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -208,6 +208,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; + inet_set_txhash(sk); + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; @@ -1334,6 +1336,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; + inet_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f8f2a944a1ce..bcee13c4627c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -916,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_orphan(skb); skb->sk = sk; skb->destructor = tcp_wfree; + skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ -- cgit From 4f6ad60cf357e1663d84de052af68dfccd734ec7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 2 Jul 2014 22:22:20 +0200 Subject: ipconfig: add static to local variable ic_dev_xid is only used in ipconfig.c Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: netdev@vger.kernel.org Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b3e86ea7b71b..f0f3f9f77b30 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -143,7 +143,7 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ -__be32 ic_dev_xid; /* Device under configuration */ +static __be32 ic_dev_xid; /* Device under configuration */ /* vendor class identifier */ static char vendor_class_identifier[253] __initdata; -- cgit From 21621e93f24b5f8de8262ace269a5f692a826bed Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 9 Jul 2014 20:35:21 +0200 Subject: ipconfig: move ic_dev_xid under IPCONFIG_BOOTP ic_dev_xid is only used in __init ic_bootp_recv under IPCONFIG_BOOTP and __init ic_dynamic under IPCONFIG_DYNAMIC(which is itself defined with the same IPCONFIG_BOOTP) This patch fixes the following warning when IPCONFIG_BOOTP is not set: >> net/ipv4/ipconfig.c:146:15: warning: 'ic_dev_xid' defined but not used [-Wunused-variable] static __be32 ic_dev_xid; /* Device under configuration */ Reported-by: Fengguang Wu Cc: Fengguang Wu Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: netdev@vger.kernel.org Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f0f3f9f77b30..f02f594d04c5 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -143,8 +143,6 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ -static __be32 ic_dev_xid; /* Device under configuration */ - /* vendor class identifier */ static char vendor_class_identifier[253] __initdata; @@ -654,6 +652,7 @@ static struct packet_type bootp_packet_type __initdata = { .func = ic_bootp_recv, }; +static __be32 ic_dev_xid; /* Device under configuration */ /* * Initialize DHCP/BOOTP extension fields in the request. -- cgit From 405fd707196d596f59a510d176dedc979c1ae34b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 9 Jul 2014 22:25:18 -0700 Subject: ipconfig: Only bootp paths should reference ic_dev_xid. It is only tested, and declared, in the bootp code. So, in ic_dynamic() guard it's setting with IPCONFIG_BOOTP. Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f02f594d04c5..5bbef4fdcb43 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1217,10 +1217,10 @@ static int __init ic_dynamic(void) get_random_bytes(&timeout, sizeof(timeout)); timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); for (;;) { +#ifdef IPCONFIG_BOOTP /* Track the device we are configuring */ ic_dev_xid = d->xid; -#ifdef IPCONFIG_BOOTP if (do_bootp && (d->able & IC_BOOTP)) ic_bootp_send_if(d, jiffies - start_jiffies); #endif -- cgit From a2f983f83b0f66a74a045d36eb84e2f01bb950c7 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 11 Jul 2014 14:32:17 +0800 Subject: ipv4: remove the unnecessary variable in udp_mcast_next Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv4/udp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d92f94b7e402..ac30e106a884 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -600,19 +600,18 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, int dif) { struct hlist_nulls_node *node; - struct sock *s = sk; unsigned short hnum = ntohs(loc_port); - sk_nulls_for_each_from(s, node) { - if (__udp_is_mcast_sock(net, s, + sk_nulls_for_each_from(sk, node) { + if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, rmt_port, rmt_addr, dif, hnum)) goto found; } - s = NULL; + sk = NULL; found: - return s; + return sk; } /* -- cgit From 8024e02879ddd5042be02c70557f74cdc70b44b4 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 13 Jul 2014 19:49:37 -0700 Subject: udp: Add udp_sock_create for UDP tunnels to open listener socket Added udp_tunnel.c which can contain some common functions for UDP tunnels. The first function in this is udp_sock_create which is used to open the listener port for a UDP tunnel. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 4 ++ net/ipv4/Makefile | 1 + net/ipv4/udp_tunnel.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 net/ipv4/udp_tunnel.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 05c57f0fcabe..dbc10d84161f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -307,6 +307,10 @@ config NET_IPVTI the notion of a secure tunnel for IPSEC and then use routing protocol on top. +config NET_UDP_TUNNEL + tristate + default n + config INET_AH tristate "IP: AH transformation" select XFRM_ALGO diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f032688d20d3..8ee1cd4053ee 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c new file mode 100644 index 000000000000..61ec1a65207e --- /dev/null +++ b/net/ipv4/udp_tunnel.c @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + int err = -EINVAL; + struct socket *sock = NULL; + +#if IS_ENABLED(CONFIG_IPV6) + if (cfg->family == AF_INET6) { + struct sockaddr_in6 udp6_addr; + + err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->peer_udp_port; + err = kernel_connect(sock, + (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr), 0); + } + if (err < 0) + goto error; + + udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); + udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); + } else +#endif + if (cfg->family == AF_INET) { + struct sockaddr_in udp_addr; + + err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->local_ip; + udp_addr.sin_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->peer_ip; + udp_addr.sin_port = cfg->peer_udp_port; + err = kernel_connect(sock, + (struct sockaddr *)&udp_addr, + sizeof(udp_addr), 0); + if (err < 0) + goto error; + } + + sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; + } else { + return -EPFNOSUPPORT; + } + + + *sockp = sock; + + return 0; + +error: + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); + } + *sockp = NULL; + return err; +} +EXPORT_SYMBOL(udp_sock_create); + +MODULE_LICENSE("GPL"); -- cgit From 155e010edbc168f43bb332887c43e8579ee3894a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 13 Jul 2014 19:49:56 -0700 Subject: udp: Move udp_tunnel_segment into udp_offload.c Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/udp.c | 76 -------------------------------------------------- net/ipv4/udp_offload.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 76 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ac30e106a884..f6dfe525584f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2522,79 +2522,3 @@ void __init udp_init(void) sysctl_udp_rmem_min = SK_MEM_QUANTUM; sysctl_udp_wmem_min = SK_MEM_QUANTUM; } - -struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; - int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - __be16 protocol = skb->protocol; - netdev_features_t enc_features; - int udp_offset, outer_hlen; - unsigned int oldlen; - bool need_csum; - - oldlen = (u16)~skb->len; - - if (unlikely(!pskb_may_pull(skb, tnl_hlen))) - goto out; - - skb->encapsulation = 0; - __skb_pull(skb, tnl_hlen); - skb_reset_mac_header(skb); - skb_set_network_header(skb, skb_inner_network_offset(skb)); - skb->mac_len = skb_inner_network_offset(skb); - skb->protocol = htons(ETH_P_TEB); - - need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); - if (need_csum) - skb->encap_hdr_csum = 1; - - /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); - segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) { - skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, - mac_len); - goto out; - } - - outer_hlen = skb_tnl_header_len(skb); - udp_offset = outer_hlen - tnl_hlen; - skb = segs; - do { - struct udphdr *uh; - int len; - - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - - skb->mac_len = mac_len; - - skb_push(skb, outer_hlen); - skb_reset_mac_header(skb); - skb_set_network_header(skb, mac_len); - skb_set_transport_header(skb, udp_offset); - len = skb->len - udp_offset; - uh = udp_hdr(skb); - uh->len = htons(len); - - if (need_csum) { - __be32 delta = htonl(oldlen + len); - - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); - uh->check = gso_make_checksum(skb, ~uh->check); - - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } - - skb->protocol = protocol; - } while ((skb = skb->next)); -out: - return segs; -} diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 546d2d439dda..4807544d018b 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -47,6 +47,82 @@ static int udp4_ufo_send_check(struct sk_buff *skb) return 0; } +struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; + int mac_len = skb->mac_len; + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + __be16 protocol = skb->protocol; + netdev_features_t enc_features; + int udp_offset, outer_hlen; + unsigned int oldlen; + bool need_csum; + + oldlen = (u16)~skb->len; + + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) + goto out; + + skb->encapsulation = 0; + __skb_pull(skb, tnl_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = htons(ETH_P_TEB); + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); + if (need_csum) + skb->encap_hdr_csum = 1; + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, enc_features); + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); + goto out; + } + + outer_hlen = skb_tnl_header_len(skb); + udp_offset = outer_hlen - tnl_hlen; + skb = segs; + do { + struct udphdr *uh; + int len; + + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + + skb->mac_len = mac_len; + + skb_push(skb, outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_set_transport_header(skb, udp_offset); + len = skb->len - udp_offset; + uh = udp_hdr(skb); + uh->len = htons(len); + + if (need_csum) { + __be32 delta = htonl(oldlen + len); + + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + uh->check = gso_make_checksum(skb, ~uh->check); + + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } + + skb->protocol = protocol; + } while ((skb = skb->next)); +out: + return segs; +} + static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { -- cgit From c835a677331495cf137a7f8a023463afd9f032f8 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:24 +0200 Subject: net: set name_assign_type in alloc_netdev() Extend alloc_netdev{,_mq{,s}}() to take name_assign_type as argument, and convert all users to pass NET_NAME_UNKNOWN. Coccinelle patch: @@ expression sizeof_priv, name, setup, txqs, rxqs, count; @@ ( -alloc_netdev_mqs(sizeof_priv, name, setup, txqs, rxqs) +alloc_netdev_mqs(sizeof_priv, name, NET_NAME_UNKNOWN, setup, txqs, rxqs) | -alloc_netdev_mq(sizeof_priv, name, setup, count) +alloc_netdev_mq(sizeof_priv, name, NET_NAME_UNKNOWN, setup, count) | -alloc_netdev(sizeof_priv, name, setup) +alloc_netdev(sizeof_priv, name, NET_NAME_UNKNOWN, setup) ) v9: move comments here from the wrong commit Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 2 +- net/ipv4/ipmr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 54b6731dab55..0157a7af20a8 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -301,7 +301,7 @@ static struct net_device *__ip_tunnel_create(struct net *net, } ASSERT_RTNL(); - dev = alloc_netdev(ops->priv_size, name, ops->setup); + dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); if (!dev) { err = -ENOMEM; goto failed; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 65bcaa789043..c8034587859d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -500,7 +500,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) else sprintf(name, "pimreg%u", mrt->id); - dev = alloc_netdev(0, name, reg_vif_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (dev == NULL) return NULL; -- cgit From 5ee2c941b5969eb1b5592f9731b3ee76a784641f Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 14 Jul 2014 16:58:32 +0200 Subject: tcp: Remove unnecessary arg from tcp_enter_cwr and tcp_init_cwnd_reduction Since Yuchung's 9b44190dc11 (tcp: refactor F-RTO), tcp_enter_cwr is always called with set_ssthresh = 1. Thus, we can remove this argument from tcp_enter_cwr. Further, as we remove this one, tcp_init_cwnd_reduction is then always called with set_ssthresh = true, and so we can get rid of this argument as well. Cc: Yuchung Cheng Signed-off-by: Christoph Paasch Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 +++++++-------- net/ipv4/tcp_output.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bb684967f7a7..2e16afba182c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2475,7 +2475,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) * losses and/or application stalls), do not perform any further cwnd * reductions, but instead slow start up to ssthresh. */ -static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) +static void tcp_init_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2485,8 +2485,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) tp->prior_cwnd = tp->snd_cwnd; tp->prr_delivered = 0; tp->prr_out = 0; - if (set_ssthresh) - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); } @@ -2528,14 +2527,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) } /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ -void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) +void tcp_enter_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tcp_init_cwnd_reduction(sk, set_ssthresh); + tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); } } @@ -2564,7 +2563,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) tp->retrans_stamp = 0; if (flag & FLAG_ECE) - tcp_enter_cwr(sk, 1); + tcp_enter_cwr(sk); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); @@ -2670,7 +2669,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); - tcp_init_cwnd_reduction(sk, true); + tcp_init_cwnd_reduction(sk); } tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -3346,7 +3345,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tp->tlp_high_seq = 0; /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ if (!(flag & FLAG_DSACKING_ACK)) { - tcp_init_cwnd_reduction(sk, true); + tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); tcp_try_keep_open(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bcee13c4627c..306dd5dead7d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -979,7 +979,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (likely(err <= 0)) return err; - tcp_enter_cwr(sk, 1); + tcp_enter_cwr(sk); return net_xmit_eval(err); } -- cgit From 11878b40ed5c5bc20d6a115bae156a5b90b0fb3e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 14 Jul 2014 17:55:06 -0400 Subject: net-timestamp: SOCK_RAW and PING timestamping Add SO_TIMESTAMPING to sockets of type PF_INET[6]/SOCK_RAW: Add the necessary sock_tx_timestamp calls to the datapath for RAW sockets (ping sockets already had these calls). Fix the IP output path to pass the timestamp flags on the first fragment also for these sockets. The existing code relies on transhdrlen != 0 to indicate a first fragment. For these sockets, that assumption does not hold. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=77221 Tested SOCK_RAW on IPv4 and IPv6, not PING. Signed-off-by: Willem de Bruijn Acked-by: Richard Cochran Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 7 +++---- net/ipv4/raw.c | 4 ++++ 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d3b6b0e9857..b16556836d66 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -962,10 +962,6 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; - else - /* only the initial fragment is - time stamped */ - cork->tx_flags = 0; } if (skb == NULL) goto error; @@ -976,7 +972,10 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); + + /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; /* * Find where to start putting bytes. diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2c65160565e1..2054d7136c62 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -365,6 +365,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; + sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_fromiovecend((void *)iph, from, 0, length)) @@ -606,6 +608,8 @@ back_from_confirm: &rt, msg->msg_flags); else { + sock_tx_timestamp(sk, &ipc.tx_flags); + if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); -- cgit From 5cf3d46192fccf68b4a4759e4d7346e41c669a76 Mon Sep 17 00:00:00 2001 From: David Held Date: Tue, 15 Jul 2014 23:28:31 -0400 Subject: udp: Simplify __udp*_lib_mcast_deliver. Switch to using sk_nulls_for_each which shortens the code and makes it easier to update. Signed-off-by: David Held Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 48 ++++++++++++++---------------------------------- 1 file changed, 14 insertions(+), 34 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 668af516f094..bbcc33737ef1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, return true; } -static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) -{ - struct hlist_nulls_node *node; - unsigned short hnum = ntohs(loc_port); - - sk_nulls_for_each_from(sk, node) { - if (__udp_is_mcast_sock(net, sk, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) - goto found; - } - sk = NULL; -found: - return sk; -} - /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -1667,23 +1647,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = skb->dev->ifindex; unsigned int i, count = 0; spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - stack[count++] = sk; - sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, - daddr, uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 0; + sk_nulls_for_each(sk, node, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + count = 0; + } + stack[count++] = sk; } } /* -- cgit From 2dc41cff7545d55c6294525c811594576f8e119c Mon Sep 17 00:00:00 2001 From: David Held Date: Tue, 15 Jul 2014 23:28:32 -0400 Subject: udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver. Many multicast sources can have the same port which can result in a very large list when hashing by port only. Hash by address and port instead if this is the case. This makes multicast more similar to unicast. On a 24-core machine receiving from 500 multicast sockets on the same port, before this patch 80% of system CPU was used up by spin locking and only ~25% of packets were successfully delivered. With this patch, all packets are delivered and kernel overhead is ~8% system CPU on spinlocks. Signed-off-by: David Held Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bbcc33737ef1..f31053b90ee0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1619,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count, if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); @@ -1651,10 +1653,20 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); int dif = skb->dev->ifindex; - unsigned int i, count = 0; + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + + if (use_hash2) { + hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + udp_table.mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { if (__udp_is_mcast_sock(net, sk, uh->dest, daddr, uh->source, saddr, @@ -1664,24 +1676,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, count = 0; } stack[count++] = sk; + sock_hold(sk); } } - /* - * before releasing chain lock, we must take a reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. */ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + /* * do the slow work with no lock held */ if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { kfree_skb(skb); } -- cgit From 274f482d33a309c87096f2983601ceda2761094e Mon Sep 17 00:00:00 2001 From: Sorin Dumitru Date: Tue, 22 Jul 2014 21:16:51 +0300 Subject: sock: remove skb argument from sk_rcvqueues_full It hasn't been used since commit 0fd7bac(net: relax rcvbuf limits). Signed-off-by: Sorin Dumitru Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f31053b90ee0..f57c0e4c2326 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1567,7 +1567,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; -- cgit From f5220d63991f3fcb3d19efe8af0c8f75dcf0309b Mon Sep 17 00:00:00 2001 From: Quentin Armitage Date: Wed, 23 Jul 2014 09:58:01 +0100 Subject: ipv4: Make IP_MULTICAST_ALL and IP_MSFILTER work on raw sockets Currently, although IP_MULTICAST_ALL and IP_MSFILTER ioctl calls succeed on raw sockets, there is no code to implement the functionality on received packets; it is only implemented for UDP sockets. The raw(7) man page states: "In addition, all ip(7) IPPROTO_IP socket options valid for datagram sockets are supported", which implies these ioctls should work on raw sockets. To fix this, add a call to ip_mc_sf_allow on raw sockets. This should not break any existing code, since the current position of not calling ip_mc_sf_filter makes it behave as if neither the IP_MULTICAST_ALL nor the IP_MSFILTER ioctl had been called. Adding the call to ip_mc_sf_allow will therefore maintain the current behaviour so long as IP_MULTICAST_ALL and IP_MSFILTER ioctls are not called. Any code that currently is calling IP_MULTICAST_ALL or IP_MSFILTER ioctls on raw sockets presumably is wanting the filter to be applied, although no filtering will currently be occurring. Signed-off-by: Quentin Armitage Signed-off-by: David S. Miller --- net/ipv4/raw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2054d7136c62..739db3100c23 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -174,7 +175,9 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) while (sk) { delivered = 1; - if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { + if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && + ip_mc_sf_allow(sk, iph->daddr, iph->saddr, + skb->dev->ifindex)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ -- cgit From 179542a548f37e1366f7afb7a088c965a2adf7d0 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Fri, 25 Jul 2014 01:48:44 +0530 Subject: igmp: remove exceptional & on function name In this file, function names are otherwise used as pointers without &. A simplified version of the Coccinelle semantic patch that makes this change is as follows: // @r@ identifier f; @@ f(...) { ... } @@ identifier r.f; @@ - &f + f // Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index db710b059bab..f10eab462282 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1321,7 +1321,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) atomic_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST - setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); + setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); im->unsolicit_count = IGMP_Unsolicited_Report_Count; #endif -- cgit From 5bd3a76f4ba37108101c2ae93172571a70505982 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Fri, 25 Jul 2014 01:47:16 +0530 Subject: netfilter: nf_conntrack: remove exceptional & on function name In this file, function names are otherwise used as pointers without &. A simplified version of the Coccinelle semantic patch that makes this change is as follows: // @r@ identifier f; @@ f(...) { ... } @@ identifier r.f; @@ - &f + f // Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 4ce44c4bc57b..a054fe083431 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -358,7 +358,7 @@ static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, .get_optmax = SO_ORIGINAL_DST+1, - .get = &getorigdst, + .get = getorigdst, .owner = THIS_MODULE, }; -- cgit From d4da843e6fad4f278fe82b075d8e394cff05c95c Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Fri, 25 Jul 2014 14:25:31 +0200 Subject: netfilter: kill remnants of ulog targets The ulog targets were recently killed. A few references to the Kconfig macros CONFIG_IP_NF_TARGET_ULOG and CONFIG_BRIDGE_EBT_ULOG were left untouched. Kill these too. Signed-off-by: Paul Bolle Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 245db9df3337..33001621465b 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o -obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o -- cgit From 36c7778218b93d96d88d68f116a711f6a598b72f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:29 +0200 Subject: inet: frag: constify match, hashfn and constructor arguments Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index ed32313e307c..586e6aaf9e6e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -112,18 +112,18 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); } -static unsigned int ip4_hashfn(struct inet_frag_queue *q) +static unsigned int ip4_hashfn(const struct inet_frag_queue *q) { - struct ipq *ipq; + const struct ipq *ipq; ipq = container_of(q, struct ipq, q); return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static bool ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) { - struct ipq *qp; - struct ip4_create_arg *arg = a; + const struct ipq *qp; + const struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); return qp->id == arg->iph->id && @@ -133,14 +133,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a) qp->user == arg->user; } -static void ip4_frag_init(struct inet_frag_queue *q, void *a) +static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags); struct net *net = container_of(ipv4, struct net, ipv4); - struct ip4_create_arg *arg = a; + const struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; qp->id = arg->iph->id; -- cgit From fb3cfe6e75b9d05c87265e85e67d7caf6e5b44a7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:30 +0200 Subject: inet: frag: remove hash size assumptions from callers hide actual hash size from individual users: The _find function will now fold the given hash value into the required range. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 13 ++++++++++--- net/ipv4/ip_fragment.c | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 3b01959bf4bb..930d23870811 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -46,6 +46,12 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); +static unsigned int +inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) +{ + return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); +} + static void inet_frag_secret_rebuild(unsigned long dummy) { struct inet_frags *f = (struct inet_frags *)dummy; @@ -63,7 +69,7 @@ static void inet_frag_secret_rebuild(unsigned long dummy) hb = &f->hash[i]; hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = f->hashfn(q); + unsigned int hval = inet_frag_hashfn(f, q); if (hval != i) { struct inet_frag_bucket *hb_dest; @@ -133,7 +139,7 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) unsigned int hash; read_lock(&f->lock); - hash = f->hashfn(fq); + hash = inet_frag_hashfn(f, fq); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); @@ -252,7 +258,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, * the rnd seed, so we need to re-calculate the hash * chain. Fortunatelly the qp_in can be used to get one. */ - hash = f->hashfn(qp_in); + hash = inet_frag_hashfn(f, qp_in); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); @@ -326,6 +332,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frag_queue *q; int depth = 0; + hash &= (INETFRAGS_HASHSZ - 1); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 586e6aaf9e6e..b769eb6c83c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -109,7 +109,7 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); + ip4_frags.rnd); } static unsigned int ip4_hashfn(const struct inet_frag_queue *q) -- cgit From 86e93e470cadedda9181a2bd9aee1d9d2e5e9c0f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:31 +0200 Subject: inet: frag: move evictor calls into frag_find function First step to move eviction handling into a work queue. We lose two spots that accounted evicted fragments in MIB counters. Accounting will be restored since the upcoming work-queue evictor invokes the frag queue timer callbacks instead. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 16 +++++++++------- net/ipv4/ip_fragment.c | 15 --------------- 2 files changed, 9 insertions(+), 22 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 930d23870811..535636017534 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -46,6 +46,8 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); +static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force); + static unsigned int inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) { @@ -203,16 +205,11 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, } EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) +static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) { struct inet_frag_queue *q; int work, evicted = 0; - if (!force) { - if (frag_mem_limit(nf) <= nf->high_thresh) - return 0; - } - work = frag_mem_limit(nf) - nf->low_thresh; while (work > 0 || force) { spin_lock(&nf->lru_lock); @@ -242,7 +239,6 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) return evicted; } -EXPORT_SYMBOL(inet_frag_evictor); static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, @@ -296,6 +292,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; + if (frag_mem_limit(nf) > nf->high_thresh) + return NULL; + q = kzalloc(f->qsize, GFP_ATOMIC); if (q == NULL) return NULL; @@ -332,6 +331,9 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frag_queue *q; int depth = 0; + if (frag_mem_limit(nf) > nf->high_thresh) + inet_frag_evictor(nf, f, false); + hash &= (INETFRAGS_HASHSZ - 1); hb = &f->hash[hash]; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b769eb6c83c0..54988672d00d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -177,18 +177,6 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } -/* Memory limiting on fragments. Evictor trashes the oldest - * fragment queue until we are back under the threshold. - */ -static void ip_evictor(struct net *net) -{ - int evicted; - - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); - if (evicted) - IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); -} - /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ @@ -655,9 +643,6 @@ int ip_defrag(struct sk_buff *skb, u32 user) net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); - /* Start by cleaning up the memory. */ - ip_evictor(net); - /* Lookup (or create) queue header */ if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { int ret; -- cgit From b13d3cbfb8e8a8f53930af67d1ebf05149f32c24 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:32 +0200 Subject: inet: frag: move eviction of queues to work queue When the high_thresh limit is reached we try to toss the 'oldest' incomplete fragment queues until memory limits are below the low_thresh value. This happens in softirq/packet processing context. This has two drawbacks: 1) processors might evict a queue that was about to be completed by another cpu, because they will compete wrt. resource usage and resource reclaim. 2) LRU list maintenance is expensive. But when constantly overloaded, even the 'least recently used' element is recent, so removing 'lru' queue first is not 'fairer' than removing any other fragment queue. This moves eviction out of the fast path: When the low threshold is reached, a work queue is scheduled which then iterates over the table and removes the queues that exceed the memory limits of the namespace. It sets a new flag called INET_FRAG_EVICTED on the evicted queues so the proper counters will get incremented when the queue is forcefully expired. When the high threshold is reached, no more fragment queues are created until we're below the limit again. The LRU list is now unused and will be removed in a followup patch. Joint work with Nikolay Aleksandrov. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 142 +++++++++++++++++++++++++++++++++-------------- net/ipv4/ip_fragment.c | 3 +- 2 files changed, 101 insertions(+), 44 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 535636017534..43315ecb9400 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,6 +25,9 @@ #include #include +#define INETFRAGS_EVICT_BUCKETS 128 +#define INETFRAGS_EVICT_MAX 512 + /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -46,8 +49,6 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force); - static unsigned int inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) { @@ -89,10 +90,92 @@ static void inet_frag_secret_rebuild(unsigned long dummy) mod_timer(&f->secret_timer, now + f->secret_interval); } +static bool inet_fragq_should_evict(const struct inet_frag_queue *q) +{ + return q->net->low_thresh == 0 || + frag_mem_limit(q->net) >= q->net->low_thresh; +} + +static unsigned int +inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) +{ + struct inet_frag_queue *fq; + struct hlist_node *n; + unsigned int evicted = 0; + HLIST_HEAD(expired); + +evict_again: + spin_lock(&hb->chain_lock); + + hlist_for_each_entry_safe(fq, n, &hb->chain, list) { + if (!inet_fragq_should_evict(fq)) + continue; + + if (!del_timer(&fq->timer)) { + /* q expiring right now thus increment its refcount so + * it won't be freed under us and wait until the timer + * has finished executing then destroy it + */ + atomic_inc(&fq->refcnt); + spin_unlock(&hb->chain_lock); + del_timer_sync(&fq->timer); + WARN_ON(atomic_read(&fq->refcnt) != 1); + inet_frag_put(fq, f); + goto evict_again; + } + + /* suppress xmit of (icmp) error packet */ + fq->last_in &= ~INET_FRAG_FIRST_IN; + fq->last_in |= INET_FRAG_EVICTED; + hlist_del(&fq->list); + hlist_add_head(&fq->list, &expired); + ++evicted; + } + + spin_unlock(&hb->chain_lock); + + hlist_for_each_entry_safe(fq, n, &expired, list) + f->frag_expire((unsigned long) fq); + + return evicted; +} + +static void inet_frag_worker(struct work_struct *work) +{ + unsigned int budget = INETFRAGS_EVICT_BUCKETS; + unsigned int i, evicted = 0; + struct inet_frags *f; + + f = container_of(work, struct inet_frags, frags_work); + + BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); + + read_lock_bh(&f->lock); + + for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + evicted += inet_evict_bucket(f, &f->hash[i]); + i = (i + 1) & (INETFRAGS_HASHSZ - 1); + if (evicted > INETFRAGS_EVICT_MAX) + break; + } + + f->next_bucket = i; + + read_unlock_bh(&f->lock); +} + +static void inet_frag_schedule_worker(struct inet_frags *f) +{ + if (unlikely(!work_pending(&f->frags_work))) + schedule_work(&f->frags_work); +} + void inet_frags_init(struct inet_frags *f) { int i; + INIT_WORK(&f->frags_work, inet_frag_worker); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb = &f->hash[i]; @@ -120,16 +203,22 @@ EXPORT_SYMBOL(inet_frags_init_net); void inet_frags_fini(struct inet_frags *f) { del_timer(&f->secret_timer); + cancel_work_sync(&f->frags_work); } EXPORT_SYMBOL(inet_frags_fini); void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) { + int i; + nf->low_thresh = 0; - local_bh_disable(); - inet_frag_evictor(nf, f, true); - local_bh_enable(); + read_lock_bh(&f->lock); + + for (i = 0; i < INETFRAGS_HASHSZ ; i++) + inet_evict_bucket(f, &f->hash[i]); + + read_unlock_bh(&f->lock); percpu_counter_destroy(&nf->mem); } @@ -205,41 +294,6 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, } EXPORT_SYMBOL(inet_frag_destroy); -static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) -{ - struct inet_frag_queue *q; - int work, evicted = 0; - - work = frag_mem_limit(nf) - nf->low_thresh; - while (work > 0 || force) { - spin_lock(&nf->lru_lock); - - if (list_empty(&nf->lru_list)) { - spin_unlock(&nf->lru_lock); - break; - } - - q = list_first_entry(&nf->lru_list, - struct inet_frag_queue, lru_list); - atomic_inc(&q->refcnt); - /* Remove q from list to avoid several CPUs grabbing it */ - list_del_init(&q->lru_list); - - spin_unlock(&nf->lru_lock); - - spin_lock(&q->lock); - if (!(q->last_in & INET_FRAG_COMPLETE)) - inet_frag_kill(q, f); - spin_unlock(&q->lock); - - if (atomic_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f, &work); - evicted++; - } - - return evicted; -} - static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, void *arg) @@ -292,8 +346,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; - if (frag_mem_limit(nf) > nf->high_thresh) + if (frag_mem_limit(nf) > nf->high_thresh) { + inet_frag_schedule_worker(f); return NULL; + } q = kzalloc(f->qsize, GFP_ATOMIC); if (q == NULL) @@ -331,8 +387,8 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frag_queue *q; int depth = 0; - if (frag_mem_limit(nf) > nf->high_thresh) - inet_frag_evictor(nf, f, false); + if (frag_mem_limit(nf) > nf->low_thresh) + inet_frag_schedule_worker(f); hash &= (INETFRAGS_HASHSZ - 1); hb = &f->hash[hash]; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 54988672d00d..54bd170c5eb4 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -195,7 +195,8 @@ static void ip_expire(unsigned long arg) ipq_kill(qp); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + if (!(qp->q.last_in & INET_FRAG_EVICTED)) + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { -- cgit From 434d305405ab86414f6ea3f261307d443a2c3506 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:33 +0200 Subject: inet: frag: don't account number of fragment queues The 'nqueues' counter is protected by the lru list lock, once thats removed this needs to be converted to atomic counter. Given this isn't used for anything except for reporting it to userspace via /proc, just remove it. We still report the memory currently used by fragment reassembly queues. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 1 - net/ipv4/ip_fragment.c | 5 ----- net/ipv4/proc.c | 5 +++-- 3 files changed, 3 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 43315ecb9400..231ca0b40811 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -193,7 +193,6 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { - nf->nqueues = 0; init_frag_mem_limit(nf); INIT_LIST_HEAD(&nf->lru_list); spin_lock_init(&nf->lru_lock); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 54bd170c5eb4..1f42c2e3966b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -86,11 +86,6 @@ static inline u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_nqueues(struct net *net) -{ - return net->ipv4.frags.nqueues; -} - int ip_frag_mem(struct net *net) { return sum_frag_mem_limit(&net->ipv4.frags); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ae0af9386f7c..8e3eb39f84e7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -52,6 +52,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + unsigned int frag_mem; int orphans, sockets; local_bh_disable(); @@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - seq_printf(seq, "FRAG: inuse %d memory %d\n", - ip_frag_nqueues(net), ip_frag_mem(net)); + frag_mem = ip_frag_mem(net); + seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); return 0; } -- cgit From 3fd588eb90bfbba17091381006ecafe29c45db4a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:34 +0200 Subject: inet: frag: remove lru list no longer used. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 12 ++---------- net/ipv4/ip_fragment.c | 1 - 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 231ca0b40811..198a5ed7a815 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -194,8 +194,6 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { init_frag_mem_limit(nf); - INIT_LIST_HEAD(&nf->lru_list); - spin_lock_init(&nf->lru_lock); } EXPORT_SYMBOL(inet_frags_init_net); @@ -237,7 +235,6 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) spin_unlock(&hb->chain_lock); read_unlock(&f->lock); - inet_frag_lru_del(fq); } void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -261,8 +258,7 @@ static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, kfree_skb(skb); } -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, - int *work) +void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) { struct sk_buff *fp; struct netns_frags *nf; @@ -282,14 +278,11 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, fp = xp; } sum = sum_truesize + f->qsize; - if (work) - *work -= sum; sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); kfree(q); - } EXPORT_SYMBOL(inet_frag_destroy); @@ -333,7 +326,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &hb->chain); - inet_frag_lru_add(nf, qp); + spin_unlock(&hb->chain_lock); read_unlock(&f->lock); @@ -361,7 +354,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); - INIT_LIST_HEAD(&q->lru_list); return q; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1f42c2e3966b..8fbeee495037 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -489,7 +489,6 @@ found: } skb_dst_drop(skb); - inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: -- cgit From e3a57d18b06179d68fcf7a0a06ad844493c65e06 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:35 +0200 Subject: inet: frag: remove periodic secret rebuild timer merge functionality into the eviction workqueue. Instead of rebuilding every n seconds, take advantage of the upper hash chain length limit. If we hit it, mark table for rebuild and schedule workqueue. To prevent frequent rebuilds when we're completely overloaded, don't rebuild more than once every 5 seconds. ipfrag_secret_interval sysctl is now obsolete and has been marked as deprecated, it still can be changed so scripts won't be broken but it won't have any effect. A comment is left above each unused secret_timer variable to avoid confusion. Joint work with Nikolay Aleksandrov. Signed-off-by: Florian Westphal Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 43 +++++++++++++++++++++++++++++-------------- net/ipv4/ip_fragment.c | 5 +++-- 2 files changed, 32 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 198a5ed7a815..58d4c38534f6 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -28,6 +28,9 @@ #define INETFRAGS_EVICT_BUCKETS 128 #define INETFRAGS_EVICT_MAX 512 +/* don't rebuild inetfrag table with new secret more often than this */ +#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) + /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -55,16 +58,24 @@ inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); } -static void inet_frag_secret_rebuild(unsigned long dummy) +static bool inet_frag_may_rebuild(struct inet_frags *f) +{ + return time_after(jiffies, + f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); +} + +static void inet_frag_secret_rebuild(struct inet_frags *f) { - struct inet_frags *f = (struct inet_frags *)dummy; - unsigned long now = jiffies; int i; /* Per bucket lock NOT needed here, due to write lock protection */ - write_lock(&f->lock); + write_lock_bh(&f->lock); + + if (!inet_frag_may_rebuild(f)) + goto out; get_random_bytes(&f->rnd, sizeof(u32)); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; @@ -85,9 +96,11 @@ static void inet_frag_secret_rebuild(unsigned long dummy) } } } - write_unlock(&f->lock); - mod_timer(&f->secret_timer, now + f->secret_interval); + f->rebuild = false; + f->last_rebuild_jiffies = jiffies; +out: + write_unlock_bh(&f->lock); } static bool inet_fragq_should_evict(const struct inet_frag_queue *q) @@ -162,6 +175,8 @@ static void inet_frag_worker(struct work_struct *work) f->next_bucket = i; read_unlock_bh(&f->lock); + if (f->rebuild && inet_frag_may_rebuild(f)) + inet_frag_secret_rebuild(f); } static void inet_frag_schedule_worker(struct inet_frags *f) @@ -183,11 +198,7 @@ void inet_frags_init(struct inet_frags *f) INIT_HLIST_HEAD(&hb->chain); } rwlock_init(&f->lock); - - setup_timer(&f->secret_timer, inet_frag_secret_rebuild, - (unsigned long)f); - f->secret_timer.expires = jiffies + f->secret_interval; - add_timer(&f->secret_timer); + f->last_rebuild_jiffies = 0; } EXPORT_SYMBOL(inet_frags_init); @@ -199,7 +210,6 @@ EXPORT_SYMBOL(inet_frags_init_net); void inet_frags_fini(struct inet_frags *f) { - del_timer(&f->secret_timer); cancel_work_sync(&f->frags_work); } EXPORT_SYMBOL(inet_frags_fini); @@ -399,8 +409,13 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, if (depth <= INETFRAGS_MAXDEPTH) return inet_frag_create(nf, f, key); - else - return ERR_PTR(-ENOBUFS); + + if (inet_frag_may_rebuild(f)) { + f->rebuild = true; + inet_frag_schedule_worker(f); + } + + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8fbeee495037..44e591a7e03f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -720,10 +720,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", - .data = &ip4_frags.secret_interval, + .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -853,6 +855,5 @@ void __init ipfrag_init(void) ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; - ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); } -- cgit From ab1c724f633080ed2e8a0cfe61654599b55cf8f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:36 +0200 Subject: inet: frag: use seqlock for hash rebuild rehash is rare operation, don't force readers to take the read-side rwlock. Instead, we only have to detect the (rare) case where the secret was altered while we are trying to insert a new inetfrag queue into the table. If it was changed, drop the bucket lock and recompute the hash to get the 'new' chain bucket that we have to insert into. Joint work with Nikolay Aleksandrov. Signed-off-by: Florian Westphal Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 88 ++++++++++++++++++++++++++++++------------------ net/ipv4/ip_fragment.c | 1 - 2 files changed, 55 insertions(+), 34 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 58d4c38534f6..62b1f73749dc 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -68,8 +68,7 @@ static void inet_frag_secret_rebuild(struct inet_frags *f) { int i; - /* Per bucket lock NOT needed here, due to write lock protection */ - write_lock_bh(&f->lock); + write_seqlock_bh(&f->rnd_seqlock); if (!inet_frag_may_rebuild(f)) goto out; @@ -82,6 +81,8 @@ static void inet_frag_secret_rebuild(struct inet_frags *f) struct hlist_node *n; hb = &f->hash[i]; + spin_lock(&hb->chain_lock); + hlist_for_each_entry_safe(q, n, &hb->chain, list) { unsigned int hval = inet_frag_hashfn(f, q); @@ -92,15 +93,28 @@ static void inet_frag_secret_rebuild(struct inet_frags *f) /* Relink to new hash chain. */ hb_dest = &f->hash[hval]; + + /* This is the only place where we take + * another chain_lock while already holding + * one. As this will not run concurrently, + * we cannot deadlock on hb_dest lock below, if its + * already locked it will be released soon since + * other caller cannot be waiting for hb lock + * that we've taken above. + */ + spin_lock_nested(&hb_dest->chain_lock, + SINGLE_DEPTH_NESTING); hlist_add_head(&q->list, &hb_dest->chain); + spin_unlock(&hb_dest->chain_lock); } } + spin_unlock(&hb->chain_lock); } f->rebuild = false; f->last_rebuild_jiffies = jiffies; out: - write_unlock_bh(&f->lock); + write_sequnlock_bh(&f->rnd_seqlock); } static bool inet_fragq_should_evict(const struct inet_frag_queue *q) @@ -163,7 +177,7 @@ static void inet_frag_worker(struct work_struct *work) BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); - read_lock_bh(&f->lock); + local_bh_disable(); for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { evicted += inet_evict_bucket(f, &f->hash[i]); @@ -174,7 +188,8 @@ static void inet_frag_worker(struct work_struct *work) f->next_bucket = i; - read_unlock_bh(&f->lock); + local_bh_enable(); + if (f->rebuild && inet_frag_may_rebuild(f)) inet_frag_secret_rebuild(f); } @@ -197,7 +212,8 @@ void inet_frags_init(struct inet_frags *f) spin_lock_init(&hb->chain_lock); INIT_HLIST_HEAD(&hb->chain); } - rwlock_init(&f->lock); + + seqlock_init(&f->rnd_seqlock); f->last_rebuild_jiffies = 0; } EXPORT_SYMBOL(inet_frags_init); @@ -216,35 +232,56 @@ EXPORT_SYMBOL(inet_frags_fini); void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) { + unsigned int seq; int i; nf->low_thresh = 0; + local_bh_disable(); - read_lock_bh(&f->lock); +evict_again: + seq = read_seqbegin(&f->rnd_seqlock); for (i = 0; i < INETFRAGS_HASHSZ ; i++) inet_evict_bucket(f, &f->hash[i]); - read_unlock_bh(&f->lock); + if (read_seqretry(&f->rnd_seqlock, seq)) + goto evict_again; + + local_bh_enable(); percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +static struct inet_frag_bucket * +get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) +__acquires(hb->chain_lock) { struct inet_frag_bucket *hb; - unsigned int hash; + unsigned int seq, hash; + + restart: + seq = read_seqbegin(&f->rnd_seqlock); - read_lock(&f->lock); hash = inet_frag_hashfn(f, fq); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); + if (read_seqretry(&f->rnd_seqlock, seq)) { + spin_unlock(&hb->chain_lock); + goto restart; + } + + return hb; +} + +static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +{ + struct inet_frag_bucket *hb; + + hb = get_frag_bucket_locked(fq, f); hlist_del(&fq->list); spin_unlock(&hb->chain_lock); - - read_unlock(&f->lock); } void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -300,30 +337,18 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, void *arg) { - struct inet_frag_bucket *hb; + struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); struct inet_frag_queue *qp; - unsigned int hash; - - read_lock(&f->lock); /* Protects against hash rebuild */ - /* - * While we stayed w/o the lock other CPU could update - * the rnd seed, so we need to re-calculate the hash - * chain. Fortunatelly the qp_in can be used to get one. - */ - hash = inet_frag_hashfn(f, qp_in); - hb = &f->hash[hash]; - spin_lock(&hb->chain_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because - * such entry could be created on other cpu, while we - * released the hash bucket lock. + * such entry could have been created on other cpu before + * we acquired hash bucket lock. */ hlist_for_each_entry(qp, &hb->chain, list) { if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); qp_in->last_in |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; @@ -338,7 +363,6 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, hlist_add_head(&qp->list, &hb->chain); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); return qp; } @@ -382,7 +406,6 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) - __releases(&f->lock) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; @@ -399,19 +422,18 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); return q; } depth++; } spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); if (depth <= INETFRAGS_MAXDEPTH) return inet_frag_create(nf, f, key); if (inet_frag_may_rebuild(f)) { - f->rebuild = true; + if (!f->rebuild) + f->rebuild = true; inet_frag_schedule_worker(f); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 44e591a7e03f..ccee68dffd6e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -244,7 +244,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) arg.iph = iph; arg.user = user; - read_lock(&ip4_frags.lock); hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); -- cgit From 1bab4c75075b84675b96992ac47580a57c26958d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 24 Jul 2014 16:50:37 +0200 Subject: inet: frag: set limits and make init_net's high_thresh limit global This patch makes init_net's high_thresh limit to be the maximum for all namespaces, thus introducing a global memory limit threshold equal to the sum of the individual high_thresh limits which are capped. It also introduces some sane minimums for low_thresh as it shouldn't be able to drop below 0 (or > high_thresh in the unsigned case), and overall low_thresh should not ever be above high_thresh, so we make the following relations for a namespace: init_net: high_thresh - max(not capped), min(init_net low_thresh) low_thresh - max(init_net high_thresh), min (0) all other namespaces: high_thresh = max(init_net high_thresh), min(namespace's low_thresh) low_thresh = max(namespace's high_thresh), min(0) The major issue with having low_thresh > high_thresh is that we'll schedule eviction but never evict anything and thus rely only on the timers. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index ccee68dffd6e..634fc31aa243 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -700,14 +700,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", @@ -752,7 +755,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) goto err_alloc; table[0].data = &net->ipv4.frags.high_thresh; + table[0].extra1 = &net->ipv4.frags.low_thresh; + table[0].extra2 = &init_net.ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; + table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; /* Don't export sysctls to unprivileged users */ -- cgit From 20e61da7ffcfd84a1b6f797e745608572e5bc218 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 25 Jul 2014 15:25:08 -0700 Subject: ipv4: fail early when creating netdev named all or default We create a proc dir for each network device, this will cause conflicts when the devices have name "all" or "default". Rather than emitting an ugly kernel warning, we could just fail earlier by checking the device name. Reported-by: Stephane Chazelas Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e9449376b58e..214882e7d6de 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -180,11 +180,12 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); #ifdef CONFIG_SYSCTL -static void devinet_sysctl_register(struct in_device *idev); +static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else -static void devinet_sysctl_register(struct in_device *idev) +static int devinet_sysctl_register(struct in_device *idev) { + return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { @@ -232,6 +233,7 @@ EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; + int err = -ENOMEM; ASSERT_RTNL(); @@ -252,7 +254,13 @@ static struct in_device *inetdev_init(struct net_device *dev) /* Account for reference dev->ip_ptr (below) */ in_dev_hold(in_dev); - devinet_sysctl_register(in_dev); + err = devinet_sysctl_register(in_dev); + if (err) { + in_dev->dead = 1; + in_dev_put(in_dev); + in_dev = NULL; + goto out; + } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); @@ -260,7 +268,7 @@ static struct in_device *inetdev_init(struct net_device *dev) /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: - return in_dev; + return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; @@ -1347,8 +1355,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); - if (!in_dev) - return notifier_from_errno(-ENOMEM); + if (IS_ERR(in_dev)) + return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); @@ -2182,11 +2190,21 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) kfree(t); } -static void devinet_sysctl_register(struct in_device *idev) +static int devinet_sysctl_register(struct in_device *idev) { - neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); - __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, + int err; + + if (!sysctl_dev_name_is_allowed(idev->dev->name)) + return -EINVAL; + + err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); + if (err) + return err; + err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, &idev->cnf); + if (err) + neigh_sysctl_unregister(idev->arp_parms); + return err; } static void devinet_sysctl_unregister(struct in_device *idev) -- cgit From 5a8dbf03ddf6dfe465b9f062e5b969f9606643f5 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Sun, 27 Jul 2014 12:36:51 +0530 Subject: net/ipv4: Use IS_ERR_OR_NULL This patch introduces the use of the macro IS_ERR_OR_NULL in place of tests for NULL and IS_ERR. The following Coccinelle semantic patch was used for making the change: @@ expression e; @@ - e == NULL || IS_ERR(e) + IS_ERR_OR_NULL(e) || ... Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index f0bdd47bbbcb..6556263c8fa5 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -74,7 +74,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) { + if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); goto out; } -- cgit From 27446442a810f29d0fa97356bbc11f45e7ecfa6e Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Sun, 27 Jul 2014 12:38:38 +0530 Subject: net/udp_offload: Use IS_ERR_OR_NULL This patch introduces the use of the macro IS_ERR_OR_NULL in place of tests for NULL and IS_ERR. The following Coccinelle semantic patch was used for making the change: @@ expression e; @@ - e == NULL || IS_ERR(e) + IS_ERR_OR_NULL(e) || ... Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 4807544d018b..59035bc3008d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -79,7 +79,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) { + if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); goto out; -- cgit From c54a5e024777a2fd4b01239d58cf119901e8e5ed Mon Sep 17 00:00:00 2001 From: Karoly Kemeny Date: Sun, 27 Jul 2014 12:29:07 +0200 Subject: ipv4: clean up cast warning in do_ip_getsockopt Sparse warns because of implicit pointer cast. v2: subject line correction, space between "void" and "*" Signed-off-by: Karoly Kemeny Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 64741b938632..5cb830c78990 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1319,7 +1319,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = optval; + msg.msg_control = (__force void *) optval; msg.msg_controllen = len; msg.msg_flags = flags; -- cgit From 95cb5745983c222867cc9ac593aebb2ad67d72c0 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Tue, 29 Jul 2014 03:07:52 +0400 Subject: ip_tunnel(ipv4): fix tunnels with "local any remote $remote_ip" Ipv4 tunnels created with "local any remote $ip" didn't work properly since 7d442fab0 (ipv4: Cache dst in tunnels). 99% of packets sent via those tunnels had src addr = 0.0.0.0. That was because only dst_entry was cached, although fl4.saddr has to be cached too. Every time ip_tunnel_xmit used cached dst_entry (tunnel_rtable_get returned non-NULL), fl4.saddr was initialized with tnl_params->saddr (= 0 in our case), and wasn't changed until iptunnel_xmit(). This patch adds saddr to ip_tunnel->dst_cache, fixing this issue. Reported-by: Sergey Popov Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 6f9de61dce5f..45920d928341 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -69,23 +69,25 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) } static void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct dst_entry *dst) + struct dst_entry *dst, __be32 saddr) { struct dst_entry *old_dst; dst_clone(dst); old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); dst_release(old_dst); + idst->saddr = saddr; } -static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +static void tunnel_dst_set(struct ip_tunnel *t, + struct dst_entry *dst, __be32 saddr) { - __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); + __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst, saddr); } static void tunnel_dst_reset(struct ip_tunnel *t) { - tunnel_dst_set(t, NULL); + tunnel_dst_set(t, NULL, 0); } void ip_tunnel_dst_reset_all(struct ip_tunnel *t) @@ -93,20 +95,25 @@ void ip_tunnel_dst_reset_all(struct ip_tunnel *t) int i; for_each_possible_cpu(i) - __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); + __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); } EXPORT_SYMBOL(ip_tunnel_dst_reset_all); -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, + u32 cookie, __be32 *saddr) { + struct ip_tunnel_dst *idst; struct dst_entry *dst; rcu_read_lock(); - dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); + idst = this_cpu_ptr(t->dst_cache); + dst = rcu_dereference(idst->dst); if (dst && !atomic_inc_not_zero(&dst->__refcnt)) dst = NULL; if (dst) { - if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (!dst->obsolete || dst->ops->check(dst, cookie)) { + *saddr = idst->saddr; + } else { tunnel_dst_reset(t); dst_release(dst); dst = NULL; @@ -367,7 +374,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, &rt->dst); + tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -610,7 +617,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); - rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL; + rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -620,7 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, &rt->dst); + tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { -- cgit From 45a07695bc64b3ab5d6d2215f9677e5b8c05a7d0 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 29 Jul 2014 12:07:27 +0200 Subject: tcp: Fix integer-overflows in TCP veno In veno we do a multiplication of the cwnd and the rtt. This may overflow and thus their result is stored in a u64. However, we first need to cast the cwnd so that actually 64-bit arithmetic is done. A first attempt at fixing 76f1017757aa0 ([TCP]: TCP Veno congestion control) was made by 159131149c2 (tcp: Overflow bug in Vegas), but it failed to add the required cast in tcp_veno_cong_avoid(). Fixes: 76f1017757aa0 ([TCP]: TCP Veno congestion control) Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_veno.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 27b9825753d1..8276977d2c85 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -144,7 +144,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) rtt = veno->minrtt; - target_cwnd = (tp->snd_cwnd * veno->basertt); + target_cwnd = (u64)tp->snd_cwnd * veno->basertt; target_cwnd <<= V_PARAM_SHIFT; do_div(target_cwnd, rtt); -- cgit From 1f74e613ded11517db90b2bd57e9464d9e0fb161 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 29 Jul 2014 13:40:57 +0200 Subject: tcp: Fix integer-overflow in TCP vegas In vegas we do a multiplication of the cwnd and the rtt. This may overflow and thus their result is stored in a u64. However, we first need to cast the cwnd so that actually 64-bit arithmetic is done. Then, we need to do do_div to allow this to be used on 32-bit arches. Cc: Stephen Hemminger Cc: Neal Cardwell Cc: Eric Dumazet Cc: David Laight Cc: Doug Leith Fixes: 8d3a564da34e (tcp: tcp_vegas cong avoid fix) Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 9a5e05f27f4f..b40ad897f945 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -218,7 +218,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) * This is: * (actual rate in segments) * baseRTT */ - target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt; + target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT; + do_div(target_cwnd, rtt); /* Calculate the difference between the window we had, * and the window we would like to have. This quantity -- cgit From 388070faa1e1f6138c1ea0e82271c7ecb3f36477 Mon Sep 17 00:00:00 2001 From: "Banerjee, Debabrata" Date: Wed, 30 Jul 2014 13:50:17 -0400 Subject: tcp: don't require root to read tcp_metrics commit d23ff7016 (tcp: add generic netlink support for tcp_metrics) introduced netlink support for the new tcp_metrics, however it restricted getting of tcp_metrics to root user only. This is a change from how these values could have been fetched when in the old route cache. Unless there's a legitimate reason to restrict the reading of these values it would be better if normal users could fetch them. Cc: Julian Anastasov Cc: linux-kernel@vger.kernel.org Signed-off-by: Debabrata Banerjee Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4fe041805989..0d54e59b9ea8 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1093,7 +1093,6 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { .doit = tcp_metrics_nl_cmd_get, .dumpit = tcp_metrics_nl_dump, .policy = tcp_metrics_nl_policy, - .flags = GENL_ADMIN_PERM, }, { .cmd = TCP_METRICS_CMD_DEL, -- cgit From 7304fe4681634a8e0511a5922c972aa132ffb43d Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Thu, 31 Jul 2014 17:54:32 +0800 Subject: net: fix the counter ICMP_MIB_INERRORS/ICMP6_MIB_INERRORS When dealing with ICMPv[46] Error Message, function icmp_socket_deliver() and icmpv6_notify() do some valid checks on packet's length, but then some protocols check packet's length redaudantly. So remove those duplicated statements, and increase counter ICMP_MIB_INERRORS/ICMP6_MIB_INERRORS in function icmp_socket_deliver() and icmpv6_notify() respectively. In addition, add missed counter in udp6/udplite6 when socket is NULL. Signed-off-by: Duan Jiong Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 4 +++- net/ipv4/tcp_ipv4.c | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 42b7bcf8045b..092400ef88d0 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -663,8 +663,10 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) /* Checkin full IP header plus 8 bytes of protocol to * avoid additional coding at protocol handlers. */ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); return; + } raw_icmp_error(skb, protocol, info); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1edc739b9da5..d0ba39537d5c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -344,11 +344,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) int err; struct net *net = dev_net(icmp_skb->dev); - if (icmp_skb->len < (iph->ihl << 2) + 8) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); - return; - } - sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(icmp_skb)); if (!sk) { -- cgit From 4330487acfff0cf1d7b14d238583a182e0a444bb Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Fri, 1 Aug 2014 09:52:58 +0800 Subject: net: use inet6_iif instead of IP6CB()->iif Signed-off-by: Duan Jiong Signed-off-by: David S. Miller --- net/ipv4/ping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 044a0ddf6a79..a3c59a077a5f 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -911,7 +911,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sin6->sin6_flowinfo = ip6_flowinfo(ip6); sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - IP6CB(skb)->iif); + inet6_iif(skb)); *addr_len = sizeof(*sin6); } -- cgit From 188b1210f3d633e27dc97309c97315ce484122ec Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Fri, 1 Aug 2014 14:00:36 +0800 Subject: ipv4: remove nested rcu_read_lock/unlock ip_local_deliver_finish() already have a rcu_read_lock/unlock, so the rcu_read_lock/unlock is unnecessary. See the stack below: ip_local_deliver_finish | | ->icmp_rcv | | ->icmp_socket_deliver Suggested-by: Hannes Frederic Sowa Signed-off-by: Duan Jiong Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 092400ef88d0..ea7d4afe8205 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -670,11 +670,9 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) raw_icmp_error(skb, protocol, info); - rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); - rcu_read_unlock(); } static bool icmp_tag_validation(int proto) -- cgit From 06aa8b8a0345c78f4d9a1fb3f852952b12a0e40c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Aug 2014 12:29:44 +0200 Subject: inet: frags: rename last_in to flags The last_in field has been used to store various flags different from first/last frag in so give it a more descriptive name: flags. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 14 +++++++------- net/ipv4/ip_fragment.c | 20 ++++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 62b1f73749dc..e3ebc6608e5d 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -152,8 +152,8 @@ evict_again: } /* suppress xmit of (icmp) error packet */ - fq->last_in &= ~INET_FRAG_FIRST_IN; - fq->last_in |= INET_FRAG_EVICTED; + fq->flags &= ~INET_FRAG_FIRST_IN; + fq->flags |= INET_FRAG_EVICTED; hlist_del(&fq->list); hlist_add_head(&fq->list, &expired); ++evicted; @@ -289,16 +289,16 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) if (del_timer(&fq->timer)) atomic_dec(&fq->refcnt); - if (!(fq->last_in & INET_FRAG_COMPLETE)) { + if (!(fq->flags & INET_FRAG_COMPLETE)) { fq_unlink(fq, f); atomic_dec(&fq->refcnt); - fq->last_in |= INET_FRAG_COMPLETE; + fq->flags |= INET_FRAG_COMPLETE; } } EXPORT_SYMBOL(inet_frag_kill); static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, - struct sk_buff *skb) + struct sk_buff *skb) { if (f->skb_free) f->skb_free(skb); @@ -311,7 +311,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) struct netns_frags *nf; unsigned int sum, sum_truesize = 0; - WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); + WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. */ @@ -349,7 +349,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); spin_unlock(&hb->chain_lock); - qp_in->last_in |= INET_FRAG_COMPLETE; + qp_in->flags |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 634fc31aa243..6fce1ecc5bca 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -185,16 +185,16 @@ static void ip_expire(unsigned long arg) spin_lock(&qp->q.lock); - if (qp->q.last_in & INET_FRAG_COMPLETE) + if (qp->q.flags & INET_FRAG_COMPLETE) goto out; ipq_kill(qp); - if (!(qp->q.last_in & INET_FRAG_EVICTED)) + if (!(qp->q.flags & INET_FRAG_EVICTED)) IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { + if ((qp->q.flags & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; @@ -302,7 +302,7 @@ static int ip_frag_reinit(struct ipq *qp) } while (fp); sub_frag_mem_limit(&qp->q, sum_truesize); - qp->q.last_in = 0; + qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; @@ -323,7 +323,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) int err = -ENOENT; u8 ecn; - if (qp->q.last_in & INET_FRAG_COMPLETE) + if (qp->q.flags & INET_FRAG_COMPLETE) goto err; if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && @@ -350,9 +350,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * or have different end, the segment is corrupted. */ if (end < qp->q.len || - ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) + ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) goto err; - qp->q.last_in |= INET_FRAG_LAST_IN; + qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { @@ -362,7 +362,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ - if (qp->q.last_in & INET_FRAG_LAST_IN) + if (qp->q.flags & INET_FRAG_LAST_IN) goto err; qp->q.len = end; } @@ -471,13 +471,13 @@ found: qp->ecn |= ecn; add_frag_mem_limit(&qp->q, skb->truesize); if (offset == 0) - qp->q.last_in |= INET_FRAG_FIRST_IN; + qp->q.flags |= INET_FRAG_FIRST_IN; if (ip_hdr(skb)->frag_off & htons(IP_DF) && skb->len + ihl > qp->q.max_size) qp->q.max_size = skb->len + ihl; - if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { unsigned long orefdst = skb->_skb_refdst; -- cgit From f926e23660d52601089222cb4755aabc693ca390 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Aug 2014 12:29:46 +0200 Subject: inet: frags: fix function declaration alignments in inet_fragment Fix a couple of functions' declaration alignments. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e3ebc6608e5d..fa49916c23a0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -334,8 +334,9 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) EXPORT_SYMBOL(inet_frag_destroy); static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, struct inet_frags *f, - void *arg) + struct inet_frag_queue *qp_in, + struct inet_frags *f, + void *arg) { struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); struct inet_frag_queue *qp; @@ -368,7 +369,8 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, } static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, - struct inet_frags *f, void *arg) + struct inet_frags *f, + void *arg) { struct inet_frag_queue *q; @@ -393,7 +395,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) + struct inet_frags *f, + void *arg) { struct inet_frag_queue *q; @@ -405,7 +408,8 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, } struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash) + struct inet_frags *f, void *key, + unsigned int hash) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; -- cgit From 2e404f632f44979ddf0ce0808a438249a72d7015 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Aug 2014 12:29:47 +0200 Subject: inet: frags: use INET_FRAG_EVICTED to prevent icmp messages Now that we have INET_FRAG_EVICTED we might as well use it to stop sending icmp messages in the "frag_expire" functions instead of stripping INET_FRAG_FIRST_IN from their flags when evicting. Also fix the comment style in ip6_expire_frag_queue(). Signed-off-by: Nikolay Aleksandrov Reviewed-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 2 -- net/ipv4/ip_fragment.c | 14 +++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index fa49916c23a0..4baa76c60398 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -151,8 +151,6 @@ evict_again: goto evict_again; } - /* suppress xmit of (icmp) error packet */ - fq->flags &= ~INET_FRAG_FIRST_IN; fq->flags |= INET_FRAG_EVICTED; hlist_del(&fq->list); hlist_add_head(&fq->list, &expired); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 6fce1ecc5bca..cb56bcc1eee2 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -189,16 +189,18 @@ static void ip_expire(unsigned long arg) goto out; ipq_kill(qp); - - if (!(qp->q.flags & INET_FRAG_EVICTED)) - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - if ((qp->q.flags & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { + if (!(qp->q.flags & INET_FRAG_EVICTED)) { struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + + if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) + goto out; + rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) @@ -211,8 +213,7 @@ static void ip_expire(unsigned long arg) if (err) goto out_rcu_unlock; - /* - * Only an end host needs to send an ICMP + /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ if (qp->user == IP_DEFRAG_AF_PACKET || @@ -221,7 +222,6 @@ static void ip_expire(unsigned long arg) (skb_rtable(head)->rt_type != RTN_LOCAL))) goto out_rcu_unlock; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); out_rcu_unlock: -- cgit From d4ad4d22e7ac6b8711b35d7e86eb29f03f8ac153 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Aug 2014 12:29:48 +0200 Subject: inet: frags: use kmem_cache for inet_frag_queue Use kmem_cache to allocate/free inet_frag_queue objects since they're all the same size per inet_frags user and are alloced/freed in high volumes thus making it a perfect case for kmem_cache. Signed-off-by: Nikolay Aleksandrov Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 13 ++++++++++--- net/ipv4/ip_fragment.c | 5 ++++- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 4baa76c60398..9eb89f3f0ee4 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -198,7 +198,7 @@ static void inet_frag_schedule_worker(struct inet_frags *f) schedule_work(&f->frags_work); } -void inet_frags_init(struct inet_frags *f) +int inet_frags_init(struct inet_frags *f) { int i; @@ -213,6 +213,12 @@ void inet_frags_init(struct inet_frags *f) seqlock_init(&f->rnd_seqlock); f->last_rebuild_jiffies = 0; + f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, + NULL); + if (!f->frags_cachep) + return -ENOMEM; + + return 0; } EXPORT_SYMBOL(inet_frags_init); @@ -225,6 +231,7 @@ EXPORT_SYMBOL(inet_frags_init_net); void inet_frags_fini(struct inet_frags *f) { cancel_work_sync(&f->frags_work); + kmem_cache_destroy(f->frags_cachep); } EXPORT_SYMBOL(inet_frags_fini); @@ -327,7 +334,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) if (f->destructor) f->destructor(q); - kfree(q); + kmem_cache_free(f->frags_cachep, q); } EXPORT_SYMBOL(inet_frag_destroy); @@ -377,7 +384,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, return NULL; } - q = kzalloc(f->qsize, GFP_ATOMIC); + q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (q == NULL) return NULL; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cb56bcc1eee2..15f0e2bad7ad 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -55,6 +55,7 @@ */ static int sysctl_ipfrag_max_dist __read_mostly = 64; +static const char ip_frag_cache_name[] = "ip4-frags"; struct ipfrag_skb_cb { @@ -860,5 +861,7 @@ void __init ipfrag_init(void) ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; - inet_frags_init(&ip4_frags); + ip4_frags.frags_cache_name = ip_frag_cache_name; + if (inet_frags_init(&ip4_frags)) + panic("IP: failed to allocate ip4_frags cache\n"); } -- cgit From 64a124edcc94011ce459f0b9bdf51e1783146712 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Sun, 3 Aug 2014 22:45:19 +0400 Subject: tcp: md5: remove unneeded check in tcp_v4_parse_md5_keys tcpm_key is an array inside struct tcp_md5sig, there is no need to check it against NULL. Signed-off-by: Dmitry Popov Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d0ba39537d5c..992a1f926009 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1049,7 +1049,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, if (sin->sin_family != AF_INET) return -EINVAL; - if (!cmd.tcpm_key || !cmd.tcpm_keylen) + if (!cmd.tcpm_keylen) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, AF_INET); -- cgit From 5ae344c949e79b8545a11db149f0a85a6e59e1f3 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 4 Aug 2014 19:12:29 -0400 Subject: tcp: reduce spurious retransmits due to transient SACK reneging This commit reduces spurious retransmits due to apparent SACK reneging by only reacting to SACK reneging that persists for a short delay. When a sequence space hole at snd_una is filled, some TCP receivers send a series of ACKs as they apparently scan their out-of-order queue and cumulatively ACK all the packets that have now been consecutiveyly received. This is essentially misbehavior B in "Misbehaviors in TCP SACK generation" ACM SIGCOMM Computer Communication Review, April 2011, so we suspect that this is from several common OSes (Windows 2000, Windows Server 2003, Windows XP). However, this issue has also been seen in other cases, e.g. the netdev thread "TCP being hoodwinked into spurious retransmissions by lack of timestamps?" from March 2014, where the receiver was thought to be a BSD box. Since snd_una would temporarily be adjacent to a previously SACKed range in these scenarios, this receiver behavior triggered the Linux SACK reneging code path in the sender. This led the sender to clear the SACK scoreboard, enter CA_Loss, and spuriously retransmit (potentially) every packet from the entire write queue at line rate just a few milliseconds before the ACK for each packet arrives at the sender. To avoid such situations, now when a sender sees apparent reneging it does not yet retransmit, but rather adjusts the RTO timer to give the receiver a little time (max(RTT/2, 10ms)) to send us some more ACKs that will restore sanity to the SACK scoreboard. If the reneging persists until this RTO then, as before, we clear the SACK scoreboard and enter CA_Loss. A 10ms delay tolerates a receiver sending such a stream of ACKs at 56Kbit/sec. And to allow for receivers with slower or more congested paths, we wait for at least RTT/2. We validated the resulting max(RTT/2, 10ms) delay formula with a mix of North American and South American Google web server traffic, and found that for ACKs displaying transient reneging: (1) 90% of inter-ACK delays were less than 10ms (2) 99% of inter-ACK delays were less than RTT/2 In tests on Google web servers this commit reduced reneging events by 75%-90% (as measured by the TcpExtTCPSACKReneging counter), without any measurable impact on latency for user HTTP and SPDY requests. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 29 ++++++++++++++++++----------- net/ipv4/tcp_timer.c | 4 ++-- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7832d941dbcd..6a2984507755 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1904,16 +1904,17 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->sacked_out = 0; } -/* Enter Loss state. If "how" is not zero, forget all SACK information +/* Enter Loss state. If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ -void tcp_enter_loss(struct sock *sk, int how) +void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; bool new_recovery = false; + bool is_reneg; /* is receiver reneging on SACKs? */ /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1934,7 +1935,11 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_reset_reno_sack(tp); tp->undo_marker = tp->snd_una; - if (how) { + + skb = tcp_write_queue_head(sk); + is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); + if (is_reneg) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; tp->fackets_out = 0; } @@ -1948,7 +1953,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1981,19 +1986,21 @@ void tcp_enter_loss(struct sock *sk, int how) * remembered SACKs do not reflect real state of receiver i.e. * receiver _host_ is heavily congested (or buggy). * - * Do processing similar to RTO timeout. + * To avoid big spurious retransmission bursts due to transient SACK + * scoreboard oddities that look like reneging, we give the receiver a + * little time (max(RTT/2, 10ms)) to send us some more ACKs that will + * restore sanity to the SACK scoreboard. If the apparent reneging + * persists until this RTO then we'll clear the SACK scoreboard. */ static bool tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { - struct inet_connection_sock *icsk = inet_csk(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); + struct tcp_sock *tp = tcp_sk(sk); + unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), + msecs_to_jiffies(10)); - tcp_enter_loss(sk, 1); - icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, TCP_RTO_MAX); + delay, TCP_RTO_MAX); return true; } return false; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 286227abed10..df90cd1ce37f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -391,7 +391,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_write_err(sk); goto out; } - tcp_enter_loss(sk, 0); + tcp_enter_loss(sk); tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); __sk_dst_reset(sk); goto out_reset_timer; @@ -422,7 +422,7 @@ void tcp_retransmit_timer(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), mib_idx); } - tcp_enter_loss(sk, 0); + tcp_enter_loss(sk); if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { /* Retransmission failed because of local congestion, -- cgit From 09c2d251b70723650ba47e83571ff49281320f7c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:47 -0400 Subject: net-timestamp: add key to disambiguate concurrent datagrams Datagrams timestamped on transmission can coexist in the kernel stack and be reordered in packet scheduling. When reading looped datagrams from the socket error queue it is not always possible to unique correlate looped data with original send() call (for application level retransmits). Even if possible, it may be expensive and complex, requiring packet inspection. Introduce a data-independent ID mechanism to associate timestamps with send calls. Pass an ID alongside the timestamp in field ee_data of sock_extended_err. The ID is a simple 32 bit unsigned int that is associated with the socket and incremented on each send() call for which software tx timestamp generation is enabled. The feature is enabled only if SOF_TIMESTAMPING_OPT_ID is set, to avoid changing ee_data for existing applications that expect it 0. The counter is reset each time the flag is reenabled. Reenabling does not change the ID of already submitted data. It is possible to receive out of order IDs if the timestamp stream is not quiesced first. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b16556836d66..215af2b155cb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + u32 tskey = 0; skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->fragsize; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -976,6 +980,8 @@ alloc_new_skb: /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; /* * Find where to start putting bytes. -- cgit From 4ed2d765dfaccff5ebdac68e2064b59125033a3b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:49 -0400 Subject: net-timestamp: TCP timestamping TCP timestamping extends SO_TIMESTAMPING to bytestreams. Bytestreams do not have a 1:1 relationship between send() buffers and network packets. The feature interprets a send call on a bytestream as a request for a timestamp for the last byte in that send() buffer. The choice corresponds to a request for a timestamp when all bytes in the buffer have been sent. That assumption depends on in-order kernel transmission. This is the common case. That said, it is possible to construct a traffic shaping tree that would result in reordering. The guarantee is strong, then, but not ironclad. This implementation supports send and sendpages (splice). GSO replaces one large packet with multiple smaller packets. This patch also copies the option into the correct smaller packet. This patch does not yet support timestamping on data in an initial TCP Fast Open SYN, because that takes a very different data path. If ID generation in ee_data is enabled, bytestream timestamps return a byte offset, instead of the packet counter for datagrams. The implementation supports a single timestamp per packet. It silenty replaces requests for previous timestamps. To avoid missing tstamps, flush the tcp queue by disabling Nagle, cork and autocork. Missing tstamps can be detected by offset when the ee_data ID is enabled. Implementation details: - On GSO, the timestamping code can be included in the main loop. I moved it into its own loop to reduce the impact on the common case to a single branch. - To avoid leaking the absolute seqno to userspace, the offset returned in ee_data must always be relative. It is an offset between an skb and sk field. The first is always set (also for GSO & ACK). The second must also never be uninitialized. Only allow the ID option on sockets in the ESTABLISHED state, for which the seqno is available. Never reset it to zero (instead, move it to the current seqno when reenabling the option). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 22 +++++++++++++++++++--- net/ipv4/tcp_offload.c | 18 ++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9d2118e5fbc7..744af67a5989 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -426,6 +426,15 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); +void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + sock_tx_timestamp(sk, &shinfo->tx_flags); + if (shinfo->tx_flags & SKBTX_ANY_SW_TSTAMP) + shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; +} + /* * Wait for a TCP event. * @@ -523,7 +532,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err) + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; return mask; @@ -959,8 +968,10 @@ new_segment: copied += copy; offset += copy; - if (!(size -= copy)) + if (!(size -= copy)) { + tcp_tx_timestamp(sk, skb); goto out; + } if (skb->len < size_goal || (flags & MSG_OOB)) continue; @@ -1252,8 +1263,10 @@ new_segment: from += copy; copied += copy; - if ((seglen -= copy) == 0 && iovlen == 0) + if ((seglen -= copy) == 0 && iovlen == 0) { + tcp_tx_timestamp(sk, skb); goto out; + } if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) continue; @@ -1617,6 +1630,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sk_buff *skb; u32 urg_hole = 0; + if (unlikely(flags & MSG_ERRQUEUE)) + return ip_recv_error(sk, msg, len, addr_len); + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) sk_busy_loop(sk, nonblock); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 55046ecd083e..f597119fc4e7 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -14,6 +14,21 @@ #include #include +void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, unsigned int seq, + unsigned int mss) +{ + while (skb) { + if (ts_seq < (__u64) seq + mss) { + skb_shinfo(skb)->tx_flags = SKBTX_SW_TSTAMP; + skb_shinfo(skb)->tskey = ts_seq; + return; + } + + skb = skb->next; + seq += mss; + } +} + struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -91,6 +106,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th = tcp_hdr(skb); seq = ntohl(th->seq); + if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP)) + tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss); + newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); -- cgit From e1c8a607b28190cd09a271508aa3025d3c2f312e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:50 -0400 Subject: net-timestamp: ACK timestamp for bytestreams Add SOF_TIMESTAMPING_TX_ACK, a request for a tstamp when the last byte in the send() call is acknowledged. It implements the feature for TCP. The timestamp is generated when the TCP socket cumulative ACK is moved beyond the tracked seqno for the first time. The feature ignores SACK and FACK, because those acknowledge the specific byte, but not necessarily the entire contents of the buffer up to that byte. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6a2984507755..a3d47af01906 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -74,6 +74,7 @@ #include #include #include +#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -3106,6 +3107,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->retrans_stamp = 0; } + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) && + between(skb_shinfo(skb)->tskey, prior_snd_una, + tp->snd_una + 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + if (!fully_acked) break; -- cgit