From d215d10f2d6bd41ce9d11a2707568bbb50d2c32e Mon Sep 17 00:00:00 2001 From: "Peter Pan(潘卫平)" Date: Mon, 16 Jun 2014 21:57:22 +0800 Subject: net: delete duplicate dev_set_rx_mode() call __dev_open() already calls dev_set_rx_mode(), and dev_set_rx_mode() has no effect for a net device which does not have the IFF_UP flag set. So the call of dev_set_rx_mode() in __dev_change_flags() is a duplicate. Signed-off-by: Weiping Pan Signed-off-by: David S. Miller --- net/core/dev.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 30eedf677913..a04b12f31e18 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5432,13 +5432,9 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + if ((old_flags ^ flags) & IFF_UP) ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); - if (!ret) - dev_set_rx_mode(dev); - } - if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; -- cgit From e0f31d8498676fda36289603a054d0d490aa2679 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan <_govind@gmx.com> Date: Mon, 23 Jun 2014 16:07:58 +0530 Subject: flow_keys: Record IP layer protocol in skb_flow_dissect() skb_flow_dissect() dissects only the transport header type into ip_proto. It does not give any information about IPv4 or IPv6. This patch adds a new member, n_proto, to struct flow_keys, which records the IP layer type, i.e. IPv4 or IPv6. This can be used in a driver's netdev->ndo_rx_flow_steer function to dissect the flow. Adding the new member to flow_keys increases the struct size by around 4 bytes. This causes BUILD_BUG_ON(sizeof(qcb->data) < sz); to fail in qdisc_cb_private_validate(), so increase the data size by 4. Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 107ed12a5323..c2b53c1b21d2 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -175,6 +175,7 @@ ipv6: break; } + flow->n_proto = proto; flow->ip_proto = ip_proto; flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); flow->thoff = (u16) nhoff; -- cgit From f6d8cb2eeded7df18b821a321d4cd1cdd1754bf8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Jun 2014 05:32:48 -0700 Subject: inet: reduce TLB pressure for listeners It seems overkill to use vmalloc() for typical listeners with less than 2048 hash buckets. Try kmalloc() and fall back to vmalloc() to reduce TLB pressure. Use the kvfree() helper as it is now available. Use ilog2() instead of a loop. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller
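(Editor's note: distilled, the allocation strategy this patch adopts looks roughly like the sketch below. This is a sketch only — table_alloc is a hypothetical helper name; the real change is open-coded in reqsk_queue_alloc() in the diff that follows.)

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Try kzalloc() for sizes the page allocator can satisfy cheaply,
 * fall back to vzalloc() otherwise; kvfree() releases either kind. */
static void *table_alloc(size_t size)
{
	void *p = NULL;

	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
		p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (!p)
		p = vzalloc(size);	/* vmalloc area: extra TLB pressure */
	return p;			/* caller frees with kvfree(p) */
}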
--- net/core/request_sock.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 467f326126e0..04db318e6218 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -41,27 +41,27 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, unsigned int nr_table_entries) { size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt; + struct listen_sock *lopt = NULL; nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); nr_table_entries = max_t(u32, nr_table_entries, 8); nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); - if (lopt_size > PAGE_SIZE) + + if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + lopt = kzalloc(lopt_size, GFP_KERNEL | + __GFP_NOWARN | + __GFP_NORETRY); + if (!lopt) lopt = vzalloc(lopt_size); - else - lopt = kzalloc(lopt_size, GFP_KERNEL); - if (lopt == NULL) + if (!lopt) return -ENOMEM; - for (lopt->max_qlen_log = 3; - (1 << lopt->max_qlen_log) < nr_table_entries; - lopt->max_qlen_log++); - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; lopt->nr_table_entries = nr_table_entries; + lopt->max_qlen_log = ilog2(nr_table_entries); write_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; @@ -72,22 +72,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, void __reqsk_queue_destroy(struct request_sock_queue *queue) { - struct listen_sock *lopt; - size_t lopt_size; - - /* - * this is an error recovery path only - * no locking needed and the lopt is not NULL - */ - - lopt = queue->listen_opt; - lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); - - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); + /* This is an error recovery path only, no locking needed */ + kvfree(queue->listen_opt); } static inline struct listen_sock *reqsk_queue_yank_listen_sk( @@ -107,8 +93,6 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) { /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - size_t lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); if (lopt->qlen != 0) { unsigned int i; @@ -125,10 +109,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) } WARN_ON(lopt->qlen != 0); - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); + kvfree(lopt); } /* -- cgit From 9bf2b8c280b5c02ca8a9e75263bf3ca998fed144 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 26 Jun 2014 15:56:31 +0800 Subject: net: fix some typos in comment In commit 371121057607e3127e19b3fa094330181b5b031e ("net: QDISC_STATE_RUNNING dont need atomic bit ops"), __QDISC_STATE_RUNNING was renamed to __QDISC___STATE_RUNNING, but the old name appearing in a comment was not completely replaced with the new one. Signed-off-by: Ying Xue Signed-off-by: David S.
Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a04b12f31e18..de9774119541 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2738,8 +2738,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. - * This permits __QDISC_STATE_RUNNING owner to get the lock more often - * and dequeue packets faster. + * This permits __QDISC___STATE_RUNNING owner to get the lock more + * often and dequeue packets faster. */ contended = qdisc_is_running(q); if (unlikely(contended)) -- cgit From b0ab2fabb5b91da99c189db02e91ae10bc8355c5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 26 Jun 2014 09:58:25 +0200 Subject: rtnetlink: allow to register ops without ops->setup set So far, it is assumed that ops->setup is filled in. But there might be cases where ops make sense even without ->setup. In that case, forbid newlink and dellink. This allows registering simple rtnl link ops containing only ->kind, which gives a consistent way of passing the device kind (either device-kind or slave-kind) to userspace. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/rtnetlink.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index de9774119541..6e2a2cd82321 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7091,7 +7091,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1063996f8317..27acaf7ff6d7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -299,7 +299,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (rtnl_link_ops_get(ops->kind)) return -EEXIST; - if (!ops->dellink) + /* The check for setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. + */ + if (ops->setup && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); @@ -1777,7 +1782,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) return -ENODEV; ops = dev->rtnl_link_ops; - if (!ops) + if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); @@ -2038,6 +2043,9 @@ replay: return -EOPNOTSUPP; } + if (!ops->setup) + return -EOPNOTSUPP; + if (!ifname[0]) snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); -- cgit From baac167b706600ebe7158acaeb7c489ae9d0bb8b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 26 Jun 2014 13:16:49 +0200 Subject: pktgen: avoid expensive set_current_state() call in loop Avoid calling set_current_state() inside the busy loop in pktgen_thread_worker(). In the pkt_dev->delay case, it is still used/enabled in pktgen_xmit() via the spin() call. set_current_state(TASK_INTERRUPTIBLE) uses an xchg, which is implicitly LOCK prefixed. I've measured the asm LOCK operation to take approx 8ns on this E5-2630 CPU.
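(Editor's note: a minimal sketch of the kthread pattern the patch below moves to — hypothetical code, not the actual pktgen loop; have_work()/do_work() are made-up stand-ins for next_to_run()/pktgen_xmit(). The point is that set_current_state() implies a full memory barrier — the implicitly LOCKed xchg measured above — while __set_current_state() is a plain store, so the barrier cost is only paid on the path that really prepares to sleep.)

#include <linux/kthread.h>
#include <linux/sched.h>

static bool have_work(void);	/* made-up: anything to transmit? */
static void do_work(void);	/* made-up: the transmit step */

static int worker_fn(void *arg)
{
	__set_current_state(TASK_RUNNING);	/* plain store, no LOCK */

	while (!kthread_should_stop()) {
		if (have_work()) {
			do_work();		/* hot path: no state flip */
			continue;
		}
		set_current_state(TASK_INTERRUPTIBLE);	/* LOCKed xchg */
		if (!have_work() && !kthread_should_stop())
			schedule();		/* really go to sleep */
		__set_current_state(TASK_RUNNING);
	}
	return 0;
}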
The performance increase correlates with this measurement. Performance data with CLONE_SKB==100000, rx-usecs=30: (single CPU performance, ixgbe 10Gbit/s, E5-2630) * Prev: 5454050 pps --> 183.35ns (1/5454050*10^9) * Now: 5684009 pps --> 175.93ns (1/5684009*10^9) * Diff: +229959 pps --> -7.42ns Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fc17a9d309ac..b61f553689b6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3407,10 +3407,10 @@ static int pktgen_thread_worker(void *arg) pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); - set_current_state(TASK_INTERRUPTIBLE); - set_freezable(); + __set_current_state(TASK_RUNNING); + while (!kthread_should_stop()) { pkt_dev = next_to_run(t); @@ -3424,8 +3424,6 @@ static int pktgen_thread_worker(void *arg) continue; } - __set_current_state(TASK_RUNNING); - if (likely(pkt_dev)) { pktgen_xmit(pkt_dev); @@ -3456,9 +3454,8 @@ static int pktgen_thread_worker(void *arg) } try_to_freeze(); - - set_current_state(TASK_INTERRUPTIBLE); } + set_current_state(TASK_INTERRUPTIBLE); pr_debug("%s stopping all device\n", t->tsk->comm); pktgen_stop(t); -- cgit From 8788370a1d4b1d89efc1aea1b13ea5e5bfe10fde Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 26 Jun 2014 13:16:59 +0200 Subject: pktgen: RCU-ify "if_list" to remove lock in next_to_run() The if_lock()/if_unlock() in next_to_run() adds a significant overhead, because it's called for every packet in the busy loop of pktgen_thread_worker(). (Thomas Graf originally pointed me at this lock problem.) Removing these two "LOCK" operations should in theory save us approx 16ns (8ns x 2), and as illustrated below we do save 16ns when removing the locks and introducing RCU protection. Performance data with CLONE_SKB==100000, TX-size=512, rx-usecs=30: (single CPU performance, ixgbe 10Gbit/s, E5-2630) * Prev : 5684009 pps --> 175.93ns (1/5684009*10^9) * RCU-fix: 6272204 pps --> 159.43ns (1/6272204*10^9) * Diff : +588195 pps --> -16.50ns To understand this RCU patch, I describe the pktgen thread model below. In pktgen there are several kernel threads, but only one CPU running each kernel thread. Communication with the kernel threads is done through some thread control flags. This allows the thread to change data structures at a known synchronization point, see the main thread func pktgen_thread_worker(). Userspace changes are communicated through proc-file writes. There are three types of changes: general control changes "pgctrl" (func:pgctrl_write), thread changes "kpktgend_X" (func:pktgen_thread_write), and interface config changes "etcX@N" (func:pktgen_if_write). Userspace "pgctrl" and "thread" changes are synchronized via the mutex pktgen_thread_lock, thus only a single userspace instance can run. The mutex is taken while the packet generator is running, by pgctrl "start". Thus e.g. "add_device" cannot be invoked when pktgen is running/started. All "pgctrl" and all "thread" changes, except thread "add_device", communicate via the thread control flags. The main problem is the exception "add_device", which modifies the thread's "if_list" directly. Fortunately "add_device" cannot be invoked while pktgen is running. But there exists a race between "rem_device_all" and "add_device" (which normally doesn't occur, because "rem_device_all" waits 125ms before returning).
Background'ing "rem_device_all" and running "add_device" immediately allow the race to occur. The race affects the threads (list of devices) "if_list". The if_lock is used for protecting this "if_list". Other readers are given lock-free access to the list under RCU read sections. Note, interface config changes (via proc) can occur while pktgen is running, which worries me a bit. I'm assuming proc_remove() takes appropriate locks, to assure no writers exists after proc_remove() finish. I've been running a script exercising the race condition (leading me to fix the proc_remove order), without any issues. The script also exercises concurrent proc writes, while the interface config is getting removed. Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/pktgen.c | 101 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 49 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b61f553689b6..5b05e364b8ae 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -69,8 +69,9 @@ * for running devices in the if_list and sends packets until count is 0 it * also the thread checks the thread->control which is used for inter-process * communication. controlling process "posts" operations to the threads this - * way. The if_lock should be possible to remove when add/rem_device is merged - * into this too. + * way. + * The if_list is RCU protected, and the if_lock remains to protect updating + * of if_list, from "add_device" as it invoked from userspace (via proc write). * * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. Understood by "write" @@ -208,7 +209,7 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ -/* If lock -- can be removed after some work */ +/* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); #define if_unlock(t) spin_unlock(&(t->if_lock)); @@ -241,6 +242,7 @@ struct pktgen_dev { struct proc_dir_entry *entry; /* proc file */ struct pktgen_thread *pg_thread;/* the owner */ struct list_head list; /* chaining in the thread's run-queue */ + struct rcu_head rcu; /* freed by RCU */ int running; /* if false, the test will stop */ @@ -1737,14 +1739,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) seq_puts(seq, "Running: "); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); seq_puts(seq, "\nStopped: "); - list_for_each_entry(pkt_dev, &t->if_list, list) + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (!pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); @@ -1753,7 +1755,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) else seq_puts(seq, "\nResult: NA\n"); - if_unlock(t); + rcu_read_unlock(); return 0; } @@ -1878,10 +1880,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, pkt_dev = pktgen_find_dev(t, ifname, exact); if (pkt_dev) { if (remove) { - if_lock(t); pkt_dev->removal_mark = 1; t->control |= T_REMDEV; - if_unlock(t); } break; } @@ -1931,7 +1931,8 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d list_for_each_entry(t, &pn->pktgen_threads, th_list) { struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) { + 
rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (pkt_dev->odev != dev) continue; @@ -1946,6 +1947,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d dev->name); break; } + rcu_read_unlock(); } } @@ -2997,8 +2999,8 @@ static void pktgen_run(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { /* * setup odev and create initial packet. @@ -3007,18 +3009,18 @@ static void pktgen_run(struct pktgen_thread *t) if (pkt_dev->odev) { pktgen_clear_counters(pkt_dev); - pkt_dev->running = 1; /* Cranke yeself! */ pkt_dev->skb = NULL; pkt_dev->started_at = pkt_dev->next_tx = ktime_get(); set_pkt_overhead(pkt_dev); strcpy(pkt_dev->result, "Starting"); + pkt_dev->running = 1; /* Cranke yeself! */ started++; } else strcpy(pkt_dev->result, "Error starting"); } - if_unlock(t); + rcu_read_unlock(); if (started) t->control &= ~(T_STOP); } @@ -3041,27 +3043,25 @@ static int thread_is_running(const struct pktgen_thread *t) { const struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) - if (pkt_dev->running) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) + if (pkt_dev->running) { + rcu_read_unlock(); return 1; + } + rcu_read_unlock(); return 0; } static int pktgen_wait_thread_run(struct pktgen_thread *t) { - if_lock(t); - while (thread_is_running(t)) { - if_unlock(t); - msleep_interruptible(100); if (signal_pending(current)) goto signal; - if_lock(t); } - if_unlock(t); return 1; signal: return 0; @@ -3166,10 +3166,10 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) return -EINVAL; } + pkt_dev->running = 0; kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; pkt_dev->stopped_at = ktime_get(); - pkt_dev->running = 0; show_results(pkt_dev, nr_frags); @@ -3180,9 +3180,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) { struct pktgen_dev *pkt_dev, *best = NULL; - if_lock(t); - - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (!pkt_dev->running) continue; if (best == NULL) @@ -3190,7 +3189,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0) best = pkt_dev; } - if_unlock(t); + rcu_read_unlock(); + return best; } @@ -3200,13 +3200,13 @@ static void pktgen_stop(struct pktgen_thread *t) func_enter(); - if_lock(t); + rcu_read_lock(); - list_for_each_entry(pkt_dev, &t->if_list, list) { + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); } - if_unlock(t); + rcu_read_unlock(); } /* @@ -3220,8 +3220,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3235,8 +3233,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) break; } - - if_unlock(t); } static void pktgen_rem_all_ifs(struct pktgen_thread *t) @@ -3248,8 +3244,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) /* Remove all devices, free mem */ - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3258,8 +3252,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) pktgen_remove_device(t, cur); } - - if_unlock(t); } static void pktgen_rem_thread(struct pktgen_thread *t) @@ -3482,8 +3474,8 @@ static struct pktgen_dev 
*pktgen_find_dev(struct pktgen_thread *t, struct pktgen_dev *p, *pkt_dev = NULL; size_t len = strlen(ifname); - if_lock(t); - list_for_each_entry(p, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(p, &t->if_list, list) if (strncmp(p->odevname, ifname, len) == 0) { if (p->odevname[len]) { if (exact || p->odevname[len] != '@') @@ -3493,7 +3485,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, break; } - if_unlock(t); + rcu_read_unlock(); pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev); return pkt_dev; } @@ -3507,6 +3499,12 @@ static int add_dev_to_thread(struct pktgen_thread *t, { int rv = 0; + /* This function cannot be called concurrently, as its called + * under pktgen_thread_lock mutex, but it can run from + * userspace on another CPU than the kthread. The if_lock() + * is used here to sync with concurrent instances of + * _rem_dev_from_if_list() invoked via kthread, which is also + * updating the if_list */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3515,9 +3513,9 @@ static int add_dev_to_thread(struct pktgen_thread *t, goto out; } - list_add(&pkt_dev->list, &t->if_list); - pkt_dev->pg_thread = t; pkt_dev->running = 0; + pkt_dev->pg_thread = t; + list_add_rcu(&pkt_dev->list, &t->if_list); out: if_unlock(t); @@ -3672,11 +3670,13 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t, struct list_head *q, *n; struct pktgen_dev *p; + if_lock(t); list_for_each_safe(q, n, &t->if_list) { p = list_entry(q, struct pktgen_dev, list); if (p == pkt_dev) - list_del(&p->list); + list_del_rcu(&p->list); } + if_unlock(t); } static int pktgen_remove_device(struct pktgen_thread *t, @@ -3696,20 +3696,22 @@ static int pktgen_remove_device(struct pktgen_thread *t, pkt_dev->odev = NULL; } - /* And update the thread if_list */ - - _rem_dev_from_if_list(t, pkt_dev); - + /* Remove proc before if_list entry, because add_device uses + * list to determine if interface already exist, avoid race + * with proc_create_data() */ if (pkt_dev->entry) proc_remove(pkt_dev->entry); + /* And update the thread if_list */ + _rem_dev_from_if_list(t, pkt_dev); + #ifdef CONFIG_XFRM free_SAs(pkt_dev); #endif vfree(pkt_dev->flows); if (pkt_dev->page) put_page(pkt_dev->page); - kfree(pkt_dev); + kfree_rcu(pkt_dev, rcu); return 0; } @@ -3809,6 +3811,7 @@ static void __exit pg_cleanup(void) { unregister_netdevice_notifier(&pktgen_notifier_block); unregister_pernet_subsys(&pg_net_ops); + /* Don't need rcu_barrier() due to use of kfree_rcu() */ } module_init(pg_init); -- cgit From b9c701edc7ff6ddd522497b0194b0bc3ec1b51a9 Mon Sep 17 00:00:00 2001 From: Stefan Sørensen Date: Fri, 27 Jun 2014 11:59:09 +0200 Subject: net: Simplify ptp class checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace two switch statements enumerating all valid ptp classes with an if statement matching for not PTP_CLASS_NONE. Signed-off-by: Stefan Sørensen Signed-off-by: David S. 
Miller --- net/core/timestamping.c | 57 +++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 6521dfd8b7c8..a8770391ea5b 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -43,31 +43,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) return; type = classify(skb); + if (type == PTP_CLASS_NONE) + return; + + phydev = skb->dev->phydev; + if (likely(phydev->drv->txtstamp)) { + if (!atomic_inc_not_zero(&sk->sk_refcnt)) + return; - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) - return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; - phydev->drv->txtstamp(phydev, clone, type); + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return; } - break; - default: - break; + + clone->sk = sk; + phydev->drv->txtstamp(phydev, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); @@ -114,20 +105,12 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) __skb_pull(skb, ETH_HLEN); - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->rxtstamp)) - return phydev->drv->rxtstamp(phydev, skb, type); - break; - default: - break; - } + if (type == PTP_CLASS_NONE) + return false; + + phydev = skb->dev->phydev; + if (likely(phydev->drv->rxtstamp)) + return phydev->drv->rxtstamp(phydev, skb, type); return false; } -- cgit From ae5c6c6d7bcadfbedefb5fc8ff0ebe2bfa83a0a1 Mon Sep 17 00:00:00 2001 From: Stefan Sørensen Date: Fri, 27 Jun 2014 11:59:10 +0200 Subject: ptp: Classify ptp over ip over vlan packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This extends the ptp bpf to also match ptp over ip over vlan packets. The ptp classes are changed to orthogonal bitfields representing version, transport and vlan values to simplify matching. Signed-off-by: Stefan Sørensen Signed-off-by: David S. Miller --- net/core/ptp_classifier.c | 64 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index d3027a73fd4b..12ab7b4be609 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -52,14 +52,43 @@ * test_8021q: * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ? * ldh [16] ; load inner type - * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ? * ldb [18] ; load payload * and #0x8 ; as we don't have ports here, test * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these * ldh [18] ; reload payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x40 ; PTP_CLASS_V2_VLAN + * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2 + * ret a ; return PTP class + * + * ; PTP over UDP over IPv4 over 802.1Q over Ethernet + * test_8021q_ipv4: + * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ? + * ldb [27] ; load proto + * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ? 
+ * ldh [24] ; load frag offset field + * jset #0x1fff, drop_8021q_ipv4; don't allow fragments + * ldxb 4*([18]&0xf) ; load IP header len + * ldh [x + 20] ; load UDP dst port + * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 26] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over 802.1Q over Ethernet + * test_8021q_ipv6: + * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ? + * ldb [24] ; load proto + * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ? + * ldh [60] ; load UDP dst port + * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ? + * ldh [66] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 * ret a ; return PTP class + * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE * * ; PTP over Ethernet * test_ieee1588: @@ -113,16 +142,39 @@ void __init ptp_classifier_init(void) { 0x44, 0, 0, 0x00000020 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, - { 0x15, 0, 9, 0x00008100 }, + { 0x15, 0, 32, 0x00008100 }, { 0x28, 0, 0, 0x00000010 }, - { 0x15, 0, 15, 0x000088f7 }, + { 0x15, 0, 7, 0x000088f7 }, { 0x30, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x00000008 }, - { 0x15, 0, 12, 0x00000000 }, + { 0x15, 0, 35, 0x00000000 }, { 0x28, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000040 }, + { 0x44, 0, 0, 0x00000070 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x0000001b }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000018 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x00000012 }, + { 0x48, 0, 0, 0x00000014 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x0000001a }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000050 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000018 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x0000003c }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x00000042 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000060 }, { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, { 0x15, 0, 7, 0x000088f7 }, { 0x30, 0, 0, 0x0000000e }, { 0x54, 0, 0, 0x00000008 }, -- cgit From 5ed20a68cd6ca4adc0aa2d240913d604a2eb3e25 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:05 -0700 Subject: flow_dissector: Abstract out hash computation Move the hash computation located in __skb_get_hash to be a separate function which takes flow_keys as input. This will allow flow hash computation in other contexts where we only have addresses and ports. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c2b53c1b21d2..2ff8cd4dfc5f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -202,6 +202,33 @@ static __always_inline u32 __flow_hash_1word(u32 a) return jhash_1word(a, hashrnd); } +static inline u32 __flow_hash_from_keys(struct flow_keys *keys) +{ + u32 hash; + + /* get a consistent hash (same value on both flow directions) */ + if (((__force u32)keys->dst < (__force u32)keys->src) || + (((__force u32)keys->dst == (__force u32)keys->src) && + ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { + swap(keys->dst, keys->src); + swap(keys->port16[0], keys->port16[1]); + } + + hash = __flow_hash_3words((__force u32)keys->dst, + (__force u32)keys->src, + (__force u32)keys->ports); + if (!hash) + hash = 1; + + return hash; +} + +u32 flow_hash_from_keys(struct flow_keys *keys) +{ + return __flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(flow_hash_from_keys); + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value @@ -211,7 +238,6 @@ static __always_inline u32 __flow_hash_1word(u32 a) void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - u32 hash; if (!skb_flow_dissect(skb, &keys)) return; @@ -219,21 +245,7 @@ void __skb_get_hash(struct sk_buff *skb) if (keys.ports) skb->l4_hash = 1; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys.dst < (__force u32)keys.src) || - (((__force u32)keys.dst == (__force u32)keys.src) && - ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { - swap(keys.dst, keys.src); - swap(keys.port16[0], keys.port16[1]); - } - - hash = __flow_hash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports); - if (!hash) - hash = 1; - - skb->hash = hash; + skb->hash = __flow_hash_from_keys(&keys); } EXPORT_SYMBOL(__skb_get_hash); -- cgit From 0e001614e849b68cff94cda8db8b550569d3dba6 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:27 -0700 Subject: net: Call skb_get_hash in get_xps_queue and __skb_tx_hash Call standard function to get a packet hash instead of taking this from skb->sk->sk_hash or only using skb->protocol. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2ff8cd4dfc5f..62d1cb624f53 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -196,12 +196,6 @@ static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) return jhash_3words(a, b, c, hashrnd); } -static __always_inline u32 __flow_hash_1word(u32 a) -{ - __flow_hash_secret_init(); - return jhash_1word(a, hashrnd); -} - static inline u32 __flow_hash_from_keys(struct flow_keys *keys) { u32 hash; @@ -253,7 +247,7 @@ EXPORT_SYMBOL(__skb_get_hash); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. 
*/ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, unsigned int num_tx_queues) { u32 hash; @@ -273,13 +267,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, qcount = dev->tc_to_txq[tc].count; } - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = __flow_hash_1word(hash); - - return (u16) (((u64) hash * qcount) >> 32) + qoffset; + return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); @@ -351,17 +339,10 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (map) { if (map->len == 1) queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->hash; - hash = __flow_hash_1word(hash); + else queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } + ((u64)skb_get_hash(skb) * map->len) >> 32]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } -- cgit From 19469a873bafd4e65daef3597db2bd724c1b03c9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:01 -0700 Subject: flow_dissector: Use IPv6 flow label in flow_dissector This patch implements the receive side to support RFC 6438 which is to use the flow label as an ECMP hash. If an IPv6 flow label is set in a packet we can use this as input for computing an L4-hash. There should be no need to parse any transport headers in this case. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 62d1cb624f53..c5f3912dad4c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -80,6 +80,8 @@ ip: case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; + __be32 flow_label; + ipv6: iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph) @@ -89,6 +91,21 @@ ipv6: flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + + flow_label = ip6_flowlabel(iph); + if (flow_label) { + /* Awesome, IPv6 packet has a flow label so we can + * use that to represent the ports without any + * further dissection. + */ + flow->n_proto = proto; + flow->ip_proto = ip_proto; + flow->ports = flow_label; + flow->thoff = (u16)nhoff; + + return true; + } + break; } case htons(ETH_P_8021AD): -- cgit From a3b18ddb9cc1056eea24e3edc1828cfb3fd0726f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:17 -0700 Subject: net: Only do flow_dissector hash computation once per packet Add sw_hash flag to skbuff to indicate that skb->hash was computed from flow_dissector. This flag is checked in skb_get_hash to avoid repeatedly trying to compute the hash (ie. in the case that no L4 hash can be computed). Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c5f3912dad4c..5f362c1d0332 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -256,6 +256,8 @@ void __skb_get_hash(struct sk_buff *skb) if (keys.ports) skb->l4_hash = 1; + skb->sw_hash = 1; + skb->hash = __flow_hash_from_keys(&keys); } EXPORT_SYMBOL(__skb_get_hash); -- cgit From 9f12fbe603f7ae346b2b46008e325f0c9a68e55d Mon Sep 17 00:00:00 2001 From: Zi Shen Lim Date: Thu, 3 Jul 2014 07:56:54 -0700 Subject: net: filter: move load_pointer() into filter.h load_pointer() is already a static inline function. Let's move it into filter.h so BPF JIT implementations can reuse this function. Since we're exporting this function, let's also rename it to bpf_load_pointer() for clarity. Signed-off-by: Zi Shen Lim Reviewed-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 1dbf6462f766..87af1e3e56c0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -84,15 +84,6 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -static inline void *load_pointer(const struct sk_buff *skb, int k, - unsigned int size, void *buffer) -{ - if (k >= 0) - return skb_header_pointer(skb, k, size, buffer); - - return bpf_internal_load_pointer_neg_helper(skb, k, size); -} - /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff @@ -537,7 +528,7 @@ load_word: * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness */ - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); if (likely(ptr != NULL)) { BPF_R0 = get_unaligned_be32(ptr); CONT; @@ -547,7 +538,7 @@ load_word: LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ off = IMM; load_half: - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); if (likely(ptr != NULL)) { BPF_R0 = get_unaligned_be16(ptr); CONT; @@ -557,7 +548,7 @@ load_half: LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ off = IMM; load_byte: - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); if (likely(ptr != NULL)) { BPF_R0 = *(u8 *)ptr; CONT; -- cgit From efa95b01da18ad22af62f6d99a3243f3be8fd264 Mon Sep 17 00:00:00 2001 From: david decotigny Date: Tue, 8 Jul 2014 15:14:41 -0700 Subject: netpoll: fix use after free After a bonding master reclaims the netpoll info struct, slaves could still hold a pointer to the reclaimed data. This patch fixes it: as soon as netpoll_async_cleanup is called for a slave (eg. when un-enslaved), we make sure that this slave doesn't point to the data. Signed-off-by: David Decotigny Signed-off-by: David S. 
Miller --- net/core/netpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e33937fb32a0..907fb5e36c02 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -822,7 +822,8 @@ void __netpoll_cleanup(struct netpoll *np) RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); - } + } else + RCU_INIT_POINTER(np->dev->npinfo, NULL); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); -- cgit From 5d5eacb34c9e1fdc0a47b885d832eaa4de860dc7 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 10 Jul 2014 07:01:58 -0400 Subject: bridge: fdb dumping takes a filter device Dumping a bridge fdb dumps every fdb entry held. With this change we are going to filter on a selected bridge port. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 27acaf7ff6d7..90a906e7ac26 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2517,6 +2517,7 @@ skip: int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, + struct net_device *filter_dev, int idx) { int err; @@ -2547,13 +2548,15 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) br_dev = netdev_master_upper_dev_get(dev); ops = br_dev->netdev_ops; if (ops->ndo_fdb_dump) - idx = ops->ndo_fdb_dump(skb, cb, dev, idx); + idx = ops->ndo_fdb_dump(skb, cb, dev, NULL, + idx); } if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); + idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + idx); else - idx = ndo_dflt_fdb_dump(skb, cb, dev, idx); + idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); } rcu_read_unlock(); -- cgit From 5e6d243587990a588143b9da3974833649595587 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 10 Jul 2014 07:01:59 -0400 Subject: bridge: netlink dump interface at par with brctl Actually better than brctl showmacs because we can filter by bridge port in the kernel. The current bridge netlink interface doesn't scale when you have many bridges each with large fdbs or even bridges with many bridge ports. And now for the science non-fiction novel you have all been waiting for.. //lets see what bridge ports we have root@moja-1:/configs/may30-iprt/bridge# ./bridge link show 8: eth1 state DOWN : mtu 1500 master br0 state disabled priority 32 cost 19 17: sw1-p1 state DOWN : mtu 1500 master br0 state disabled priority 32 cost 100 // show all..
root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show 33:33:00:00:00:01 dev bond0 self permanent 33:33:00:00:00:01 dev dummy0 self permanent 33:33:00:00:00:01 dev ifb0 self permanent 33:33:00:00:00:01 dev ifb1 self permanent 33:33:00:00:00:01 dev eth0 self permanent 01:00:5e:00:00:01 dev eth0 self permanent 33:33:ff:22:01:01 dev eth0 self permanent 02:00:00:12:01:02 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:05 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:07 dev eth1 self permanent 33:33:00:00:00:01 dev eth1 self permanent 33:33:00:00:00:01 dev gretap0 self permanent da:ac:46:27:d9:53 dev sw1-p1 vlan 0 master br0 permanent 33:33:00:00:00:01 dev sw1-p1 self permanent //filter by bridge root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show br br0 02:00:00:12:01:02 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:05 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:07 dev eth1 self permanent 33:33:00:00:00:01 dev eth1 self permanent da:ac:46:27:d9:53 dev sw1-p1 vlan 0 master br0 permanent 33:33:00:00:00:01 dev sw1-p1 self permanent // bridge sw1 has no ports attached.. root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show br sw1 //filter by port root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show brport eth1 02:00:00:12:01:02 vlan 0 master br0 permanent 00:17:42:8a:b4:05 vlan 0 master br0 permanent 00:17:42:8a:b4:07 self permanent 33:33:00:00:00:01 self permanent // filter by port + bridge root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show br br0 brport sw1-p1 da:ac:46:27:d9:53 vlan 0 master br0 permanent 33:33:00:00:00:01 self permanent // for shits and giggles (as they say in New Brunswick), lets // change the mac that br0 uses // Note: a magical fdb entry with no brport is added ... root@moja-1:/configs/may30-iprt/bridge# ip link set dev br0 address 02:00:00:12:01:04 // lets see if we can see the unicorn .. root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show 33:33:00:00:00:01 dev bond0 self permanent 33:33:00:00:00:01 dev dummy0 self permanent 33:33:00:00:00:01 dev ifb0 self permanent 33:33:00:00:00:01 dev ifb1 self permanent 33:33:00:00:00:01 dev eth0 self permanent 01:00:5e:00:00:01 dev eth0 self permanent 33:33:ff:22:01:01 dev eth0 self permanent 02:00:00:12:01:02 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:05 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:07 dev eth1 self permanent 33:33:00:00:00:01 dev eth1 self permanent 33:33:00:00:00:01 dev gretap0 self permanent 02:00:00:12:01:04 dev br0 vlan 0 master br0 permanent <=== there it is da:ac:46:27:d9:53 dev sw1-p1 vlan 0 master br0 permanent 33:33:00:00:00:01 dev sw1-p1 self permanent //can we see it if we filter by bridge? root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show br br0 02:00:00:12:01:02 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:05 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:07 dev eth1 self permanent 33:33:00:00:00:01 dev eth1 self permanent 02:00:00:12:01:04 dev br0 vlan 0 master br0 permanent <=== there it is da:ac:46:27:d9:53 dev sw1-p1 vlan 0 master br0 permanent 33:33:00:00:00:01 dev sw1-p1 self permanent Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 74 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 90a906e7ac26..1f8a59e02c48 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2535,30 +2535,72 @@ EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0; - struct net *net = sock_net(skb->sk); struct net_device *dev; + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *bdev = NULL; + struct net_device *br_dev = NULL; + const struct net_device_ops *ops = NULL; + const struct net_device_ops *cops = NULL; + struct ifinfomsg *ifm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + int brport_idx = 0; + int br_idx = 0; + int idx = 0; + + if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, + ifla_policy) == 0) { + if (tb[IFLA_MASTER]) + br_idx = nla_get_u32(tb[IFLA_MASTER]); + } + + brport_idx = ifm->ifi_index; + + if (br_idx) { + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) + return -ENODEV; + + ops = br_dev->netdev_ops; + bdev = br_dev; + } + + for_each_netdev(net, dev) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; + + if (!br_idx) { /* user did not specify a specific bridge */ + if (dev->priv_flags & IFF_BRIDGE_PORT) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + + bdev = dev; + } else { + if (dev != br_dev && + !(dev->priv_flags & IFF_BRIDGE_PORT)) + continue; + + if (br_dev != netdev_master_upper_dev_get(dev) && + !(dev->priv_flags & IFF_EBRIDGE)) + continue; + + bdev = br_dev; + cops = ops; + } - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_BRIDGE_PORT) { - struct net_device *br_dev; - const struct net_device_ops *ops; - - br_dev = netdev_master_upper_dev_get(dev); - ops = br_dev->netdev_ops; - if (ops->ndo_fdb_dump) - idx = ops->ndo_fdb_dump(skb, cb, dev, NULL, - idx); + if (cops && cops->ndo_fdb_dump) + idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + idx); } + idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, bdev, dev, idx); - else - idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + + cops = NULL; } - rcu_read_unlock(); cb->args[0] = idx; return skb->len; -- cgit From ec31a05c4dfa95149b1754d9de92831a5a95c636 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 12 Jul 2014 15:49:16 +0200 Subject: net: filter: sk_chk_filter() no longer mangles filter Add const attribute to filter argument to make clear it is no longer modified. Signed-off-by: Eric Dumazet Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 87af1e3e56c0..b90ae7fb3b89 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1085,7 +1085,7 @@ err: * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. 
*/ -static int check_load_and_stores(struct sock_filter *filter, int flen) +static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; @@ -1218,7 +1218,7 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(struct sock_filter *filter, unsigned int flen) +int sk_chk_filter(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; @@ -1228,7 +1228,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { - struct sock_filter *ftest = &filter[pc]; + const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) -- cgit From 685343fc3ba61a1f6eef361b786601123db16c28 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:22 +0200 Subject: net: add name_assign_type netdev attribute Based on a patch by David Herrmann. The name_assign_type attribute gives a hint about where the interface name of a given net-device comes from. These values are currently defined: NET_NAME_ENUM: The ifname is provided by the kernel with an enumerated suffix, typically based on order of discovery. Names may be reused and unpredictable. NET_NAME_PREDICTABLE: The ifname has been assigned by the kernel in a predictable way that is guaranteed to avoid reuse and always be the same for a given device. Examples include statically created devices like the loopback device and names deduced from hardware properties (including being given explicitly by the firmware). Names depending on the order of discovery, or in any other way on the existence of other devices, must not be marked as PREDICTABLE. NET_NAME_USER: The ifname was provided by user-space during net-device setup. NET_NAME_RENAMED: The net-device has been renamed from userspace. Once this type is set, it cannot change again. NET_NAME_UNKNOWN: This is an internal placeholder to indicate that we haven't yet categorized the name. It will not be exposed to userspace, rather -EINVAL is returned. The aim of these patches is to improve user-space renaming of interfaces. As a general rule, userspace must rename interfaces to guarantee that names stay the same every time a given piece of hardware appears (at boot, or when attaching it). However, there are several situations where userspace should not perform the renaming, and that depends both on the policy of the local admin and, crucially, on the nature of the current interface name. If an interface was created in response to a userspace request, and userspace already provided a name, we most probably want to leave that name alone. The main instance of this is wifi-P2P devices created over nl80211, which currently have a long-standing bug where they are getting renamed by udev. We label such names NET_NAME_USER. If an interface, unbeknown to us, has already been renamed from userspace, we most probably want to leave that alone as well. This will typically happen when third-party plugins (for instance to udev, but the interface is generic so could be from anywhere) rename the interface without informing udev about it. A typical situation is when you switch root from an installer or an initrd to the real system and the new instance of udev does not know what happened before the switch. These types of problems have caused repeated issues in the past.
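(Editor's illustration — a hypothetical userspace policy check showing how a udev-like tool might consult the new attribute; it assumes the uapi enum order NET_NAME_UNKNOWN=0, NET_NAME_ENUM=1, NET_NAME_PREDICTABLE=2, NET_NAME_USER=3, NET_NAME_RENAMED=4, and relies on the read failing with -EINVAL for NET_NAME_UNKNOWN, as the show handler in the diff below implements.)

#include <stdio.h>

/* Rename only kernel-enumerated names (ethX, wlanY); leave
 * USER/RENAMED/PREDICTABLE names alone. */
static int should_rename(const char *ifname)
{
	char path[256];
	int type = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/class/net/%s/name_assign_type", ifname);
	f = fopen(path, "r");
	if (!f)
		return 0;	/* attribute missing or unreadable */
	if (fscanf(f, "%d", &type) != 1)
		type = -1;	/* read failed: NET_NAME_UNKNOWN */
	fclose(f);
	return type == 1;	/* NET_NAME_ENUM (assumed value) */
}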
To solve this, once an interface has been renamed, its name is labelled NET_NAME_RENAMED. In many cases, the kernel is actually able to name interfaces in such a way that there is no need for userspace to rename them. This is the case when the enumeration order of devices, or in fact any other (non-parent) device on the system, cannot influence the name of the interface. Examples include statically created devices, or any naming schemes based on hardware properties of the interface. In this case the admin may prefer to use the kernel-provided names, and to make that possible we label such names NET_NAME_PREDICTABLE. We want the kernel to have the possibility of performing predictable interface naming itself (and exposing to userspace that it has), as the information necessary for a proper naming scheme for a certain class of devices may not be exposed to userspace. The case where renaming is almost certainly desired is when the kernel has given the interface a name using global device enumeration based on order of discovery (ethX, wlanY, etc). These naming schemes are labelled NET_NAME_ENUM. Lastly, a fallback is left as NET_NAME_UNKNOWN, to indicate that a driver has not yet been ported. This is mostly useful as a transitional measure, allowing us to label the various naming schemes bit by bit. v8: minor documentation fixes v9: move comment to the right commit Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Reviewed-by: Kay Sievers Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1cac29ebb05b..7752f2ad49a5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -112,6 +112,25 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); +static ssize_t format_name_assign_type(const struct net_device *net, char *buf) +{ + return sprintf(buf, fmt_dec, net->name_assign_type); +} + +static ssize_t name_assign_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (net->name_assign_type != NET_NAME_UNKNOWN) + ret = netdev_show(dev, attr, buf, format_name_assign_type); + + return ret; +} +static DEVICE_ATTR_RO(name_assign_type); + /* use same locking rules as GIFHWADDR ioctl's */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -387,6 +406,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, + &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, -- cgit From 238fa3623a5709d29673ed78ff8e714d040fbb89 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:23 +0200 Subject: net: set name assign type for renamed devices Based on a patch from David Herrmann. This is the only place devices can be renamed. v9: restore reverse-christmas-tree order of local variables Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Signed-off-by: David S.
Miller --- net/core/dev.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6e2a2cd82321..38793fb84a35 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1082,6 +1082,7 @@ static int dev_get_valid_name(struct net *net, */ int dev_change_name(struct net_device *dev, const char *newname) { + unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; @@ -1109,10 +1110,14 @@ int dev_change_name(struct net_device *dev, const char *newname) return err; } + old_assign_type = dev->name_assign_type; + dev->name_assign_type = NET_NAME_RENAMED; + rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; write_seqcount_end(&devnet_rename_seq); return ret; } @@ -1141,6 +1146,8 @@ rollback: write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; + old_assign_type = NET_NAME_RENAMED; goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", -- cgit From c835a677331495cf137a7f8a023463afd9f032f8 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:24 +0200 Subject: net: set name_assign_type in alloc_netdev() Extend alloc_netdev{,_mq{,s}}() to take name_assign_type as argument, and convert all users to pass NET_NAME_UNKNOWN. Coccinelle patch: @@ expression sizeof_priv, name, setup, txqs, rxqs, count; @@ ( -alloc_netdev_mqs(sizeof_priv, name, setup, txqs, rxqs) +alloc_netdev_mqs(sizeof_priv, name, NET_NAME_UNKNOWN, setup, txqs, rxqs) | -alloc_netdev_mq(sizeof_priv, name, setup, count) +alloc_netdev_mq(sizeof_priv, name, NET_NAME_UNKNOWN, setup, count) | -alloc_netdev(sizeof_priv, name, setup) +alloc_netdev(sizeof_priv, name, NET_NAME_UNKNOWN, setup) ) v9: move comments here from the wrong commit Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Signed-off-by: David S. Miller --- net/core/dev.c | 13 ++++++++----- net/core/rtnetlink.c | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 38793fb84a35..2c98f10ee62a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6441,17 +6441,19 @@ void netdev_freemem(struct net_device *dev) /** * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. 
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { @@ -6530,6 +6532,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #endif strcpy(dev->name, name); + dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1f8a59e02c48..599864322de8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1828,8 +1828,8 @@ struct net_device *rtnl_create_link(struct net *net, num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, - num_tx_queues, num_rx_queues); + dev = alloc_netdev_mqs(ops->priv_size, ifname, NET_NAME_UNKNOWN, + ops->setup, num_tx_queues, num_rx_queues); if (!dev) goto err; -- cgit From 5517750f058edd111bcabe5e116056cc63b1f39c Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:25 +0200 Subject: net: rtnetlink - make create_link take name_assign_type This passes down NET_NAME_USER (or NET_NAME_ENUM) to alloc_netdev(), for any device created over rtnetlink. v9: restore reverse-christmas-tree order of local variables Signed-off-by: Tom Gundersen Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 599864322de8..e9918020dbc9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1810,7 +1810,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, - char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) + char *ifname, unsigned char name_assign_type, + const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; struct net_device *dev; @@ -1828,7 +1829,7 @@ struct net_device *rtnl_create_link(struct net *net, num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mqs(ops->priv_size, ifname, NET_NAME_UNKNOWN, + dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); if (!dev) goto err; @@ -1894,6 +1895,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct nlattr *linkinfo[IFLA_INFO_MAX+1]; + unsigned char name_assign_type = NET_NAME_USER; int err; #ifdef CONFIG_MODULES @@ -2046,14 +2048,16 @@ replay: if (!ops->setup) return -EOPNOTSUPP; - if (!ifname[0]) + if (!ifname[0]) { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } dest_net = rtnl_link_get_net(net, tb); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); - dev = rtnl_create_link(dest_net, ifname, ops, tb); + dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; -- cgit From aee944ddf893e14a148af0f4e857c11887660053 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 14 Jul 2014 18:30:56 +0200 Subject: pktgen: remove unnecessary break after goto Signed-off-by: Fabian Frederick Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5b05e364b8ae..8b849ddfef2e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -804,7 +804,6 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\t': case ' ': goto done_str; - break; default: break; } -- cgit From 4d3520cb529e520554906573e4be0efaa38050fe Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 14 Jul 2014 18:30:57 +0200 Subject: drop_monitor: remove unnecessary break after return Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index e70301eb7a4a..50f9a9db5792 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -289,10 +289,8 @@ static int net_dm_cmd_trace(struct sk_buff *skb, switch (info->genlhdr->cmd) { case NET_DM_CMD_START: return set_all_monitor_traces(TRACE_ON); - break; case NET_DM_CMD_STOP: return set_all_monitor_traces(TRACE_OFF); - break; } return -ENOTSUPP; -- cgit From a40e0a664bce465a3b8ad1d792153cef8ded9f7d Mon Sep 17 00:00:00 2001 From: françois romieu Date: Tue, 15 Jul 2014 23:55:35 +0200 Subject: net: remove open-coded skb_cow_head. Signed-off-by: Francois Romieu Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 138ab897de7d..239722af098d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2421,8 +2421,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, skb_warn_bad_offload(skb); - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_cow_head(skb, 0); + if (err < 0) return ERR_PTR(err); } -- cgit From c8a89c4a1d58230192cf7243520bf7f9899239f4 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 15 Jul 2014 15:15:20 -0700 Subject: rtnetlink: Drop unnecessary return value from ndo_dflt_fdb_del This change cleans up ndo_dflt_fdb_del to drop the ENOTSUPP return value since that isn't actually returned anywhere in the code. As a result we are able to drop a few lines by just defaulting this to -EINVAL. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e9918020dbc9..8d39071f32d7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2392,22 +2392,20 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, const unsigned char *addr) { - int err = -EOPNOTSUPP; + int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { pr_info("%s: FDB only supports static addresses\n", dev->name); - return -EINVAL; + return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); - else - err = -EINVAL; return err; } -- cgit From ccc7f4968a18b980994e622006b84e0195754390 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Thu, 17 Jul 2014 19:46:10 +0200 Subject: net: print net_device reg_state in netdev_* unless it's registered This way we'll always know in what status the device is, unless it's running normally (i.e. 
NETDEV_REGISTERED). Also, emit a warning once in case of a bad reg_state. CC: "David S. Miller" CC: Jason Baron CC: Eric Dumazet CC: Vlad Yasevich CC: stephen hemminger CC: Jerry Chu CC: Ben Hutchings CC: Joe Perches Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 239722af098d..81d61014fd9b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6950,12 +6950,14 @@ static int __netdev_printk(const char *level, const struct net_device *dev, if (dev && dev->dev.parent) { r = dev_printk_emit(level[1] - '0', dev->dev.parent, - "%s %s %s: %pV", + "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), - netdev_name(dev), vaf); + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + r = printk("%s%s%s: %pV", level, netdev_name(dev), + netdev_reg_state(dev), vaf); } else { r = printk("%s(NULL net_device): %pV", level, vaf); } -- cgit From 6fe82a39e583a50f28f03b294df79c9de9ec0de4 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Thu, 17 Jul 2014 20:33:32 +0200 Subject: net: print a notification on device rename Currently it's done silently (from the kernel side), and thus it might be hard to track the renames from the logs. Add a simple netdev_info() to report the rename, but only in case the previous name was valid. CC: "David S. Miller" CC: Eric Dumazet CC: Vlad Yasevich CC: stephen hemminger CC: Jerry Chu CC: Ben Hutchings CC: David Laight Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 81d61014fd9b..e52a3788d18d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1113,6 +1113,9 @@ int dev_change_name(struct net_device *dev, const char *newname) return err; } + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s\n", oldname); + old_assign_type = dev->name_assign_type; dev->name_assign_type = NET_NAME_RENAMED; -- cgit From 274f482d33a309c87096f2983601ceda2761094e Mon Sep 17 00:00:00 2001 From: Sorin Dumitru Date: Tue, 22 Jul 2014 21:16:51 +0300 Subject: sock: remove skb argument from sk_rcvqueues_full It hasn't been used since commit 0fd7bac ("net: relax rcvbuf limits"). Signed-off-by: Sorin Dumitru Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 026e01f70274..ca9b65199d28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -491,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) skb->dev = NULL; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } -- cgit From f5bffecda951b59d0d3cdd616d68952abc52bc40 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 22 Jul 2014 23:01:58 -0700 Subject: net: filter: split filter.c into two files BPF is used in several kernel components. This split creates a logical boundary between the generic eBPF core and the rest: kernel/bpf/core.c: eBPF interpreter; net/core/filter.c: classic->eBPF converter, classic verifiers, socket filters. This patch only moves functions. Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- net/core/filter.c | 511 ------------------------------------------------------ 1 file changed, 511 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index b90ae7fb3b89..1d0e9492e4fa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -45,45 +45,6 @@ #include #include -/* Registers */ -#define BPF_R0 regs[BPF_REG_0] -#define BPF_R1 regs[BPF_REG_1] -#define BPF_R2 regs[BPF_REG_2] -#define BPF_R3 regs[BPF_REG_3] -#define BPF_R4 regs[BPF_REG_4] -#define BPF_R5 regs[BPF_REG_5] -#define BPF_R6 regs[BPF_REG_6] -#define BPF_R7 regs[BPF_REG_7] -#define BPF_R8 regs[BPF_REG_8] -#define BPF_R9 regs[BPF_REG_9] -#define BPF_R10 regs[BPF_REG_10] - -/* Named registers */ -#define DST regs[insn->dst_reg] -#define SRC regs[insn->src_reg] -#define FP regs[BPF_REG_FP] -#define ARG1 regs[BPF_REG_ARG1] -#define CTX regs[BPF_REG_CTX] -#define IMM insn->imm - -/* No hurry in this branch - * - * Exported for the bpf jit load helper. - */ -void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) -{ - u8 *ptr = NULL; - - if (k >= SKF_NET_OFF) - ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) - ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) - return ptr; - - return NULL; -} - /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff @@ -126,451 +87,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); -/* Base function for offset calculation. Needs to go into .text section, - * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. - */ -noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return 0; -} - -/** - * __sk_run_filter - run a filter on a given context - * @ctx: buffer to run the filter on - * @insn: filter to apply - * - * Decode and apply filter instructions to the skb->data. Return length to - * keep, 0 for none. @ctx is the data we are operating on, @insn is the - * array of filter instructions. - */ -static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) -{ - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG], tmp; - static const void *jumptable[256] = { - [0 ... 255] = &&default_label, - /* Now overwrite non-defaults ... 
*/ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = 
&&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - }; - void *ptr; - int off; - -#define CONT ({ insn++; goto select_insn; }) -#define CONT_JMP ({ insn++; goto select_insn; }) - - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - -select_insn: - goto *jumptable[insn->code]; - - /* ALU */ -#define ALU(OPCODE, OP) \ - ALU64_##OPCODE##_X: \ - DST = DST OP SRC; \ - CONT; \ - ALU_##OPCODE##_X: \ - DST = (u32) DST OP (u32) SRC; \ - CONT; \ - ALU64_##OPCODE##_K: \ - DST = DST OP IMM; \ - CONT; \ - ALU_##OPCODE##_K: \ - DST = (u32) DST OP (u32) IMM; \ - CONT; - - ALU(ADD, +) - ALU(SUB, -) - ALU(AND, &) - ALU(OR, |) - ALU(LSH, <<) - ALU(RSH, >>) - ALU(XOR, ^) - ALU(MUL, *) -#undef ALU - ALU_NEG: - DST = (u32) -DST; - CONT; - ALU64_NEG: - DST = -DST; - CONT; - ALU_MOV_X: - DST = (u32) SRC; - CONT; - ALU_MOV_K: - DST = (u32) IMM; - CONT; - ALU64_MOV_X: - DST = SRC; - CONT; - ALU64_MOV_K: - DST = IMM; - CONT; - ALU64_ARSH_X: - (*(s64 *) &DST) >>= SRC; - CONT; - ALU64_ARSH_K: - (*(s64 *) &DST) >>= IMM; - CONT; - ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; - tmp = DST; - DST = do_div(tmp, SRC); - CONT; - ALU_MOD_X: - if (unlikely(SRC == 0)) - return 0; - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); - CONT; - ALU64_MOD_K: - tmp = DST; - DST = do_div(tmp, IMM); - CONT; - ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); - CONT; - ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; - do_div(DST, SRC); - CONT; - ALU_DIV_X: - if (unlikely(SRC == 0)) - return 0; - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; - CONT; - ALU64_DIV_K: - do_div(DST, IMM); - CONT; - ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; - CONT; - ALU_END_TO_BE: - switch (IMM) { - case 16: - DST = (__force u16) cpu_to_be16(DST); - break; - case 32: - DST = (__force u32) cpu_to_be32(DST); - break; - case 64: - DST = (__force u64) cpu_to_be64(DST); - break; - } - CONT; - ALU_END_TO_LE: - switch (IMM) { - case 16: - DST = (__force u16) cpu_to_le16(DST); - break; - case 32: - DST = (__force u32) cpu_to_le32(DST); - break; - case 64: - DST = (__force u64) cpu_to_le64(DST); - break; - } - CONT; - - /* CALL */ - JMP_CALL: - /* Function call scratches BPF_R1-BPF_R5 registers, - * preserves BPF_R6-BPF_R9, and stores return value - * into BPF_R0. 
- */ - BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, - BPF_R4, BPF_R5); - CONT; - - /* JMP */ - JMP_JA: - insn += insn->off; - CONT; - JMP_JEQ_X: - if (DST == SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JEQ_K: - if (DST == IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_X: - if (DST != SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_K: - if (DST != IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_X: - if (DST > SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_K: - if (DST > IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_X: - if (DST >= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_K: - if (DST >= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_X: - if (((s64) DST) > ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_K: - if (((s64) DST) > ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_X: - if (((s64) DST) >= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_K: - if (((s64) DST) >= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_X: - if (DST & SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_K: - if (DST & IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_EXIT: - return BPF_R0; - - /* STX and ST and LDX*/ -#define LDST(SIZEOP, SIZE) \ - STX_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ - CONT; \ - ST_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ - CONT; \ - LDX_MEM_##SIZEOP: \ - DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ - CONT; - - LDST(B, u8) - LDST(H, u16) - LDST(W, u32) - LDST(DW, u64) -#undef LDST - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); - CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are - * only appearing in the programs where ctx == - * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, sk_convert_filter() saves it in BPF_R6, - * internal BPF verifier will check that BPF_R6 == - * ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. 
- * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; - - default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); - return 0; -} - /* Helper to find the offset of pkt_type in sk_buff structure. We want * to make sure its still a 3bit field starting at a byte boundary; * taken from arch/x86/net/bpf_jit_comp.c. @@ -1455,33 +971,6 @@ out_err: return ERR_PTR(err); } -void __weak bpf_int_jit_compile(struct sk_filter *prog) -{ -} - -/** - * sk_filter_select_runtime - select execution runtime for BPF program - * @fp: sk_filter populated with internal BPF program - * - * try to JIT internal BPF program, if JIT is not available select interpreter - * BPF program will be executed via SK_RUN_FILTER() macro - */ -void sk_filter_select_runtime(struct sk_filter *fp) -{ - fp->bpf_func = (void *) __sk_run_filter; - - /* Probe if internal BPF can be JITed */ - bpf_int_jit_compile(fp); -} -EXPORT_SYMBOL_GPL(sk_filter_select_runtime); - -/* free internal BPF program */ -void sk_filter_free(struct sk_filter *fp) -{ - bpf_jit_free(fp); -} -EXPORT_SYMBOL_GPL(sk_filter_free); - static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, struct sock *sk) { -- cgit From 2695fb552cbef1029aa025a98acb80cc51d66de5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 24 Jul 2014 16:38:21 -0700 Subject: net: filter: rename 'struct sock_filter_int' into 'struct bpf_insn' eBPF is used by socket filtering, seccomp and soon by tracing and exposed to userspace, therefore 'sock_filter_int' name is not accurate. Rename it to 'bpf_insn' Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/core/filter.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 1d0e9492e4fa..f3b2d5e9fe5f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -174,9 +174,9 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) } static bool convert_bpf_extensions(struct sock_filter *fp, - struct sock_filter_int **insnp) + struct bpf_insn **insnp) { - struct sock_filter_int *insn = *insnp; + struct bpf_insn *insn = *insnp; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -326,7 +326,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); + * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * sk_convert_filter(old_prog, old_len, new_prog, &new_len); * * User BPF's register A is mapped to our BPF register 6, user BPF @@ -336,10 +336,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * ctx == 'struct seccomp_data *'. */ int sk_convert_filter(struct sock_filter *prog, int len, - struct sock_filter_int *new_prog, int *new_len) + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; - struct sock_filter_int *new_insn; + struct bpf_insn *new_insn; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; @@ -365,8 +365,8 @@ do_pass: new_insn++; for (i = 0; i < len; fp++, i++) { - struct sock_filter_int tmp_insns[6] = { }; - struct sock_filter_int *insn = tmp_insns; + struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - new_prog; @@ -913,7 +913,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, * representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != - sizeof(struct sock_filter_int)); + sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd @@ -945,7 +945,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, fp->len = new_len; - /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ + /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); if (err) /* 2nd sk_convert_filter() can fail only if it fails -- cgit From 6b53dafe23fd1f1228c7dd9b8a1323e757966160 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 23 Jul 2014 16:09:10 -0700 Subject: net: do not name the pointer to struct net_device net "net" is normally reserved for a struct net *; a pointer to struct net_device should be named either "dev" or "ndev", etc. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- net/core/net-sysfs.c | 142 +++++++++++++++++++++++++-------------------------- 1 file changed, 71 insertions(+), 71 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7752f2ad49a5..9dd06699b09c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -43,12 +43,12 @@ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = (*format)(net, buf); + if (dev_isalive(ndev)) + ret = (*format)(ndev, buf); read_unlock(&dev_base_lock); return ret; @@ -56,9 +56,9 @@ static ssize_t netdev_show(const struct device *dev, /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ -static ssize_t format_##field(const struct net_device *net, char *buf) \ +static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sprintf(buf, format_string, net->field); \ + return sprintf(buf, format_string, dev->field); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -112,19 +112,19 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); -static ssize_t format_name_assign_type(const struct net_device *net, char *buf) +static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sprintf(buf, fmt_dec, net->name_assign_type); + return sprintf(buf, fmt_dec, dev->name_assign_type); } static ssize_t name_assign_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (net->name_assign_type != NET_NAME_UNKNOWN) + if (ndev->name_assign_type != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; @@ -135,12 +135,12 @@ static DEVICE_ATTR_RO(name_assign_type); static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); + if (dev_isalive(ndev)) + ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); read_unlock(&dev_base_lock); return ret; } @@ -149,18 +149,18 @@ static DEVICE_ATTR_RO(address); static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); - if (dev_isalive(net)) - return sysfs_format_mac(buf, net->broadcast, net->addr_len); + struct net_device *ndev = to_net_dev(dev); + if (dev_isalive(ndev)) + return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); return -EINVAL; } static DEVICE_ATTR_RO(broadcast); -static int change_carrier(struct net_device *net, unsigned long new_carrier) +static int change_carrier(struct net_device *dev, unsigned long new_carrier) { - if (!netif_running(net)) + if (!netif_running(dev)) return -EINVAL; - return dev_change_carrier(net, (bool) new_carrier); + return dev_change_carrier(dev, (bool) new_carrier); } static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, @@ -284,9 +284,9 @@ static DEVICE_ATTR_RO(carrier_changes); /* read-write 
attributes */ -static int change_mtu(struct net_device *net, unsigned long new_mtu) +static int change_mtu(struct net_device *dev, unsigned long new_mtu) { - return dev_set_mtu(net, (int) new_mtu); + return dev_set_mtu(dev, (int) new_mtu); } static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, @@ -296,9 +296,9 @@ static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(mtu, fmt_dec); -static int change_flags(struct net_device *net, unsigned long new_flags) +static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(net, (unsigned int) new_flags); + return dev_change_flags(dev, (unsigned int) new_flags); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, @@ -308,9 +308,9 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(flags, fmt_hex); -static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - net->tx_queue_len = new_len; + dev->tx_queue_len = new_len; return 0; } @@ -363,9 +363,9 @@ static ssize_t ifalias_show(struct device *dev, } static DEVICE_ATTR_RW(ifalias); -static int change_group(struct net_device *net, unsigned long new_group) +static int change_group(struct net_device *dev, unsigned long new_group) { - dev_set_group(net, (int) new_group); + dev_set_group(dev, (int) new_group); return 0; } @@ -796,20 +796,20 @@ static struct kobj_type rx_queue_ktype = { .namespace = rx_queue_namespace }; -static int rx_queue_add_kobject(struct net_device *net, int index) +static int rx_queue_add_kobject(struct net_device *dev, int index) { - struct netdev_rx_queue *queue = net->_rx + index; + struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) goto exit; - if (net->sysfs_rx_queue_group) { - error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); + if (dev->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) goto exit; } @@ -825,18 +825,18 @@ exit: #endif /* CONFIG_SYSFS */ int -net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; #ifndef CONFIG_RPS - if (!net->sysfs_rx_queue_group) + if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = old_num; i < new_num; i++) { - error = rx_queue_add_kobject(net, i); + error = rx_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -844,10 +844,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - if (net->sysfs_rx_queue_group) - sysfs_remove_group(&net->_rx[i].kobj, - net->sysfs_rx_queue_group); - kobject_put(&net->_rx[i].kobj); + if (dev->sysfs_rx_queue_group) + sysfs_remove_group(&dev->_rx[i].kobj, + dev->sysfs_rx_queue_group); + kobject_put(&dev->_rx[i].kobj); } return error; @@ -1155,13 +1155,13 @@ static struct kobj_type netdev_queue_ktype = { .namespace = netdev_queue_namespace, }; -static int netdev_queue_add_kobject(struct net_device *net, int index) +static int netdev_queue_add_kobject(struct net_device *dev, int index) { - struct netdev_queue *queue = net->_tx + index; + struct netdev_queue 
*queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) @@ -1184,14 +1184,14 @@ exit: #endif /* CONFIG_SYSFS */ int -netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; for (i = old_num; i < new_num; i++) { - error = netdev_queue_add_kobject(net, i); + error = netdev_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -1199,7 +1199,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - struct netdev_queue *queue = net->_tx + i; + struct netdev_queue *queue = dev->_tx + i; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1213,25 +1213,25 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) #endif /* CONFIG_SYSFS */ } -static int register_queue_kobjects(struct net_device *net) +static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - net->queues_kset = kset_create_and_add("queues", - NULL, &net->dev.kobj); - if (!net->queues_kset) + dev->queues_kset = kset_create_and_add("queues", + NULL, &dev->dev.kobj); + if (!dev->queues_kset) return -ENOMEM; - real_rx = net->real_num_rx_queues; + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; - error = net_rx_queue_update_kobjects(net, 0, real_rx); + error = net_rx_queue_update_kobjects(dev, 0, real_rx); if (error) goto error; rxq = real_rx; - error = netdev_queue_update_kobjects(net, 0, real_tx); + error = netdev_queue_update_kobjects(dev, 0, real_tx); if (error) goto error; txq = real_tx; @@ -1239,24 +1239,24 @@ static int register_queue_kobjects(struct net_device *net) return 0; error: - netdev_queue_update_kobjects(net, txq, 0); - net_rx_queue_update_kobjects(net, rxq, 0); + netdev_queue_update_kobjects(dev, txq, 0); + net_rx_queue_update_kobjects(dev, rxq, 0); return error; } -static void remove_queue_kobjects(struct net_device *net) +static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - real_rx = net->real_num_rx_queues; + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; - net_rx_queue_update_kobjects(net, real_rx, 0); - netdev_queue_update_kobjects(net, real_tx, 0); + net_rx_queue_update_kobjects(dev, real_rx, 0); + netdev_queue_update_kobjects(dev, real_tx, 0); #ifdef CONFIG_SYSFS - kset_unregister(net->queues_kset); + kset_unregister(dev->queues_kset); #endif } @@ -1349,13 +1349,13 @@ static struct class net_class = { /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ -void netdev_unregister_kobject(struct net_device * net) +void netdev_unregister_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); + struct device *dev = &(ndev->dev); kobject_get(&dev->kobj); - remove_queue_kobjects(net); + remove_queue_kobjects(ndev); pm_runtime_set_memalloc_noio(dev, false); @@ -1363,18 +1363,18 @@ void netdev_unregister_kobject(struct net_device * net) } /* Create sysfs entries for network device. 
*/ -int netdev_register_kobject(struct net_device *net) +int netdev_register_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); - const struct attribute_group **groups = net->sysfs_groups; + struct device *dev = &(ndev->dev); + const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; device_initialize(dev); dev->class = &net_class; - dev->platform_data = net; + dev->platform_data = ndev; dev->groups = groups; - dev_set_name(dev, "%s", net->name); + dev_set_name(dev, "%s", ndev->name); #ifdef CONFIG_SYSFS /* Allow for a device specific group */ @@ -1384,10 +1384,10 @@ int netdev_register_kobject(struct net_device *net) *groups++ = &netstat_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) - if (net->ieee80211_ptr) + if (ndev->ieee80211_ptr) *groups++ = &wireless_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) - else if (net->wireless_handlers) + else if (ndev->wireless_handlers) *groups++ = &wireless_group; #endif #endif @@ -1397,7 +1397,7 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; - error = register_queue_kobjects(net); + error = register_queue_kobjects(ndev); if (error) { device_del(dev); return error; -- cgit From 80019d310f9fb4f8c9eeda0a5d76144ad3132fdf Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 30 Jul 2014 02:31:08 +0200 Subject: net: Remove unlikely() for WARN_ON() conditions No need for the unlikely(): WARN_ON() and BUG_ON() internally use unlikely() on the condition.
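For context, the generic fallback definition (sketched from include/asm-generic/bug.h of this era; the exact expansion varies by architecture and config) already wraps the test in unlikely(), so an outer unlikely(WARN_ON(...)) is redundant:

	#define WARN_ON(condition) ({					\
		int __ret_warn_on = !!(condition);			\
		if (unlikely(__ret_warn_on))				\
			__WARN();					\
		unlikely(__ret_warn_on);				\
	})

Signed-off-by: Thomas Graf Signed-off-by: David S.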
Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e1b7cfaccd65..b370230fe1d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2326,7 +2326,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) */ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { if (vlan_depth) { - if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) + if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; vlan_depth -= VLAN_HLEN; } else { -- cgit From 34c5bd66e5ed2268bcb917b4cbdd6317023eada4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 29 Jul 2014 17:36:28 +0200 Subject: net: filter: don't release unattached filter through call_rcu() sk_unattached_filter_destroy() does not always need to release the filter object via rcu. Since this filter is never attached to the socket, the caller should be responsible for releasing the filter in a safe way, which may not necessarily imply rcu. This is a short summary of clients of this function: 1) xt_bpf.c and cls_bpf.c use the bpf matchers from rules; these rules are removed from the packet path before the filter is released. Thus, the framework makes sure the filter is safely removed. 2) In the ppp driver, the ppp_lock ensures serialization between the xmit and filter attachment/detachment path. This doesn't use rcu so deferred release via rcu makes no sense. 3) In the isdn/ppp driver, it is called from isdn_ppp_release() and isdn_ppp_ioctl(). This driver uses mutexes and spinlocks, no rcu. Thus, deferred rcu makes no sense to me either; the deferred releases may just be masking the effects of a wrong locking strategy, which should be fixed in the driver itself. 4) In the team driver, this is the only place where the rcu synchronization with unattached filter is used. Therefore, this patch introduces synchronize_rcu() which is called from the genetlink path to make sure the filter doesn't go away while packets are still walking over it. I think we can revisit this once struct bpf_prog (that only wraps specific bpf code bits) is in place, then add some specific struct rcu_head in the scope of the team driver if Jiri thinks this is needed. Deferred rcu release for unattached filters was originally introduced in 302d663 ("filter: Allow to create sk-unattached filters").
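The resulting teardown pattern for case 4 then looks roughly like the sketch below (the priv->fp pointer and priv->lock are hypothetical stand-ins, not lifted verbatim from the team driver):

	old_fp = rcu_dereference_protected(priv->fp,
					   lockdep_is_held(&priv->lock));
	rcu_assign_pointer(priv->fp, new_fp);
	synchronize_rcu();	/* no reader can still be running old_fp */
	if (old_fp)
		sk_unattached_filter_destroy(old_fp);

Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S.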
Miller --- net/core/filter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f3b2d5e9fe5f..42c1944b0c63 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -841,6 +841,12 @@ static void sk_release_orig_filter(struct sk_filter *fp) } } +static void __sk_filter_release(struct sk_filter *fp) +{ + sk_release_orig_filter(fp); + sk_filter_free(fp); +} + /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free @@ -849,8 +855,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - sk_release_orig_filter(fp); - sk_filter_free(fp); + __sk_filter_release(fp); } /** @@ -1050,7 +1055,7 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_create); void sk_unattached_filter_destroy(struct sk_filter *fp) { - sk_filter_release(fp); + __sk_filter_release(fp); } EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); -- cgit From fcdfe3a7fa4cb74391d42b6a26dc07c20dab1d82 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Thu, 31 Jul 2014 10:33:06 -0400 Subject: net: Correctly set segment mac_len in skb_segment(). When performing segmentation, the mac_len value is copied right out of the original skb. However, this value is not always set correctly (like when the packet is VLAN-tagged) and we'll end up copying a bad value. One way to demonstrate this is to configure a VM which tags packets internally and turn off VLAN acceleration on the forwarding bridge port. The packets show up corrupt like this: 16:18:24.985548 52:54:00:ab:be:25 > 52:54:00:26:ce:a3, ethertype 802.1Q (0x8100), length 1518: vlan 100, p 0, ethertype 0x05e0, 0x0000: 8cdb 1c7c 8cdb 0064 4006 b59d 0a00 6402 ...|...d@.....d. 0x0010: 0a00 6401 9e0d b441 0a5e 64ec 0330 14fa ..d....A.^d..0.. 0x0020: 29e3 01c9 f871 0000 0101 080a 000a e833)....q.........3 0x0030: 000f 8c75 6e65 7470 6572 6600 6e65 7470 ...unetperf.netp 0x0040: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp 0x0050: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp 0x0060: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp ... This also leads to awful throughput as GSO packets are dropped and cause retransmissions. The solution is to set the mac_len using the values already available in the new skb. We've already adjusted all of the header offsets, so we might as well correctly figure out the mac_len using skb_reset_mac_len(). After this change, packets are segmented correctly and performance is restored. CC: Eric Dumazet
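For reference, skb_reset_mac_len() (include/linux/skbuff.h) simply rederives the length from the offsets that skb_headers_offset_update() has just fixed up:

	static inline void skb_reset_mac_len(struct sk_buff *skb)
	{
		skb->mac_len = skb->network_header - skb->mac_header;
	}

Signed-off-by: Vlad Yasevich Signed-off-by: David S.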
Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a33033cbe2..58ff88edbefd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2976,9 +2976,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, tail = nskb; __copy_skb_header(nskb, head_skb); - nskb->mac_len = head_skb->mac_len; skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); + skb_reset_mac_len(nskb); skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, -- cgit From 278571baca2aecf5fb5cb5c8b002dbfa0a6c524c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:12 -0700 Subject: net: filter: simplify socket charging Attaching a bpf program to a socket involves several rounds of socket memory accounting, since the size of 'sk_filter' changes when classic BPF is converted to eBPF. Also, the common path of program creation has to deal with two ways of freeing the memory. Simplify the code by delaying socket charging until the program is ready and its size is known. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 87 +++++++++++++++++++++++-------------------------------- net/core/sock.c | 9 ++++-- 2 files changed, 44 insertions(+), 52 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 42c1944b0c63..5a6aeb1d40b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -872,41 +872,30 @@ static void sk_filter_release(struct sk_filter *fp) void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { - atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); - sk_filter_release(fp); -} + u32 filter_size = sk_filter_size(fp->len); -void sk_filter_charge(struct sock *sk, struct sk_filter *fp) -{ - atomic_inc(&fp->refcnt); - atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); + atomic_sub(filter_size, &sk->sk_omem_alloc); + sk_filter_release(fp); } -static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, - struct sock *sk, - unsigned int len) +/* try to charge the socket memory if there is space available * return true on success */ +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - struct sk_filter *fp_new; - - if (sk == NULL) - return krealloc(fp, len, GFP_KERNEL); - - fp_new = sock_kmalloc(sk, len, GFP_KERNEL); - if (fp_new) { - *fp_new = *fp; - /* As we're keeping orig_prog in fp_new along, - * we need to make sure we're not evicting it - * from the old fp. - */ - fp->orig_prog = NULL; - sk_filter_uncharge(sk, fp); + u32 filter_size = sk_filter_size(fp->len); + + /* same check as in sock_kmalloc() */ + if (filter_size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { + atomic_inc(&fp->refcnt); + atomic_add(filter_size, &sk->sk_omem_alloc); + return true; } - - return fp_new; + return false; } -static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, - struct sock *sk) +static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) { struct sock_filter *old_prog; struct sk_filter *old_fp; @@ -938,7 +927,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len)); + fp = krealloc(old_fp, sk_filter_size(new_len), GFP_KERNEL); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one.
@@ -956,7 +945,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, /* 2nd sk_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released - * by __sk_migrate_realloc(). + * by krealloc(). */ goto out_err_free; @@ -968,16 +957,11 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, out_err_free: kfree(old_prog); out_err: - /* Rollback filter setup. */ - if (sk != NULL) - sk_filter_uncharge(sk, fp); - else - kfree(fp); + __sk_filter_release(fp); return ERR_PTR(err); } -static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, - struct sock *sk) +static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) { int err; @@ -986,10 +970,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, err = sk_chk_filter(fp->insns, fp->len); if (err) { - if (sk != NULL) - sk_filter_uncharge(sk, fp); - else - kfree(fp); + __sk_filter_release(fp); return ERR_PTR(err); } @@ -1002,7 +983,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, * internal BPF translation for the optimized interpreter. */ if (!fp->jited) - fp = __sk_migrate_filter(fp, sk); + fp = __sk_migrate_filter(fp); return fp; } @@ -1041,10 +1022,10 @@ int sk_unattached_filter_create(struct sk_filter **pfp, */ fp->orig_prog = NULL; - /* __sk_prepare_filter() already takes care of uncharging + /* __sk_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = __sk_prepare_filter(fp, NULL); + fp = __sk_prepare_filter(fp); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -1083,31 +1064,37 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL); + fp = kmalloc(sk_fsize, GFP_KERNEL); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { - sock_kfree_s(sk, fp, sk_fsize); + kfree(fp); return -EFAULT; } - atomic_set(&fp->refcnt, 1); fp->len = fprog->len; err = sk_store_orig_filter(fp, fprog); if (err) { - sk_filter_uncharge(sk, fp); + kfree(fp); return -ENOMEM; } - /* __sk_prepare_filter() already takes care of uncharging + /* __sk_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
*/ - fp = __sk_prepare_filter(fp, sk); + fp = __sk_prepare_filter(fp); if (IS_ERR(fp)) return PTR_ERR(fp); + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + __sk_filter_release(fp); + return -ENOMEM; + } + old_fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); diff --git a/net/core/sock.c b/net/core/sock.c index 134291d73fcd..a741163568fa 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1474,6 +1474,7 @@ static void sk_update_clone(const struct sock *sk, struct sock *newsk) struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk; + bool is_charged = true; newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); if (newsk != NULL) { @@ -1518,9 +1519,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) - sk_filter_charge(newsk, filter); + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning + */ + is_charged = sk_filter_charge(newsk, filter); - if (unlikely(xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; -- cgit From 009937e78a45553a86d26654f192b2fd9ebe289d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:13 -0700 Subject: net: filter: rename sk_filter_proglen -> bpf_classic_proglen trivial rename to better match semantics of macro Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 8 ++++---- net/core/sock_diag.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 5a6aeb1d40b8..d6cb287e4f59 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -813,7 +813,7 @@ EXPORT_SYMBOL(sk_chk_filter); static int sk_store_orig_filter(struct sk_filter *fp, const struct sock_fprog *fprog) { - unsigned int fsize = sk_filter_proglen(fprog); + unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); @@ -1001,7 +1001,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) int sk_unattached_filter_create(struct sk_filter **pfp, struct sock_fprog_kern *fprog) { - unsigned int fsize = sk_filter_proglen(fprog); + unsigned int fsize = bpf_classic_proglen(fprog); struct sk_filter *fp; /* Make sure new filter is there and in the right amounts. 
*/ @@ -1053,7 +1053,7 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; - unsigned int fsize = sk_filter_proglen(fprog); + unsigned int fsize = bpf_classic_proglen(fprog); unsigned int sk_fsize = sk_filter_size(fprog->len); int err; @@ -1154,7 +1154,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; ret = -EFAULT; - if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog))) + if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a4216a4c9572..57d922320c59 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -69,7 +69,7 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, goto out; fprog = filter->orig_prog; - flen = sk_filter_proglen(fprog); + flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); if (attr == NULL) { -- cgit From 4df95ff488eb796aab9566652c250330179def17 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:14 -0700 Subject: net: filter: rename sk_chk_filter() -> bpf_check_classic() trivial rename to indicate that this function performs classic BPF checking Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index d6cb287e4f59..5740ea08a3ad 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,7 +18,7 @@ * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races. - * Kris Katterjohn - Added many additional checks in sk_chk_filter() + * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include @@ -721,7 +721,7 @@ static bool chk_code_allowed(u16 code_to_probe) } /** - * sk_chk_filter - verify socket filter code + * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(const struct sock_filter *filter, unsigned int flen) +int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; @@ -808,7 +808,7 @@ int sk_chk_filter(const struct sock_filter *filter, unsigned int flen) return -EINVAL; } -EXPORT_SYMBOL(sk_chk_filter); +EXPORT_SYMBOL(bpf_check_classic); static int sk_store_orig_filter(struct sk_filter *fp, const struct sock_fprog *fprog) @@ -968,7 +968,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) fp->bpf_func = NULL; fp->jited = 0; - err = sk_chk_filter(fp->insns, fp->len); + err = bpf_check_classic(fp->insns, fp->len); if (err) { __sk_filter_release(fp); return ERR_PTR(err); -- cgit From 8fb575ca396bc31d9fa99c26336e2432b41d1bfc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:15 -0700 Subject: net: filter: rename sk_convert_filter() -> bpf_convert_filter() to indicate that this function is converting classic BPF into eBPF and not related to sockets Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- net/core/filter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 5740ea08a3ad..6ac901613bee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -312,7 +312,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, } /** - * sk_convert_filter - convert filter program + * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: buffer where converted program will be stored @@ -322,12 +322,12 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * Conversion workflow: * * 1) First pass for calculating the new program length: - * sk_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); - * sk_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); * * User BPF's register A is mapped to our BPF register 6, user BPF * register X is mapped to BPF register 7; frame pointer is always @@ -335,8 +335,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * for socket filters: ctx == 'struct sk_buff *', for seccomp: * ctx == 'struct seccomp_data *'. */ -int sk_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) +int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; struct bpf_insn *new_insn; @@ -921,7 +921,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) } /* 1st pass: calculate the new program length. */ - err = sk_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); if (err) goto out_err_free; @@ -940,9 +940,9 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); if (err) - /* 2nd sk_convert_filter() can fail only if it fails + /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). 
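The hunks above keep bpf_convert_filter()'s documented two-pass contract intact: a first call with a NULL buffer computes the required eBPF length, then the caller allocates and calls again to do the actual remapping. A minimal sketch of a caller following that contract (the helper name is invented here, and error unwinding is trimmed):

#include <linux/filter.h>
#include <linux/slab.h>

static struct bpf_insn *convert_classic(struct sock_filter *old_prog, int old_len)
{
	struct bpf_insn *new_prog;
	int new_len = 0;

	/* 1st pass: only computes the required eBPF program length */
	if (bpf_convert_filter(old_prog, old_len, NULL, &new_len))
		return NULL;

	new_prog = kmalloc(sizeof(struct bpf_insn) * new_len, GFP_KERNEL);
	if (!new_prog)
		return NULL;

	/* 2nd pass: remaps instructions and recomputes jump offsets */
	if (bpf_convert_filter(old_prog, old_len, new_prog, &new_len)) {
		kfree(new_prog);
		return NULL;
	}
	return new_prog;
}

This is the same sequence that the migrate path in filter.c performs internally, except that it expands the existing program in place with krealloc().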
-- cgit From 7ae457c1e5b45a1b826fad9d62b32191d2bdcfdb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:16 -0700 Subject: net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets, which cleans up the 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains a __bpf_prog_release(struct bpf_prog *) helper function. Also perform related renames for the functions that work with 'struct bpf_prog *', since they're along the same lines: sk_filter_size -> bpf_prog_size sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller
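Before the diff, here is a rough end-to-end sketch of the renamed 'unattached' API, mirroring what the ptp_classifier hunk further down does; the demo_* identifiers are invented for illustration:

#include <linux/filter.h>
#include <linux/kernel.h>

static struct bpf_prog *demo_prog;

static int demo_filter_init(void)
{
	static struct sock_filter demo_insns[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	/* accept every packet */
	};
	struct sock_fprog_kern demo_fprog = {
		.len = ARRAY_SIZE(demo_insns),
		.filter = demo_insns,
	};

	/* checks, converts to eBPF if not JITed, and selects a runtime */
	return bpf_prog_create(&demo_prog, &demo_fprog);
}

/* per packet:  unsigned int verdict = BPF_PROG_RUN(demo_prog, skb);
 * on teardown: bpf_prog_destroy(demo_prog);
 */

SK_RUN_FILTER remains the entry point for the socket-attached case; it operates on struct sk_filter and dereferences its prog member internally.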
--- net/core/filter.c | 92 ++++++++++++++++++++++++++--------------------- net/core/ptp_classifier.c | 6 ++-- net/core/sock_diag.c | 2 +- 3 files changed, 56 insertions(+), 44 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6ac901613bee..d814b8a89d0f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -810,8 +810,8 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) } EXPORT_SYMBOL(bpf_check_classic); -static int sk_store_orig_filter(struct sk_filter *fp, - const struct sock_fprog *fprog) +static int bpf_prog_store_orig_filter(struct bpf_prog *fp, + const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; @@ -831,7 +831,7 @@ static int sk_store_orig_filter(struct sk_filter *fp, return 0; } -static void sk_release_orig_filter(struct sk_filter *fp) +static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; @@ -841,10 +841,16 @@ static void sk_release_orig_filter(struct sk_filter *fp) } } +static void __bpf_prog_release(struct bpf_prog *prog) +{ + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + static void __sk_filter_release(struct sk_filter *fp) { - sk_release_orig_filter(fp); - sk_filter_free(fp); + __bpf_prog_release(fp->prog); + kfree(fp); } /** @@ -872,7 +878,7 @@ static void sk_filter_release(struct sk_filter *fp) void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { - u32 filter_size = sk_filter_size(fp->len); + u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); @@ -883,7 +889,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) */ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - u32 filter_size = sk_filter_size(fp->len); + u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && @@ -895,10 +901,10 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) return false; } -static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) +static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; - struct sk_filter *old_fp; + struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; /* We are free to overwrite insns et al right here as it @@ -927,7 +933,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, sk_filter_size(new_len), GFP_KERNEL); + fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one.
@@ -949,7 +955,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) */ goto out_err_free; - sk_filter_select_runtime(fp); + bpf_prog_select_runtime(fp); kfree(old_prog); return fp; @@ -957,11 +963,11 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) out_err_free: kfree(old_prog); out_err: - __sk_filter_release(fp); + __bpf_prog_release(fp); return ERR_PTR(err); } -static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) { int err; @@ -970,7 +976,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) err = bpf_check_classic(fp->insns, fp->len); if (err) { - __sk_filter_release(fp); + __bpf_prog_release(fp); return ERR_PTR(err); } @@ -983,13 +989,13 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) * internal BPF translation for the optimized interpreter. */ if (!fp->jited) - fp = __sk_migrate_filter(fp); + fp = bpf_migrate_filter(fp); return fp; } /** - * sk_unattached_filter_create - create an unattached filter + * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * @@ -998,23 +1004,21 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ -int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog_kern *fprog) +int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); - struct sk_filter *fp; + struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); + fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); - atomic_set(&fp->refcnt, 1); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold @@ -1022,23 +1026,23 @@ int sk_unattached_filter_create(struct sk_filter **pfp, */ fp->orig_prog = NULL; - /* __sk_prepare_filter() already takes care of freeing + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
*/ - fp = __sk_prepare_filter(fp); + fp = bpf_prepare_filter(fp); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } -EXPORT_SYMBOL_GPL(sk_unattached_filter_create); +EXPORT_SYMBOL_GPL(bpf_prog_create); -void sk_unattached_filter_destroy(struct sk_filter *fp) +void bpf_prog_destroy(struct bpf_prog *fp) { - __sk_filter_release(fp); + __bpf_prog_release(fp); } -EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); +EXPORT_SYMBOL_GPL(bpf_prog_destroy); /** * sk_attach_filter - attach a socket filter @@ -1054,7 +1058,8 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; unsigned int fsize = bpf_classic_proglen(fprog); - unsigned int sk_fsize = sk_filter_size(fprog->len); + unsigned int bpf_fsize = bpf_prog_size(fprog->len); + struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) @@ -1064,29 +1069,36 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(sk_fsize, GFP_KERNEL); - if (!fp) + prog = kmalloc(bpf_fsize, GFP_KERNEL); + if (!prog) return -ENOMEM; - if (copy_from_user(fp->insns, fprog->filter, fsize)) { - kfree(fp); + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + kfree(prog); return -EFAULT; } - fp->len = fprog->len; + prog->len = fprog->len; - err = sk_store_orig_filter(fp, fprog); + err = bpf_prog_store_orig_filter(prog, fprog); if (err) { - kfree(fp); + kfree(prog); return -ENOMEM; } - /* __sk_prepare_filter() already takes care of freeing + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = __sk_prepare_filter(fp); - if (IS_ERR(fp)) - return PTR_ERR(fp); + prog = bpf_prepare_filter(prog); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + __bpf_prog_release(prog); + return -ENOMEM; + } + fp->prog = prog; atomic_set(&fp->refcnt, 0); @@ -1142,7 +1154,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, /* We're copying the filter that has been originally attached, * so no conversion/decode needed anymore. 
*/ - fprog = filter->orig_prog; + fprog = filter->prog->orig_prog; ret = fprog->len; if (!len) diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index 12ab7b4be609..4eab4a94a59d 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -107,11 +107,11 @@ #include #include -static struct sk_filter *ptp_insns __read_mostly; +static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return SK_RUN_FILTER(ptp_insns, skb); + return BPF_PROG_RUN(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); @@ -189,5 +189,5 @@ void __init ptp_classifier_init(void) .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, }; - BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog)); + BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog)); } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 57d922320c59..ad704c757bb4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -68,7 +68,7 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, if (!filter) goto out; - fprog = filter->orig_prog; + fprog = filter->prog->orig_prog; flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); -- cgit From f24b9be5957b38bb420b838115040dc2031b7d0c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:45 -0400 Subject: net-timestamp: extend SCM_TIMESTAMPING ancillary data struct Applications that request kernel tx timestamps with SO_TIMESTAMPING read timestamps as recvmsg() ancillary data. The response is defined implicitly as timespec[3]. 1) define struct scm_timestamping explicitly and 2) add support for new tstamp types. On tx, scm_timestamping always accompanies a sock_extended_err. Define previously unused field ee_info to signal the type of ts[0]. Introduce SCM_TSTAMP_SND to define the existing behavior. The reception path is not modified. On rx, no struct similar to sock_extended_err is passed along with SCM_TIMESTAMPING. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a33033cbe2..c9f68802e992 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3521,6 +3521,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = SCM_TSTAMP_SND; err = sock_queue_err_skb(sk, skb); -- cgit From b9f40e21ef4298650ab33e35740fa85bd57706d5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:46 -0400 Subject: net-timestamp: move timestamp flags out of sk_flags sk_flags is reaching its limit. New timestamping options will not fit. Move all of them into a new field sk->sk_tsflags. An added benefit is that this removes the boilerplate code to convert between SOF_TIMESTAMPING_.. and SOCK_TIMESTAMPING_.. in getsockopt/setsockopt. SOCK_TIMESTAMPING_RX_SOFTWARE is also used to toggle the receive timestamp logic (netstamp_needed). That can be simplified and this last key removed, but that is left for a separate patch. Signed-off-by: Willem de Bruijn ---- The u16 in sock can be moved into a 16-bit hole below sk_gso_max_segs, though that scatters tstamp fields throughout the struct. Signed-off-by: David S. Miller
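The effect on the option handlers is visible in the diff below: SO_TIMESTAMPING now stores and returns sk->sk_tsflags directly, so the flags a process sets are exactly the flags it reads back. A minimal userspace round-trip sketch (error handling trimmed; the helper name is invented):

#include <sys/socket.h>
#include <linux/net_tstamp.h>

int enable_tx_timestamps(int fd)
{
	int val = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	int out = 0;
	socklen_t len = sizeof(out);

	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)))
		return -1;
	if (getsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &out, &len))
		return -1;
	return out == val ? 0 : -1;	/* flags read back exactly as stored */
}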
--- net/core/sock.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index a741163568fa..47c9377e14b9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -848,22 +848,13 @@ set_rcvbuf: ret = -EINVAL; break; } - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, - val & SOF_TIMESTAMPING_TX_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, - val & SOF_TIMESTAMPING_TX_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, - val & SOF_TIMESTAMPING_RX_HARDWARE); + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, - val & SOF_TIMESTAMPING_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, - val & SOF_TIMESTAMPING_RAW_HARDWARE); break; case SO_RCVLOWAT: @@ -1089,19 +1080,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_TIMESTAMPING: - v.val = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_TX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + v.val = sk->sk_tsflags; break; case SO_RCVTIMEO: -- cgit From 09c2d251b70723650ba47e83571ff49281320f7c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:47 -0400 Subject: net-timestamp: add key to disambiguate concurrent datagrams Datagrams timestamped on transmission can coexist in the kernel stack and be reordered in packet scheduling. When reading looped datagrams from the socket error queue it is not always possible to uniquely correlate looped data with the original send() call (for application level retransmits). Even if possible, it may be expensive and complex, requiring packet inspection. Introduce a data-independent ID mechanism to associate timestamps with send calls. Pass an ID alongside the timestamp in field ee_data of sock_extended_err. The ID is a simple 32 bit unsigned int that is associated with the socket and incremented on each send() call for which software tx timestamp generation is enabled. The feature is enabled only if SOF_TIMESTAMPING_OPT_ID is set, to avoid changing ee_data for existing applications that expect it to be 0. The counter is reset each time the flag is reenabled. Reenabling does not change the ID of already submitted data. It is possible to receive out of order IDs if the timestamp stream is not quiesced first. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller
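One subtlety visible in the sock.c hunk below: sk_tskey is cleared only on an off-to-on transition of SOF_TIMESTAMPING_OPT_ID, so re-sending the same flags does not restart the numbering. A hedged userspace sketch of deliberately restarting the counter (hypothetical helper; return values ignored for brevity):

#include <sys/socket.h>
#include <linux/net_tstamp.h>

void restart_tskey(int fd, int flags)
{
	int off = flags & ~SOF_TIMESTAMPING_OPT_ID;
	int on = flags | SOF_TIMESTAMPING_OPT_ID;

	/* toggling the flag off and back on is what resets the counter */
	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &off, sizeof(off));
	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &on, sizeof(on));
	/* the next send() is tagged with ee_data == 0 again */
}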
--- net/core/skbuff.c | 2 ++ net/core/sock.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c9f68802e992..0df4f1d14c5a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3522,6 +3522,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SND; + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + serr->ee.ee_data = skb_shinfo(skb)->tskey; err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 47c9377e14b9..1e0f1c63ad6b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -848,6 +848,9 @@ set_rcvbuf: ret = -EINVAL; break; } + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) + sk->sk_tskey = 0; sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, -- cgit From e7fd2885385157d46c85f282fc6d7d297db43e1f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:48 -0400 Subject: net-timestamp: SCHED timestamp on entering packet scheduler Kernel transmit latency is often incurred in the packet scheduler. Introduce a new timestamp on transmission just before entering the scheduler. When data travels through multiple devices (bonding, tunneling, ...), each device will export an individual timestamp. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ net/core/skbuff.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b370230fe1d3..1c15b189c52b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2876,6 +2877,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_reset_mac_header(skb); + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU.
*/ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0df4f1d14c5a..9705c0732aab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3490,10 +3490,10 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); -void skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps) +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) { - struct sock *sk = orig_skb->sk; struct sock_exterr_skb *serr; struct sk_buff *skb; int err; @@ -3521,7 +3521,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - serr->ee.ee_info = SCM_TSTAMP_SND; + serr->ee.ee_info = tstype; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = skb_shinfo(skb)->tskey; @@ -3530,6 +3530,14 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, if (err) kfree_skb(skb); } +EXPORT_SYMBOL_GPL(__skb_tstamp_tx); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + SCM_TSTAMP_SND); +} EXPORT_SYMBOL_GPL(skb_tstamp_tx); void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) -- cgit From 4ed2d765dfaccff5ebdac68e2064b59125033a3b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:49 -0400 Subject: net-timestamp: TCP timestamping TCP timestamping extends SO_TIMESTAMPING to bytestreams. Bytestreams do not have a 1:1 relationship between send() buffers and network packets. The feature interprets a send call on a bytestream as a request for a timestamp for the last byte in that send() buffer. The choice corresponds to a request for a timestamp when all bytes in the buffer have been sent. That assumption depends on in-order kernel transmission. This is the common case. That said, it is possible to construct a traffic shaping tree that would result in reordering. The guarantee is strong, then, but not ironclad. This implementation supports send and sendpages (splice). GSO replaces one large packet with multiple smaller packets. This patch also copies the option into the correct smaller packet. This patch does not yet support timestamping on data in an initial TCP Fast Open SYN, because that takes a very different data path. If ID generation in ee_data is enabled, bytestream timestamps return a byte offset instead of the packet counter used for datagrams. The implementation supports a single timestamp per packet. It silently replaces requests for previous timestamps. To avoid missing tstamps, flush the tcp queue by disabling Nagle, cork and autocork. Missing tstamps can be detected by offset when the ee_data ID is enabled. Implementation details: - On GSO, the timestamping code can be included in the main loop. I moved it into its own loop to reduce the impact on the common case to a single branch. - To avoid leaking the absolute seqno to userspace, the offset returned in ee_data must always be relative. It is an offset between an skb and sk field. The first is always set (also for GSO & ACK). The second must also never be uninitialized. Only allow the ID option on sockets in the ESTABLISHED state, for which the seqno is available. Never reset it to zero (instead, move it to the current seqno when reenabling the option). Signed-off-by: Willem de Bruijn Signed-off-by: David S.
Miller --- net/core/skbuff.c | 5 ++++- net/core/sock.c | 13 +++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9705c0732aab..3dec0293a7c5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3522,8 +3522,11 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; + if (sk->sk_protocol == IPPROTO_TCP) + serr->ee.ee_data -= sk->sk_tskey; + } err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 1e0f1c63ad6b..2714811afbd8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -849,8 +849,17 @@ set_rcvbuf: break; } if (val & SOF_TIMESTAMPING_OPT_ID && - !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) - sk->sk_tskey = 0; + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_ESTABLISHED) { + ret = -EINVAL; + break; + } + sk->sk_tskey = tcp_sk(sk)->snd_una; + } else { + sk->sk_tskey = 0; + } + } sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, -- cgit
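Taken together, the series gives userspace a self-describing error-queue record: struct scm_timestamping carries the timestamps, while the accompanying sock_extended_err identifies what was stamped (ee_info) and which bytes it covers (ee_data). The sketch below is an illustrative composite of the patches above rather than code from any one of them; it assumes an already-connected IPv4 TCP socket, and that SOF_TIMESTAMPING_TX_SCHED is the uapi flag added alongside the SCHED patch (the diffs shown here are limited to net/core, so the uapi side is not visible). Error handling is trimmed:

#include <stdio.h>
#include <time.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

/* Enable after connect(): the kernel only accepts OPT_ID on an
 * ESTABLISHED TCP socket, seeding sk_tskey from snd_una so that
 * ee_data becomes a byte offset rather than a send() counter.
 */
static int tcp_tx_timestamps_on(int fd)
{
	int val = SOF_TIMESTAMPING_TX_SCHED |		/* SCM_TSTAMP_SCHED */
		  SOF_TIMESTAMPING_TX_SOFTWARE |	/* SCM_TSTAMP_SND */
		  SOF_TIMESTAMPING_SOFTWARE |		/* report sw stamps */
		  SOF_TIMESTAMPING_OPT_ID;		/* tag with offsets */

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}

/* Drain one looped-back timestamp from the error queue. */
static void read_one_tstamp(int fd)
{
	char ctrl[512];
	struct msghdr msg = {
		.msg_control = ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	struct scm_timestamping *tss = NULL;
	struct sock_extended_err *serr = NULL;
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_TIMESTAMPING)
			tss = (struct scm_timestamping *)CMSG_DATA(cm);
		else if (cm->cmsg_level == SOL_IP &&
			 cm->cmsg_type == IP_RECVERR)
			serr = (struct sock_extended_err *)CMSG_DATA(cm);
	}

	if (!tss || !serr || serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING)
		return;

	/* ts[0] is the software stamp; ee_data is the offset of the last
	 * byte of the timestamped send() within the bytestream.
	 */
	printf("%s at %lld.%09ld, byte offset %u\n",
	       serr->ee_info == SCM_TSTAMP_SCHED ? "sched" : "snd",
	       (long long)tss->ts[0].tv_sec, tss->ts[0].tv_nsec,
	       serr->ee_data);
}

As the TCP commit message advises, disabling Nagle, cork, and autocork keeps sends from being coalesced, so each send() reliably produces its own timestamped packet.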