From d215d10f2d6bd41ce9d11a2707568bbb50d2c32e Mon Sep 17 00:00:00 2001 From: "Peter Pan(潘卫平)" Date: Mon, 16 Jun 2014 21:57:22 +0800 Subject: net: delete duplicate dev_set_rx_mode() call __dev_open() already calls dev_set_rx_mode(), and dev_set_rx_mode() has no effect for a net device which does not have the IFF_UP flag set. So the call of dev_set_rx_mode() in __dev_change_flags() is a duplicate. Signed-off-by: Weiping Pan Signed-off-by: David S. Miller --- net/core/dev.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 30eedf677913..a04b12f31e18 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5432,13 +5432,9 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + if ((old_flags ^ flags) & IFF_UP) ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); - if (!ret) - dev_set_rx_mode(dev); - } - if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; -- cgit From e0f31d8498676fda36289603a054d0d490aa2679 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan <_govind@gmx.com> Date: Mon, 23 Jun 2014 16:07:58 +0530 Subject: flow_keys: Record IP layer protocol in skb_flow_dissect() skb_flow_dissect() dissects only the transport header type into ip_proto. It does not give any information about IPv4 or IPv6. This patch adds a new member, n_proto, to struct flow_keys, which records the IP layer type, i.e. IPv4 or IPv6. This can be used in a driver's netdev->ndo_rx_flow_steer function to dissect the flow. Adding the new member to flow_keys increases the struct size by around 4 bytes. This causes BUILD_BUG_ON(sizeof(qcb->data) < sz); to fail in qdisc_cb_private_validate(), so increase the data size by 4. Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 107ed12a5323..c2b53c1b21d2 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -175,6 +175,7 @@ ipv6: break; } + flow->n_proto = proto; flow->ip_proto = ip_proto; flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); flow->thoff = (u16) nhoff; -- cgit From f6d8cb2eeded7df18b821a321d4cd1cdd1754bf8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Jun 2014 05:32:48 -0700 Subject: inet: reduce TLB pressure for listeners It seems overkill to use vmalloc() for typical listeners with less than 2048 hash buckets. Try kmalloc() and fall back to vmalloc() to reduce TLB pressure. Use the kvfree() helper as it is now available. Use ilog2() instead of a loop. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller
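(Editor's note: distilled, the allocation strategy this patch adopts looks roughly like the sketch below. This is a sketch only — table_alloc is a hypothetical helper name; the real change is open-coded in reqsk_queue_alloc() in the diff that follows.)

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Try kzalloc() for sizes the page allocator can satisfy cheaply,
 * fall back to vzalloc() otherwise; kvfree() releases either kind. */
static void *table_alloc(size_t size)
{
	void *p = NULL;

	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
		p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (!p)
		p = vzalloc(size);	/* vmalloc area: extra TLB pressure */
	return p;			/* caller frees with kvfree(p) */
}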
--- net/core/request_sock.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 467f326126e0..04db318e6218 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -41,27 +41,27 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, unsigned int nr_table_entries) { size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt; + struct listen_sock *lopt = NULL; nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); nr_table_entries = max_t(u32, nr_table_entries, 8); nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); - if (lopt_size > PAGE_SIZE) + + if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + lopt = kzalloc(lopt_size, GFP_KERNEL | + __GFP_NOWARN | + __GFP_NORETRY); + if (!lopt) lopt = vzalloc(lopt_size); - else - lopt = kzalloc(lopt_size, GFP_KERNEL); - if (lopt == NULL) + if (!lopt) return -ENOMEM; - for (lopt->max_qlen_log = 3; - (1 << lopt->max_qlen_log) < nr_table_entries; - lopt->max_qlen_log++); - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; lopt->nr_table_entries = nr_table_entries; + lopt->max_qlen_log = ilog2(nr_table_entries); write_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; @@ -72,22 +72,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, void __reqsk_queue_destroy(struct request_sock_queue *queue) { - struct listen_sock *lopt; - size_t lopt_size; - - /* - * this is an error recovery path only - * no locking needed and the lopt is not NULL - */ - - lopt = queue->listen_opt; - lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); - - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); + /* This is an error recovery path only, no locking needed */ + kvfree(queue->listen_opt); } static inline struct listen_sock *reqsk_queue_yank_listen_sk( @@ -107,8 +93,6 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) { /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - size_t lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); if (lopt->qlen != 0) { unsigned int i; @@ -125,10 +109,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) } WARN_ON(lopt->qlen != 0); - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); + kvfree(lopt); } /* -- cgit From 9bf2b8c280b5c02ca8a9e75263bf3ca998fed144 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 26 Jun 2014 15:56:31 +0800 Subject: net: fix some typos in comment In commit 371121057607e3127e19b3fa094330181b5b031e ("net: QDISC_STATE_RUNNING dont need atomic bit ops"), __QDISC_STATE_RUNNING was renamed to __QDISC___STATE_RUNNING, but the old name appearing in a comment was not completely replaced with the new one. Signed-off-by: Ying Xue Signed-off-by: David S.
Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a04b12f31e18..de9774119541 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2738,8 +2738,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. - * This permits __QDISC_STATE_RUNNING owner to get the lock more often - * and dequeue packets faster. + * This permits __QDISC___STATE_RUNNING owner to get the lock more + * often and dequeue packets faster. */ contended = qdisc_is_running(q); if (unlikely(contended)) -- cgit From b0ab2fabb5b91da99c189db02e91ae10bc8355c5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 26 Jun 2014 09:58:25 +0200 Subject: rtnetlink: allow to register ops without ops->setup set So far, it is assumed that ops->setup is filled in. But there might be cases where ops make sense even without ->setup. In that case, forbid newlink and dellink. This allows registering simple rtnl link ops containing only ->kind, which gives a consistent way of passing the device kind (either device-kind or slave-kind) to userspace. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/rtnetlink.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index de9774119541..6e2a2cd82321 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7091,7 +7091,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1063996f8317..27acaf7ff6d7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -299,7 +299,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (rtnl_link_ops_get(ops->kind)) return -EEXIST; - if (!ops->dellink) + /* The check for setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. + */ + if (ops->setup && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); @@ -1777,7 +1782,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) return -ENODEV; ops = dev->rtnl_link_ops; - if (!ops) + if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); @@ -2038,6 +2043,9 @@ replay: return -EOPNOTSUPP; } + if (!ops->setup) + return -EOPNOTSUPP; + if (!ifname[0]) snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); -- cgit From baac167b706600ebe7158acaeb7c489ae9d0bb8b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 26 Jun 2014 13:16:49 +0200 Subject: pktgen: avoid expensive set_current_state() call in loop Avoid calling set_current_state() inside the busy loop in pktgen_thread_worker(). In the pkt_dev->delay case, it is still used/enabled in pktgen_xmit() via the spin() call. set_current_state(TASK_INTERRUPTIBLE) uses an xchg, which is implicitly LOCK prefixed. I've measured the asm LOCK operation to take approx 8ns on this E5-2630 CPU.
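(Editor's note: a minimal sketch of the kthread pattern the patch below moves to — hypothetical code, not the actual pktgen loop; have_work()/do_work() are made-up stand-ins for next_to_run()/pktgen_xmit(). The point is that set_current_state() implies a full memory barrier — the implicitly LOCKed xchg measured above — while __set_current_state() is a plain store, so the barrier cost is only paid on the path that really prepares to sleep.)

#include <linux/kthread.h>
#include <linux/sched.h>

static bool have_work(void);	/* made-up: anything to transmit? */
static void do_work(void);	/* made-up: the transmit step */

static int worker_fn(void *arg)
{
	__set_current_state(TASK_RUNNING);	/* plain store, no LOCK */

	while (!kthread_should_stop()) {
		if (have_work()) {
			do_work();		/* hot path: no state flip */
			continue;
		}
		set_current_state(TASK_INTERRUPTIBLE);	/* LOCKed xchg */
		if (!have_work() && !kthread_should_stop())
			schedule();		/* really go to sleep */
		__set_current_state(TASK_RUNNING);
	}
	return 0;
}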
The performance increase correlates with this measurement. Performance data with CLONE_SKB==100000, rx-usecs=30: (single CPU performance, ixgbe 10Gbit/s, E5-2630) * Prev: 5454050 pps --> 183.35ns (1/5454050*10^9) * Now: 5684009 pps --> 175.93ns (1/5684009*10^9) * Diff: +229959 pps --> -7.42ns Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fc17a9d309ac..b61f553689b6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3407,10 +3407,10 @@ static int pktgen_thread_worker(void *arg) pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); - set_current_state(TASK_INTERRUPTIBLE); - set_freezable(); + __set_current_state(TASK_RUNNING); + while (!kthread_should_stop()) { pkt_dev = next_to_run(t); @@ -3424,8 +3424,6 @@ static int pktgen_thread_worker(void *arg) continue; } - __set_current_state(TASK_RUNNING); - if (likely(pkt_dev)) { pktgen_xmit(pkt_dev); @@ -3456,9 +3454,8 @@ static int pktgen_thread_worker(void *arg) } try_to_freeze(); - - set_current_state(TASK_INTERRUPTIBLE); } + set_current_state(TASK_INTERRUPTIBLE); pr_debug("%s stopping all device\n", t->tsk->comm); pktgen_stop(t); -- cgit From 8788370a1d4b1d89efc1aea1b13ea5e5bfe10fde Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 26 Jun 2014 13:16:59 +0200 Subject: pktgen: RCU-ify "if_list" to remove lock in next_to_run() The if_lock()/if_unlock() in next_to_run() adds a significant overhead, because it's called for every packet in the busy loop of pktgen_thread_worker(). (Thomas Graf originally pointed me at this lock problem.) Removing these two "LOCK" operations should in theory save us approx 16ns (8ns x 2), and as illustrated below we do save 16ns when removing the locks and introducing RCU protection. Performance data with CLONE_SKB==100000, TX-size=512, rx-usecs=30: (single CPU performance, ixgbe 10Gbit/s, E5-2630) * Prev : 5684009 pps --> 175.93ns (1/5684009*10^9) * RCU-fix: 6272204 pps --> 159.43ns (1/6272204*10^9) * Diff : +588195 pps --> -16.50ns To understand this RCU patch, I describe the pktgen thread model below. In pktgen there are several kernel threads, but only one CPU running each kernel thread. Communication with the kernel threads is done through some thread control flags. This allows the thread to change data structures at a known synchronization point, see the main thread func pktgen_thread_worker(). Userspace changes are communicated through proc-file writes. There are three types of changes: general control changes "pgctrl" (func:pgctrl_write), thread changes "kpktgend_X" (func:pktgen_thread_write), and interface config changes "etcX@N" (func:pktgen_if_write). Userspace "pgctrl" and "thread" changes are synchronized via the mutex pktgen_thread_lock, thus only a single userspace instance can run. The mutex is taken while the packet generator is running, by pgctrl "start". Thus e.g. "add_device" cannot be invoked when pktgen is running/started. All "pgctrl" and all "thread" changes, except thread "add_device", communicate via the thread control flags. The main problem is the exception "add_device", which modifies the thread's "if_list" directly. Fortunately "add_device" cannot be invoked while pktgen is running. But there exists a race between "rem_device_all" and "add_device" (which normally doesn't occur, because "rem_device_all" waits 125ms before returning).
Background'ing "rem_device_all" and running "add_device" immediately allow the race to occur. The race affects the threads (list of devices) "if_list". The if_lock is used for protecting this "if_list". Other readers are given lock-free access to the list under RCU read sections. Note, interface config changes (via proc) can occur while pktgen is running, which worries me a bit. I'm assuming proc_remove() takes appropriate locks, to assure no writers exists after proc_remove() finish. I've been running a script exercising the race condition (leading me to fix the proc_remove order), without any issues. The script also exercises concurrent proc writes, while the interface config is getting removed. Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/pktgen.c | 101 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 49 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b61f553689b6..5b05e364b8ae 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -69,8 +69,9 @@ * for running devices in the if_list and sends packets until count is 0 it * also the thread checks the thread->control which is used for inter-process * communication. controlling process "posts" operations to the threads this - * way. The if_lock should be possible to remove when add/rem_device is merged - * into this too. + * way. + * The if_list is RCU protected, and the if_lock remains to protect updating + * of if_list, from "add_device" as it invoked from userspace (via proc write). * * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. Understood by "write" @@ -208,7 +209,7 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ -/* If lock -- can be removed after some work */ +/* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); #define if_unlock(t) spin_unlock(&(t->if_lock)); @@ -241,6 +242,7 @@ struct pktgen_dev { struct proc_dir_entry *entry; /* proc file */ struct pktgen_thread *pg_thread;/* the owner */ struct list_head list; /* chaining in the thread's run-queue */ + struct rcu_head rcu; /* freed by RCU */ int running; /* if false, the test will stop */ @@ -1737,14 +1739,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) seq_puts(seq, "Running: "); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); seq_puts(seq, "\nStopped: "); - list_for_each_entry(pkt_dev, &t->if_list, list) + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (!pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); @@ -1753,7 +1755,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) else seq_puts(seq, "\nResult: NA\n"); - if_unlock(t); + rcu_read_unlock(); return 0; } @@ -1878,10 +1880,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, pkt_dev = pktgen_find_dev(t, ifname, exact); if (pkt_dev) { if (remove) { - if_lock(t); pkt_dev->removal_mark = 1; t->control |= T_REMDEV; - if_unlock(t); } break; } @@ -1931,7 +1931,8 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d list_for_each_entry(t, &pn->pktgen_threads, th_list) { struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) { + 
rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (pkt_dev->odev != dev) continue; @@ -1946,6 +1947,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d dev->name); break; } + rcu_read_unlock(); } } @@ -2997,8 +2999,8 @@ static void pktgen_run(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { /* * setup odev and create initial packet. @@ -3007,18 +3009,18 @@ static void pktgen_run(struct pktgen_thread *t) if (pkt_dev->odev) { pktgen_clear_counters(pkt_dev); - pkt_dev->running = 1; /* Cranke yeself! */ pkt_dev->skb = NULL; pkt_dev->started_at = pkt_dev->next_tx = ktime_get(); set_pkt_overhead(pkt_dev); strcpy(pkt_dev->result, "Starting"); + pkt_dev->running = 1; /* Cranke yeself! */ started++; } else strcpy(pkt_dev->result, "Error starting"); } - if_unlock(t); + rcu_read_unlock(); if (started) t->control &= ~(T_STOP); } @@ -3041,27 +3043,25 @@ static int thread_is_running(const struct pktgen_thread *t) { const struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) - if (pkt_dev->running) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) + if (pkt_dev->running) { + rcu_read_unlock(); return 1; + } + rcu_read_unlock(); return 0; } static int pktgen_wait_thread_run(struct pktgen_thread *t) { - if_lock(t); - while (thread_is_running(t)) { - if_unlock(t); - msleep_interruptible(100); if (signal_pending(current)) goto signal; - if_lock(t); } - if_unlock(t); return 1; signal: return 0; @@ -3166,10 +3166,10 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) return -EINVAL; } + pkt_dev->running = 0; kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; pkt_dev->stopped_at = ktime_get(); - pkt_dev->running = 0; show_results(pkt_dev, nr_frags); @@ -3180,9 +3180,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) { struct pktgen_dev *pkt_dev, *best = NULL; - if_lock(t); - - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (!pkt_dev->running) continue; if (best == NULL) @@ -3190,7 +3189,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0) best = pkt_dev; } - if_unlock(t); + rcu_read_unlock(); + return best; } @@ -3200,13 +3200,13 @@ static void pktgen_stop(struct pktgen_thread *t) func_enter(); - if_lock(t); + rcu_read_lock(); - list_for_each_entry(pkt_dev, &t->if_list, list) { + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); } - if_unlock(t); + rcu_read_unlock(); } /* @@ -3220,8 +3220,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3235,8 +3233,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) break; } - - if_unlock(t); } static void pktgen_rem_all_ifs(struct pktgen_thread *t) @@ -3248,8 +3244,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) /* Remove all devices, free mem */ - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3258,8 +3252,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) pktgen_remove_device(t, cur); } - - if_unlock(t); } static void pktgen_rem_thread(struct pktgen_thread *t) @@ -3482,8 +3474,8 @@ static struct pktgen_dev 
*pktgen_find_dev(struct pktgen_thread *t, struct pktgen_dev *p, *pkt_dev = NULL; size_t len = strlen(ifname); - if_lock(t); - list_for_each_entry(p, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(p, &t->if_list, list) if (strncmp(p->odevname, ifname, len) == 0) { if (p->odevname[len]) { if (exact || p->odevname[len] != '@') @@ -3493,7 +3485,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, break; } - if_unlock(t); + rcu_read_unlock(); pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev); return pkt_dev; } @@ -3507,6 +3499,12 @@ static int add_dev_to_thread(struct pktgen_thread *t, { int rv = 0; + /* This function cannot be called concurrently, as its called + * under pktgen_thread_lock mutex, but it can run from + * userspace on another CPU than the kthread. The if_lock() + * is used here to sync with concurrent instances of + * _rem_dev_from_if_list() invoked via kthread, which is also + * updating the if_list */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3515,9 +3513,9 @@ static int add_dev_to_thread(struct pktgen_thread *t, goto out; } - list_add(&pkt_dev->list, &t->if_list); - pkt_dev->pg_thread = t; pkt_dev->running = 0; + pkt_dev->pg_thread = t; + list_add_rcu(&pkt_dev->list, &t->if_list); out: if_unlock(t); @@ -3672,11 +3670,13 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t, struct list_head *q, *n; struct pktgen_dev *p; + if_lock(t); list_for_each_safe(q, n, &t->if_list) { p = list_entry(q, struct pktgen_dev, list); if (p == pkt_dev) - list_del(&p->list); + list_del_rcu(&p->list); } + if_unlock(t); } static int pktgen_remove_device(struct pktgen_thread *t, @@ -3696,20 +3696,22 @@ static int pktgen_remove_device(struct pktgen_thread *t, pkt_dev->odev = NULL; } - /* And update the thread if_list */ - - _rem_dev_from_if_list(t, pkt_dev); - + /* Remove proc before if_list entry, because add_device uses + * list to determine if interface already exist, avoid race + * with proc_create_data() */ if (pkt_dev->entry) proc_remove(pkt_dev->entry); + /* And update the thread if_list */ + _rem_dev_from_if_list(t, pkt_dev); + #ifdef CONFIG_XFRM free_SAs(pkt_dev); #endif vfree(pkt_dev->flows); if (pkt_dev->page) put_page(pkt_dev->page); - kfree(pkt_dev); + kfree_rcu(pkt_dev, rcu); return 0; } @@ -3809,6 +3811,7 @@ static void __exit pg_cleanup(void) { unregister_netdevice_notifier(&pktgen_notifier_block); unregister_pernet_subsys(&pg_net_ops); + /* Don't need rcu_barrier() due to use of kfree_rcu() */ } module_init(pg_init); -- cgit From b9c701edc7ff6ddd522497b0194b0bc3ec1b51a9 Mon Sep 17 00:00:00 2001 From: Stefan Sørensen Date: Fri, 27 Jun 2014 11:59:09 +0200 Subject: net: Simplify ptp class checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace two switch statements enumerating all valid ptp classes with an if statement matching for not PTP_CLASS_NONE. Signed-off-by: Stefan Sørensen Signed-off-by: David S. 
Miller --- net/core/timestamping.c | 57 +++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 6521dfd8b7c8..a8770391ea5b 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -43,31 +43,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) return; type = classify(skb); + if (type == PTP_CLASS_NONE) + return; + + phydev = skb->dev->phydev; + if (likely(phydev->drv->txtstamp)) { + if (!atomic_inc_not_zero(&sk->sk_refcnt)) + return; - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) - return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; - phydev->drv->txtstamp(phydev, clone, type); + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return; } - break; - default: - break; + + clone->sk = sk; + phydev->drv->txtstamp(phydev, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); @@ -114,20 +105,12 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) __skb_pull(skb, ETH_HLEN); - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->rxtstamp)) - return phydev->drv->rxtstamp(phydev, skb, type); - break; - default: - break; - } + if (type == PTP_CLASS_NONE) + return false; + + phydev = skb->dev->phydev; + if (likely(phydev->drv->rxtstamp)) + return phydev->drv->rxtstamp(phydev, skb, type); return false; } -- cgit From ae5c6c6d7bcadfbedefb5fc8ff0ebe2bfa83a0a1 Mon Sep 17 00:00:00 2001 From: Stefan Sørensen Date: Fri, 27 Jun 2014 11:59:10 +0200 Subject: ptp: Classify ptp over ip over vlan packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This extends the ptp bpf to also match ptp over ip over vlan packets. The ptp classes are changed to orthogonal bitfields representing version, transport and vlan values to simplify matching. Signed-off-by: Stefan Sørensen Signed-off-by: David S. Miller --- net/core/ptp_classifier.c | 64 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index d3027a73fd4b..12ab7b4be609 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -52,14 +52,43 @@ * test_8021q: * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ? * ldh [16] ; load inner type - * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ? * ldb [18] ; load payload * and #0x8 ; as we don't have ports here, test * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these * ldh [18] ; reload payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x40 ; PTP_CLASS_V2_VLAN + * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2 + * ret a ; return PTP class + * + * ; PTP over UDP over IPv4 over 802.1Q over Ethernet + * test_8021q_ipv4: + * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ? + * ldb [27] ; load proto + * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ? 
+ * ldh [24] ; load frag offset field + * jset #0x1fff, drop_8021q_ipv4; don't allow fragments + * ldxb 4*([18]&0xf) ; load IP header len + * ldh [x + 20] ; load UDP dst port + * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 26] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over 802.1Q over Ethernet + * test_8021q_ipv6: + * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ? + * ldb [24] ; load proto + * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ? + * ldh [60] ; load UDP dst port + * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ? + * ldh [66] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 * ret a ; return PTP class + * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE * * ; PTP over Ethernet * test_ieee1588: @@ -113,16 +142,39 @@ void __init ptp_classifier_init(void) { 0x44, 0, 0, 0x00000020 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, - { 0x15, 0, 9, 0x00008100 }, + { 0x15, 0, 32, 0x00008100 }, { 0x28, 0, 0, 0x00000010 }, - { 0x15, 0, 15, 0x000088f7 }, + { 0x15, 0, 7, 0x000088f7 }, { 0x30, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x00000008 }, - { 0x15, 0, 12, 0x00000000 }, + { 0x15, 0, 35, 0x00000000 }, { 0x28, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000040 }, + { 0x44, 0, 0, 0x00000070 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x0000001b }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000018 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x00000012 }, + { 0x48, 0, 0, 0x00000014 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x0000001a }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000050 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000018 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x0000003c }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x00000042 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000060 }, { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, { 0x15, 0, 7, 0x000088f7 }, { 0x30, 0, 0, 0x0000000e }, { 0x54, 0, 0, 0x00000008 }, -- cgit From 5ed20a68cd6ca4adc0aa2d240913d604a2eb3e25 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:05 -0700 Subject: flow_dissector: Abstract out hash computation Move the hash computation located in __skb_get_hash to be a separate function which takes flow_keys as input. This will allow flow hash computation in other contexts where we only have addresses and ports. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c2b53c1b21d2..2ff8cd4dfc5f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -202,6 +202,33 @@ static __always_inline u32 __flow_hash_1word(u32 a) return jhash_1word(a, hashrnd); } +static inline u32 __flow_hash_from_keys(struct flow_keys *keys) +{ + u32 hash; + + /* get a consistent hash (same value on both flow directions) */ + if (((__force u32)keys->dst < (__force u32)keys->src) || + (((__force u32)keys->dst == (__force u32)keys->src) && + ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { + swap(keys->dst, keys->src); + swap(keys->port16[0], keys->port16[1]); + } + + hash = __flow_hash_3words((__force u32)keys->dst, + (__force u32)keys->src, + (__force u32)keys->ports); + if (!hash) + hash = 1; + + return hash; +} + +u32 flow_hash_from_keys(struct flow_keys *keys) +{ + return __flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(flow_hash_from_keys); + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value @@ -211,7 +238,6 @@ static __always_inline u32 __flow_hash_1word(u32 a) void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - u32 hash; if (!skb_flow_dissect(skb, &keys)) return; @@ -219,21 +245,7 @@ void __skb_get_hash(struct sk_buff *skb) if (keys.ports) skb->l4_hash = 1; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys.dst < (__force u32)keys.src) || - (((__force u32)keys.dst == (__force u32)keys.src) && - ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { - swap(keys.dst, keys.src); - swap(keys.port16[0], keys.port16[1]); - } - - hash = __flow_hash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports); - if (!hash) - hash = 1; - - skb->hash = hash; + skb->hash = __flow_hash_from_keys(&keys); } EXPORT_SYMBOL(__skb_get_hash); -- cgit From 0e001614e849b68cff94cda8db8b550569d3dba6 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:27 -0700 Subject: net: Call skb_get_hash in get_xps_queue and __skb_tx_hash Call standard function to get a packet hash instead of taking this from skb->sk->sk_hash or only using skb->protocol. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2ff8cd4dfc5f..62d1cb624f53 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -196,12 +196,6 @@ static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) return jhash_3words(a, b, c, hashrnd); } -static __always_inline u32 __flow_hash_1word(u32 a) -{ - __flow_hash_secret_init(); - return jhash_1word(a, hashrnd); -} - static inline u32 __flow_hash_from_keys(struct flow_keys *keys) { u32 hash; @@ -253,7 +247,7 @@ EXPORT_SYMBOL(__skb_get_hash); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. 
*/ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, unsigned int num_tx_queues) { u32 hash; @@ -273,13 +267,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, qcount = dev->tc_to_txq[tc].count; } - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = __flow_hash_1word(hash); - - return (u16) (((u64) hash * qcount) >> 32) + qoffset; + return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); @@ -351,17 +339,10 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (map) { if (map->len == 1) queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->hash; - hash = __flow_hash_1word(hash); + else queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } + ((u64)skb_get_hash(skb) * map->len) >> 32]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } -- cgit From 19469a873bafd4e65daef3597db2bd724c1b03c9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:01 -0700 Subject: flow_dissector: Use IPv6 flow label in flow_dissector This patch implements the receive side to support RFC 6438 which is to use the flow label as an ECMP hash. If an IPv6 flow label is set in a packet we can use this as input for computing an L4-hash. There should be no need to parse any transport headers in this case. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 62d1cb624f53..c5f3912dad4c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -80,6 +80,8 @@ ip: case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; + __be32 flow_label; + ipv6: iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph) @@ -89,6 +91,21 @@ ipv6: flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + + flow_label = ip6_flowlabel(iph); + if (flow_label) { + /* Awesome, IPv6 packet has a flow label so we can + * use that to represent the ports without any + * further dissection. + */ + flow->n_proto = proto; + flow->ip_proto = ip_proto; + flow->ports = flow_label; + flow->thoff = (u16)nhoff; + + return true; + } + break; } case htons(ETH_P_8021AD): -- cgit From a3b18ddb9cc1056eea24e3edc1828cfb3fd0726f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:17 -0700 Subject: net: Only do flow_dissector hash computation once per packet Add sw_hash flag to skbuff to indicate that skb->hash was computed from flow_dissector. This flag is checked in skb_get_hash to avoid repeatedly trying to compute the hash (ie. in the case that no L4 hash can be computed). Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c5f3912dad4c..5f362c1d0332 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -256,6 +256,8 @@ void __skb_get_hash(struct sk_buff *skb) if (keys.ports) skb->l4_hash = 1; + skb->sw_hash = 1; + skb->hash = __flow_hash_from_keys(&keys); } EXPORT_SYMBOL(__skb_get_hash); -- cgit From 9f12fbe603f7ae346b2b46008e325f0c9a68e55d Mon Sep 17 00:00:00 2001 From: Zi Shen Lim Date: Thu, 3 Jul 2014 07:56:54 -0700 Subject: net: filter: move load_pointer() into filter.h load_pointer() is already a static inline function. Let's move it into filter.h so BPF JIT implementations can reuse this function. Since we're exporting this function, let's also rename it to bpf_load_pointer() for clarity. Signed-off-by: Zi Shen Lim Reviewed-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 1dbf6462f766..87af1e3e56c0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -84,15 +84,6 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -static inline void *load_pointer(const struct sk_buff *skb, int k, - unsigned int size, void *buffer) -{ - if (k >= 0) - return skb_header_pointer(skb, k, size, buffer); - - return bpf_internal_load_pointer_neg_helper(skb, k, size); -} - /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff @@ -537,7 +528,7 @@ load_word: * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness */ - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); if (likely(ptr != NULL)) { BPF_R0 = get_unaligned_be32(ptr); CONT; @@ -547,7 +538,7 @@ load_word: LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ off = IMM; load_half: - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); if (likely(ptr != NULL)) { BPF_R0 = get_unaligned_be16(ptr); CONT; @@ -557,7 +548,7 @@ load_half: LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ off = IMM; load_byte: - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); if (likely(ptr != NULL)) { BPF_R0 = *(u8 *)ptr; CONT; -- cgit From efa95b01da18ad22af62f6d99a3243f3be8fd264 Mon Sep 17 00:00:00 2001 From: david decotigny Date: Tue, 8 Jul 2014 15:14:41 -0700 Subject: netpoll: fix use after free After a bonding master reclaims the netpoll info struct, slaves could still hold a pointer to the reclaimed data. This patch fixes it: as soon as netpoll_async_cleanup is called for a slave (eg. when un-enslaved), we make sure that this slave doesn't point to the data. Signed-off-by: David Decotigny Signed-off-by: David S. 
Miller --- net/core/netpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e33937fb32a0..907fb5e36c02 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -822,7 +822,8 @@ void __netpoll_cleanup(struct netpoll *np) RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); - } + } else + RCU_INIT_POINTER(np->dev->npinfo, NULL); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); -- cgit From 5d5eacb34c9e1fdc0a47b885d832eaa4de860dc7 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 10 Jul 2014 07:01:58 -0400 Subject: bridge: fdb dumping takes a filter device Dumping a bridge fdb dumps every fdb entry held. With this change we are going to filter on a selected bridge port. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 27acaf7ff6d7..90a906e7ac26 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2517,6 +2517,7 @@ skip: int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, + struct net_device *filter_dev, int idx) { int err; @@ -2547,13 +2548,15 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) br_dev = netdev_master_upper_dev_get(dev); ops = br_dev->netdev_ops; if (ops->ndo_fdb_dump) - idx = ops->ndo_fdb_dump(skb, cb, dev, idx); + idx = ops->ndo_fdb_dump(skb, cb, dev, NULL, + idx); } if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); + idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + idx); else - idx = ndo_dflt_fdb_dump(skb, cb, dev, idx); + idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); } rcu_read_unlock(); -- cgit From 5e6d243587990a588143b9da3974833649595587 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 10 Jul 2014 07:01:59 -0400 Subject: bridge: netlink dump interface at par with brctl Actually better than brctl showmacs because we can filter by bridge port in the kernel. The current bridge netlink interface doesn't scale when you have many bridges each with large fdbs or even bridges with many bridge ports. And now for the science non-fiction novel you have all been waiting for.. //lets see what bridge ports we have root@moja-1:/configs/may30-iprt/bridge# ./bridge link show 8: eth1 state DOWN : mtu 1500 master br0 state disabled priority 32 cost 19 17: sw1-p1 state DOWN : mtu 1500 master br0 state disabled priority 32 cost 100 // show all..
root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show 33:33:00:00:00:01 dev bond0 self permanent 33:33:00:00:00:01 dev dummy0 self permanent 33:33:00:00:00:01 dev ifb0 self permanent 33:33:00:00:00:01 dev ifb1 self permanent 33:33:00:00:00:01 dev eth0 self permanent 01:00:5e:00:00:01 dev eth0 self permanent 33:33:ff:22:01:01 dev eth0 self permanent 02:00:00:12:01:02 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:05 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:07 dev eth1 self permanent 33:33:00:00:00:01 dev eth1 self permanent 33:33:00:00:00:01 dev gretap0 self permanent da:ac:46:27:d9:53 dev sw1-p1 vlan 0 master br0 permanent 33:33:00:00:00:01 dev sw1-p1 self permanent //filter by bridge root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show br br0 02:00:00:12:01:02 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:05 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:07 dev eth1 self permanent 33:33:00:00:00:01 dev eth1 self permanent da:ac:46:27:d9:53 dev sw1-p1 vlan 0 master br0 permanent 33:33:00:00:00:01 dev sw1-p1 self permanent // bridge sw1 has no ports attached.. root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show br sw1 //filter by port root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show brport eth1 02:00:00:12:01:02 vlan 0 master br0 permanent 00:17:42:8a:b4:05 vlan 0 master br0 permanent 00:17:42:8a:b4:07 self permanent 33:33:00:00:00:01 self permanent // filter by port + bridge root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show br br0 brport sw1-p1 da:ac:46:27:d9:53 vlan 0 master br0 permanent 33:33:00:00:00:01 self permanent // for shits and giggles (as they say in New Brunswick), lets // change the mac that br0 uses // Note: a magical fdb entry with no brport is added ... root@moja-1:/configs/may30-iprt/bridge# ip link set dev br0 address 02:00:00:12:01:04 // lets see if we can see the unicorn .. root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show 33:33:00:00:00:01 dev bond0 self permanent 33:33:00:00:00:01 dev dummy0 self permanent 33:33:00:00:00:01 dev ifb0 self permanent 33:33:00:00:00:01 dev ifb1 self permanent 33:33:00:00:00:01 dev eth0 self permanent 01:00:5e:00:00:01 dev eth0 self permanent 33:33:ff:22:01:01 dev eth0 self permanent 02:00:00:12:01:02 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:05 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:07 dev eth1 self permanent 33:33:00:00:00:01 dev eth1 self permanent 33:33:00:00:00:01 dev gretap0 self permanent 02:00:00:12:01:04 dev br0 vlan 0 master br0 permanent <=== there it is da:ac:46:27:d9:53 dev sw1-p1 vlan 0 master br0 permanent 33:33:00:00:00:01 dev sw1-p1 self permanent //can we see it if we filter by bridge? root@moja-1:/configs/may30-iprt/bridge# ./bridge fdb show br br0 02:00:00:12:01:02 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:05 dev eth1 vlan 0 master br0 permanent 00:17:42:8a:b4:07 dev eth1 self permanent 33:33:00:00:00:01 dev eth1 self permanent 02:00:00:12:01:04 dev br0 vlan 0 master br0 permanent <=== there it is da:ac:46:27:d9:53 dev sw1-p1 vlan 0 master br0 permanent 33:33:00:00:00:01 dev sw1-p1 self permanent Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 74 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 90a906e7ac26..1f8a59e02c48 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2535,30 +2535,72 @@ EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0; - struct net *net = sock_net(skb->sk); struct net_device *dev; + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *bdev = NULL; + struct net_device *br_dev = NULL; + const struct net_device_ops *ops = NULL; + const struct net_device_ops *cops = NULL; + struct ifinfomsg *ifm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + int brport_idx = 0; + int br_idx = 0; + int idx = 0; + + if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, + ifla_policy) == 0) { + if (tb[IFLA_MASTER]) + br_idx = nla_get_u32(tb[IFLA_MASTER]); + } + + brport_idx = ifm->ifi_index; + + if (br_idx) { + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) + return -ENODEV; + + ops = br_dev->netdev_ops; + bdev = br_dev; + } + + for_each_netdev(net, dev) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; + + if (!br_idx) { /* user did not specify a specific bridge */ + if (dev->priv_flags & IFF_BRIDGE_PORT) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + + bdev = dev; + } else { + if (dev != br_dev && + !(dev->priv_flags & IFF_BRIDGE_PORT)) + continue; + + if (br_dev != netdev_master_upper_dev_get(dev) && + !(dev->priv_flags & IFF_EBRIDGE)) + continue; + + bdev = br_dev; + cops = ops; + } - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { if (dev->priv_flags & IFF_BRIDGE_PORT) { - struct net_device *br_dev; - const struct net_device_ops *ops; - - br_dev = netdev_master_upper_dev_get(dev); - ops = br_dev->netdev_ops; - if (ops->ndo_fdb_dump) - idx = ops->ndo_fdb_dump(skb, cb, dev, NULL, - idx); + if (cops && cops->ndo_fdb_dump) + idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + idx); } + idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, bdev, dev, idx); - else - idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + + cops = NULL; } - rcu_read_unlock(); cb->args[0] = idx; return skb->len; -- cgit From ec31a05c4dfa95149b1754d9de92831a5a95c636 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 12 Jul 2014 15:49:16 +0200 Subject: net: filter: sk_chk_filter() no longer mangles filter Add const attribute to filter argument to make clear it is no longer modified. Signed-off-by: Eric Dumazet Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 87af1e3e56c0..b90ae7fb3b89 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1085,7 +1085,7 @@ err: * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. 
*/ -static int check_load_and_stores(struct sock_filter *filter, int flen) +static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; @@ -1218,7 +1218,7 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(struct sock_filter *filter, unsigned int flen) +int sk_chk_filter(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; @@ -1228,7 +1228,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { - struct sock_filter *ftest = &filter[pc]; + const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) -- cgit From 685343fc3ba61a1f6eef361b786601123db16c28 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:22 +0200 Subject: net: add name_assign_type netdev attribute Based on a patch by David Herrmann. The name_assign_type attribute gives a hint about where the interface name of a given net-device comes from. These values are currently defined: NET_NAME_ENUM: The ifname is provided by the kernel with an enumerated suffix, typically based on order of discovery. Names may be reused and unpredictable. NET_NAME_PREDICTABLE: The ifname has been assigned by the kernel in a predictable way that is guaranteed to avoid reuse and always be the same for a given device. Examples include statically created devices like the loopback device and names deduced from hardware properties (including being given explicitly by the firmware). Names depending on the order of discovery, or in any other way on the existence of other devices, must not be marked as PREDICTABLE. NET_NAME_USER: The ifname was provided by user-space during net-device setup. NET_NAME_RENAMED: The net-device has been renamed from userspace. Once this type is set, it cannot change again. NET_NAME_UNKNOWN: This is an internal placeholder to indicate that we haven't yet categorized the name. It will not be exposed to userspace, rather -EINVAL is returned. The aim of these patches is to improve user-space renaming of interfaces. As a general rule, userspace must rename interfaces to guarantee that names stay the same every time a given piece of hardware appears (at boot, or when attaching it). However, there are several situations where userspace should not perform the renaming, and that depends both on the policy of the local admin and, crucially, on the nature of the current interface name. If an interface was created in response to a userspace request, and userspace already provided a name, we most probably want to leave that name alone. The main instance of this is wifi-P2P devices created over nl80211, which currently have a long-standing bug where they are getting renamed by udev. We label such names NET_NAME_USER. If an interface, unbeknown to us, has already been renamed from userspace, we most probably want to leave that alone as well. This will typically happen when third-party plugins (for instance to udev, but the interface is generic so could be from anywhere) rename the interface without informing udev about it. A typical situation is when you switch root from an installer or an initrd to the real system and the new instance of udev does not know what happened before the switch. These types of problems have caused repeated issues in the past.
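(Editor's illustration — a hypothetical userspace policy check showing how a udev-like tool might consult the new attribute; it assumes the uapi enum order NET_NAME_UNKNOWN=0, NET_NAME_ENUM=1, NET_NAME_PREDICTABLE=2, NET_NAME_USER=3, NET_NAME_RENAMED=4, and relies on the read failing with -EINVAL for NET_NAME_UNKNOWN, as the show handler in the diff below implements.)

#include <stdio.h>

/* Rename only kernel-enumerated names (ethX, wlanY); leave
 * USER/RENAMED/PREDICTABLE names alone. */
static int should_rename(const char *ifname)
{
	char path[256];
	int type = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/class/net/%s/name_assign_type", ifname);
	f = fopen(path, "r");
	if (!f)
		return 0;	/* attribute missing or unreadable */
	if (fscanf(f, "%d", &type) != 1)
		type = -1;	/* read failed: NET_NAME_UNKNOWN */
	fclose(f);
	return type == 1;	/* NET_NAME_ENUM (assumed value) */
}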
To solve this, once an interface has been renamed, its name is labelled NET_NAME_RENAMED. In many cases, the kernel is actually able to name interfaces in such a way that there is no need for userspace to rename them. This is the case when the enumeration order of devices, or in fact any other (non-parent) device on the system, cannot influence the name of the interface. Examples include statically created devices, or any naming schemes based on hardware properties of the interface. In this case the admin may prefer to use the kernel-provided names, and to make that possible we label such names NET_NAME_PREDICTABLE. We want the kernel to have the possibility of performing predictable interface naming itself (and exposing to userspace that it has), as the information necessary for a proper naming scheme for a certain class of devices may not be exposed to userspace. The case where renaming is almost certainly desired is when the kernel has given the interface a name using global device enumeration based on order of discovery (ethX, wlanY, etc). These naming schemes are labelled NET_NAME_ENUM. Lastly, a fallback is left as NET_NAME_UNKNOWN, to indicate that a driver has not yet been ported. This is mostly useful as a transitional measure, allowing us to label the various naming schemes bit by bit. v8: minor documentation fixes v9: move comment to the right commit Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Reviewed-by: Kay Sievers Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1cac29ebb05b..7752f2ad49a5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -112,6 +112,25 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); +static ssize_t format_name_assign_type(const struct net_device *net, char *buf) +{ + return sprintf(buf, fmt_dec, net->name_assign_type); +} + +static ssize_t name_assign_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (net->name_assign_type != NET_NAME_UNKNOWN) + ret = netdev_show(dev, attr, buf, format_name_assign_type); + + return ret; +} +static DEVICE_ATTR_RO(name_assign_type); + /* use same locking rules as GIFHWADDR ioctl's */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -387,6 +406,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, + &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, -- cgit From 238fa3623a5709d29673ed78ff8e714d040fbb89 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:23 +0200 Subject: net: set name assign type for renamed devices Based on a patch from David Herrmann. This is the only place devices can be renamed. v9: restore reverse-christmas-tree order of local variables Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Signed-off-by: David S.
Miller --- net/core/dev.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6e2a2cd82321..38793fb84a35 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1082,6 +1082,7 @@ static int dev_get_valid_name(struct net *net, */ int dev_change_name(struct net_device *dev, const char *newname) { + unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; @@ -1109,10 +1110,14 @@ int dev_change_name(struct net_device *dev, const char *newname) return err; } + old_assign_type = dev->name_assign_type; + dev->name_assign_type = NET_NAME_RENAMED; + rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; write_seqcount_end(&devnet_rename_seq); return ret; } @@ -1141,6 +1146,8 @@ rollback: write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; + old_assign_type = NET_NAME_RENAMED; goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", -- cgit From c835a677331495cf137a7f8a023463afd9f032f8 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:24 +0200 Subject: net: set name_assign_type in alloc_netdev() Extend alloc_netdev{,_mq{,s}}() to take name_assign_type as argument, and convert all users to pass NET_NAME_UNKNOWN. Coccinelle patch: @@ expression sizeof_priv, name, setup, txqs, rxqs, count; @@ ( -alloc_netdev_mqs(sizeof_priv, name, setup, txqs, rxqs) +alloc_netdev_mqs(sizeof_priv, name, NET_NAME_UNKNOWN, setup, txqs, rxqs) | -alloc_netdev_mq(sizeof_priv, name, setup, count) +alloc_netdev_mq(sizeof_priv, name, NET_NAME_UNKNOWN, setup, count) | -alloc_netdev(sizeof_priv, name, setup) +alloc_netdev(sizeof_priv, name, NET_NAME_UNKNOWN, setup) ) v9: move comments here from the wrong commit Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Signed-off-by: David S. Miller --- net/core/dev.c | 13 ++++++++----- net/core/rtnetlink.c | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 38793fb84a35..2c98f10ee62a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6441,17 +6441,19 @@ void netdev_freemem(struct net_device *dev) /** * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. 
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { @@ -6530,6 +6532,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #endif strcpy(dev->name, name); + dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1f8a59e02c48..599864322de8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1828,8 +1828,8 @@ struct net_device *rtnl_create_link(struct net *net, num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, - num_tx_queues, num_rx_queues); + dev = alloc_netdev_mqs(ops->priv_size, ifname, NET_NAME_UNKNOWN, + ops->setup, num_tx_queues, num_rx_queues); if (!dev) goto err; -- cgit From 5517750f058edd111bcabe5e116056cc63b1f39c Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:25 +0200 Subject: net: rtnetlink - make create_link take name_assign_type This passes down NET_NAME_USER (or NET_NAME_ENUM) to alloc_netdev(), for any device created over rtnetlink. v9: restore reverse-christmas-tree order of local variables Signed-off-by: Tom Gundersen Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 599864322de8..e9918020dbc9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1810,7 +1810,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, - char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) + char *ifname, unsigned char name_assign_type, + const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; struct net_device *dev; @@ -1828,7 +1829,7 @@ struct net_device *rtnl_create_link(struct net *net, num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mqs(ops->priv_size, ifname, NET_NAME_UNKNOWN, + dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); if (!dev) goto err; @@ -1894,6 +1895,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct nlattr *linkinfo[IFLA_INFO_MAX+1]; + unsigned char name_assign_type = NET_NAME_USER; int err; #ifdef CONFIG_MODULES @@ -2046,14 +2048,16 @@ replay: if (!ops->setup) return -EOPNOTSUPP; - if (!ifname[0]) + if (!ifname[0]) { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } dest_net = rtnl_link_get_net(net, tb); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); - dev = rtnl_create_link(dest_net, ifname, ops, tb); + dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; -- cgit From aee944ddf893e14a148af0f4e857c11887660053 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 14 Jul 2014 18:30:56 +0200 Subject: pktgen: remove unnecessary break after goto Signed-off-by: Fabian Frederick Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5b05e364b8ae..8b849ddfef2e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -804,7 +804,6 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\t': case ' ': goto done_str; - break; default: break; } -- cgit From 4d3520cb529e520554906573e4be0efaa38050fe Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 14 Jul 2014 18:30:57 +0200 Subject: drop_monitor: remove unnecessary break after return Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index e70301eb7a4a..50f9a9db5792 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -289,10 +289,8 @@ static int net_dm_cmd_trace(struct sk_buff *skb, switch (info->genlhdr->cmd) { case NET_DM_CMD_START: return set_all_monitor_traces(TRACE_ON); - break; case NET_DM_CMD_STOP: return set_all_monitor_traces(TRACE_OFF); - break; } return -ENOTSUPP; -- cgit From a40e0a664bce465a3b8ad1d792153cef8ded9f7d Mon Sep 17 00:00:00 2001 From: françois romieu Date: Tue, 15 Jul 2014 23:55:35 +0200 Subject: net: remove open-coded skb_cow_head. Signed-off-by: Francois Romieu Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 138ab897de7d..239722af098d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2421,8 +2421,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, skb_warn_bad_offload(skb); - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_cow_head(skb, 0); + if (err < 0) return ERR_PTR(err); } -- cgit From c8a89c4a1d58230192cf7243520bf7f9899239f4 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 15 Jul 2014 15:15:20 -0700 Subject: rtnetlink: Drop unnecessary return value from ndo_dflt_fdb_del This change cleans up ndo_dflt_fdb_del to drop the ENOTSUPP return value since that isn't actually returned anywhere in the code. As a result we are able to drop a few lines by just defaulting this to -EINVAL. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e9918020dbc9..8d39071f32d7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2392,22 +2392,20 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, const unsigned char *addr) { - int err = -EOPNOTSUPP; + int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { pr_info("%s: FDB only supports static addresses\n", dev->name); - return -EINVAL; + return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); - else - err = -EINVAL; return err; } -- cgit From ccc7f4968a18b980994e622006b84e0195754390 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Thu, 17 Jul 2014 19:46:10 +0200 Subject: net: print net_device reg_state in netdev_* unless it's registered This way we'll always know in what status the device is, unless it's running normally (i.e. 
NETDEV_REGISTERED). Also, emit a warning once in case of a bad reg_state. CC: "David S. Miller" CC: Jason Baron CC: Eric Dumazet CC: Vlad Yasevich CC: stephen hemminger CC: Jerry Chu CC: Ben Hutchings CC: Joe Perches Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 239722af098d..81d61014fd9b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6950,12 +6950,14 @@ static int __netdev_printk(const char *level, const struct net_device *dev, if (dev && dev->dev.parent) { r = dev_printk_emit(level[1] - '0', dev->dev.parent, - "%s %s %s: %pV", + "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), - netdev_name(dev), vaf); + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + r = printk("%s%s%s: %pV", level, netdev_name(dev), + netdev_reg_state(dev), vaf); } else { r = printk("%s(NULL net_device): %pV", level, vaf); } -- cgit From 6fe82a39e583a50f28f03b294df79c9de9ec0de4 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Thu, 17 Jul 2014 20:33:32 +0200 Subject: net: print a notification on device rename Currently it's done silently (from the kernel side), and thus it might be hard to track the renames from the logs. Add a simple netdev_info() to report the rename, but only in case the previous name was valid. CC: "David S. Miller" CC: Eric Dumazet CC: Vlad Yasevich CC: stephen hemminger CC: Jerry Chu CC: Ben Hutchings CC: David Laight Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 81d61014fd9b..e52a3788d18d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1113,6 +1113,9 @@ int dev_change_name(struct net_device *dev, const char *newname) return err; } + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s\n", oldname); + old_assign_type = dev->name_assign_type; dev->name_assign_type = NET_NAME_RENAMED; -- cgit From 274f482d33a309c87096f2983601ceda2761094e Mon Sep 17 00:00:00 2001 From: Sorin Dumitru Date: Tue, 22 Jul 2014 21:16:51 +0300 Subject: sock: remove skb argument from sk_rcvqueues_full It hasn't been used since commit 0fd7bac ("net: relax rcvbuf limits"). Signed-off-by: Sorin Dumitru Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 026e01f70274..ca9b65199d28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -491,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) skb->dev = NULL; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } -- cgit From f5bffecda951b59d0d3cdd616d68952abc52bc40 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 22 Jul 2014 23:01:58 -0700 Subject: net: filter: split filter.c into two files BPF is used in several kernel components. This split creates a logical boundary between the generic eBPF core and the rest: kernel/bpf/core.c: eBPF interpreter; net/core/filter.c: classic->eBPF converter, classic verifiers, socket filters. This patch only moves functions. Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- net/core/filter.c | 511 ------------------------------------------------------ 1 file changed, 511 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index b90ae7fb3b89..1d0e9492e4fa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -45,45 +45,6 @@ #include #include -/* Registers */ -#define BPF_R0 regs[BPF_REG_0] -#define BPF_R1 regs[BPF_REG_1] -#define BPF_R2 regs[BPF_REG_2] -#define BPF_R3 regs[BPF_REG_3] -#define BPF_R4 regs[BPF_REG_4] -#define BPF_R5 regs[BPF_REG_5] -#define BPF_R6 regs[BPF_REG_6] -#define BPF_R7 regs[BPF_REG_7] -#define BPF_R8 regs[BPF_REG_8] -#define BPF_R9 regs[BPF_REG_9] -#define BPF_R10 regs[BPF_REG_10] - -/* Named registers */ -#define DST regs[insn->dst_reg] -#define SRC regs[insn->src_reg] -#define FP regs[BPF_REG_FP] -#define ARG1 regs[BPF_REG_ARG1] -#define CTX regs[BPF_REG_CTX] -#define IMM insn->imm - -/* No hurry in this branch - * - * Exported for the bpf jit load helper. - */ -void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) -{ - u8 *ptr = NULL; - - if (k >= SKF_NET_OFF) - ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) - ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) - return ptr; - - return NULL; -} - /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff @@ -126,451 +87,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); -/* Base function for offset calculation. Needs to go into .text section, - * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. - */ -noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return 0; -} - -/** - * __sk_run_filter - run a filter on a given context - * @ctx: buffer to run the filter on - * @insn: filter to apply - * - * Decode and apply filter instructions to the skb->data. Return length to - * keep, 0 for none. @ctx is the data we are operating on, @insn is the - * array of filter instructions. - */ -static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) -{ - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG], tmp; - static const void *jumptable[256] = { - [0 ... 255] = &&default_label, - /* Now overwrite non-defaults ... 
*/ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = 
&&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - }; - void *ptr; - int off; - -#define CONT ({ insn++; goto select_insn; }) -#define CONT_JMP ({ insn++; goto select_insn; }) - - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - -select_insn: - goto *jumptable[insn->code]; - - /* ALU */ -#define ALU(OPCODE, OP) \ - ALU64_##OPCODE##_X: \ - DST = DST OP SRC; \ - CONT; \ - ALU_##OPCODE##_X: \ - DST = (u32) DST OP (u32) SRC; \ - CONT; \ - ALU64_##OPCODE##_K: \ - DST = DST OP IMM; \ - CONT; \ - ALU_##OPCODE##_K: \ - DST = (u32) DST OP (u32) IMM; \ - CONT; - - ALU(ADD, +) - ALU(SUB, -) - ALU(AND, &) - ALU(OR, |) - ALU(LSH, <<) - ALU(RSH, >>) - ALU(XOR, ^) - ALU(MUL, *) -#undef ALU - ALU_NEG: - DST = (u32) -DST; - CONT; - ALU64_NEG: - DST = -DST; - CONT; - ALU_MOV_X: - DST = (u32) SRC; - CONT; - ALU_MOV_K: - DST = (u32) IMM; - CONT; - ALU64_MOV_X: - DST = SRC; - CONT; - ALU64_MOV_K: - DST = IMM; - CONT; - ALU64_ARSH_X: - (*(s64 *) &DST) >>= SRC; - CONT; - ALU64_ARSH_K: - (*(s64 *) &DST) >>= IMM; - CONT; - ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; - tmp = DST; - DST = do_div(tmp, SRC); - CONT; - ALU_MOD_X: - if (unlikely(SRC == 0)) - return 0; - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); - CONT; - ALU64_MOD_K: - tmp = DST; - DST = do_div(tmp, IMM); - CONT; - ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); - CONT; - ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; - do_div(DST, SRC); - CONT; - ALU_DIV_X: - if (unlikely(SRC == 0)) - return 0; - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; - CONT; - ALU64_DIV_K: - do_div(DST, IMM); - CONT; - ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; - CONT; - ALU_END_TO_BE: - switch (IMM) { - case 16: - DST = (__force u16) cpu_to_be16(DST); - break; - case 32: - DST = (__force u32) cpu_to_be32(DST); - break; - case 64: - DST = (__force u64) cpu_to_be64(DST); - break; - } - CONT; - ALU_END_TO_LE: - switch (IMM) { - case 16: - DST = (__force u16) cpu_to_le16(DST); - break; - case 32: - DST = (__force u32) cpu_to_le32(DST); - break; - case 64: - DST = (__force u64) cpu_to_le64(DST); - break; - } - CONT; - - /* CALL */ - JMP_CALL: - /* Function call scratches BPF_R1-BPF_R5 registers, - * preserves BPF_R6-BPF_R9, and stores return value - * into BPF_R0. 
- */ - BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, - BPF_R4, BPF_R5); - CONT; - - /* JMP */ - JMP_JA: - insn += insn->off; - CONT; - JMP_JEQ_X: - if (DST == SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JEQ_K: - if (DST == IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_X: - if (DST != SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_K: - if (DST != IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_X: - if (DST > SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_K: - if (DST > IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_X: - if (DST >= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_K: - if (DST >= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_X: - if (((s64) DST) > ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_K: - if (((s64) DST) > ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_X: - if (((s64) DST) >= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_K: - if (((s64) DST) >= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_X: - if (DST & SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_K: - if (DST & IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_EXIT: - return BPF_R0; - - /* STX and ST and LDX*/ -#define LDST(SIZEOP, SIZE) \ - STX_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ - CONT; \ - ST_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ - CONT; \ - LDX_MEM_##SIZEOP: \ - DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ - CONT; - - LDST(B, u8) - LDST(H, u16) - LDST(W, u32) - LDST(DW, u64) -#undef LDST - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); - CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are - * only appearing in the programs where ctx == - * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, sk_convert_filter() saves it in BPF_R6, - * internal BPF verifier will check that BPF_R6 == - * ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. 
- * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; - - default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); - return 0; -} - /* Helper to find the offset of pkt_type in sk_buff structure. We want * to make sure its still a 3bit field starting at a byte boundary; * taken from arch/x86/net/bpf_jit_comp.c. @@ -1455,33 +971,6 @@ out_err: return ERR_PTR(err); } -void __weak bpf_int_jit_compile(struct sk_filter *prog) -{ -} - -/** - * sk_filter_select_runtime - select execution runtime for BPF program - * @fp: sk_filter populated with internal BPF program - * - * try to JIT internal BPF program, if JIT is not available select interpreter - * BPF program will be executed via SK_RUN_FILTER() macro - */ -void sk_filter_select_runtime(struct sk_filter *fp) -{ - fp->bpf_func = (void *) __sk_run_filter; - - /* Probe if internal BPF can be JITed */ - bpf_int_jit_compile(fp); -} -EXPORT_SYMBOL_GPL(sk_filter_select_runtime); - -/* free internal BPF program */ -void sk_filter_free(struct sk_filter *fp) -{ - bpf_jit_free(fp); -} -EXPORT_SYMBOL_GPL(sk_filter_free); - static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, struct sock *sk) { -- cgit From 2695fb552cbef1029aa025a98acb80cc51d66de5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 24 Jul 2014 16:38:21 -0700 Subject: net: filter: rename 'struct sock_filter_int' into 'struct bpf_insn' eBPF is used by socket filtering, seccomp and soon by tracing and exposed to userspace, therefore 'sock_filter_int' name is not accurate. Rename it to 'bpf_insn' Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/core/filter.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 1d0e9492e4fa..f3b2d5e9fe5f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -174,9 +174,9 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) } static bool convert_bpf_extensions(struct sock_filter *fp, - struct sock_filter_int **insnp) + struct bpf_insn **insnp) { - struct sock_filter_int *insn = *insnp; + struct bpf_insn *insn = *insnp; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -326,7 +326,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); + * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * sk_convert_filter(old_prog, old_len, new_prog, &new_len); * * User BPF's register A is mapped to our BPF register 6, user BPF @@ -336,10 +336,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * ctx == 'struct seccomp_data *'. */ int sk_convert_filter(struct sock_filter *prog, int len, - struct sock_filter_int *new_prog, int *new_len) + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; - struct sock_filter_int *new_insn; + struct bpf_insn *new_insn; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; @@ -365,8 +365,8 @@ do_pass: new_insn++; for (i = 0; i < len; fp++, i++) { - struct sock_filter_int tmp_insns[6] = { }; - struct sock_filter_int *insn = tmp_insns; + struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - new_prog; @@ -913,7 +913,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, * representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != - sizeof(struct sock_filter_int)); + sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd @@ -945,7 +945,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, fp->len = new_len; - /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ + /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); if (err) /* 2nd sk_convert_filter() can fail only if it fails -- cgit From 6b53dafe23fd1f1228c7dd9b8a1323e757966160 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 23 Jul 2014 16:09:10 -0700 Subject: net: do not name the pointer to struct net_device net "net" is normally reserved for a struct net *; a pointer to struct net_device should be named either "dev" or "ndev", etc. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- net/core/net-sysfs.c | 142 +++++++++++++++++++++++++-------------------------- 1 file changed, 71 insertions(+), 71 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7752f2ad49a5..9dd06699b09c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -43,12 +43,12 @@ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = (*format)(net, buf); + if (dev_isalive(ndev)) + ret = (*format)(ndev, buf); read_unlock(&dev_base_lock); return ret; @@ -56,9 +56,9 @@ static ssize_t netdev_show(const struct device *dev, /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ -static ssize_t format_##field(const struct net_device *net, char *buf) \ +static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sprintf(buf, format_string, net->field); \ + return sprintf(buf, format_string, dev->field); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -112,19 +112,19 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); -static ssize_t format_name_assign_type(const struct net_device *net, char *buf) +static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sprintf(buf, fmt_dec, net->name_assign_type); + return sprintf(buf, fmt_dec, dev->name_assign_type); } static ssize_t name_assign_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (net->name_assign_type != NET_NAME_UNKNOWN) + if (ndev->name_assign_type != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; @@ -135,12 +135,12 @@ static DEVICE_ATTR_RO(name_assign_type); static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); + if (dev_isalive(ndev)) + ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); read_unlock(&dev_base_lock); return ret; } @@ -149,18 +149,18 @@ static DEVICE_ATTR_RO(address); static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); - if (dev_isalive(net)) - return sysfs_format_mac(buf, net->broadcast, net->addr_len); + struct net_device *ndev = to_net_dev(dev); + if (dev_isalive(ndev)) + return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); return -EINVAL; } static DEVICE_ATTR_RO(broadcast); -static int change_carrier(struct net_device *net, unsigned long new_carrier) +static int change_carrier(struct net_device *dev, unsigned long new_carrier) { - if (!netif_running(net)) + if (!netif_running(dev)) return -EINVAL; - return dev_change_carrier(net, (bool) new_carrier); + return dev_change_carrier(dev, (bool) new_carrier); } static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, @@ -284,9 +284,9 @@ static DEVICE_ATTR_RO(carrier_changes); /* read-write 
attributes */ -static int change_mtu(struct net_device *net, unsigned long new_mtu) +static int change_mtu(struct net_device *dev, unsigned long new_mtu) { - return dev_set_mtu(net, (int) new_mtu); + return dev_set_mtu(dev, (int) new_mtu); } static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, @@ -296,9 +296,9 @@ static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(mtu, fmt_dec); -static int change_flags(struct net_device *net, unsigned long new_flags) +static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(net, (unsigned int) new_flags); + return dev_change_flags(dev, (unsigned int) new_flags); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, @@ -308,9 +308,9 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(flags, fmt_hex); -static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - net->tx_queue_len = new_len; + dev->tx_queue_len = new_len; return 0; } @@ -363,9 +363,9 @@ static ssize_t ifalias_show(struct device *dev, } static DEVICE_ATTR_RW(ifalias); -static int change_group(struct net_device *net, unsigned long new_group) +static int change_group(struct net_device *dev, unsigned long new_group) { - dev_set_group(net, (int) new_group); + dev_set_group(dev, (int) new_group); return 0; } @@ -796,20 +796,20 @@ static struct kobj_type rx_queue_ktype = { .namespace = rx_queue_namespace }; -static int rx_queue_add_kobject(struct net_device *net, int index) +static int rx_queue_add_kobject(struct net_device *dev, int index) { - struct netdev_rx_queue *queue = net->_rx + index; + struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) goto exit; - if (net->sysfs_rx_queue_group) { - error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); + if (dev->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) goto exit; } @@ -825,18 +825,18 @@ exit: #endif /* CONFIG_SYSFS */ int -net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; #ifndef CONFIG_RPS - if (!net->sysfs_rx_queue_group) + if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = old_num; i < new_num; i++) { - error = rx_queue_add_kobject(net, i); + error = rx_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -844,10 +844,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - if (net->sysfs_rx_queue_group) - sysfs_remove_group(&net->_rx[i].kobj, - net->sysfs_rx_queue_group); - kobject_put(&net->_rx[i].kobj); + if (dev->sysfs_rx_queue_group) + sysfs_remove_group(&dev->_rx[i].kobj, + dev->sysfs_rx_queue_group); + kobject_put(&dev->_rx[i].kobj); } return error; @@ -1155,13 +1155,13 @@ static struct kobj_type netdev_queue_ktype = { .namespace = netdev_queue_namespace, }; -static int netdev_queue_add_kobject(struct net_device *net, int index) +static int netdev_queue_add_kobject(struct net_device *dev, int index) { - struct netdev_queue *queue = net->_tx + index; + struct netdev_queue 
*queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) @@ -1184,14 +1184,14 @@ exit: #endif /* CONFIG_SYSFS */ int -netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; for (i = old_num; i < new_num; i++) { - error = netdev_queue_add_kobject(net, i); + error = netdev_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -1199,7 +1199,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - struct netdev_queue *queue = net->_tx + i; + struct netdev_queue *queue = dev->_tx + i; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1213,25 +1213,25 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) #endif /* CONFIG_SYSFS */ } -static int register_queue_kobjects(struct net_device *net) +static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - net->queues_kset = kset_create_and_add("queues", - NULL, &net->dev.kobj); - if (!net->queues_kset) + dev->queues_kset = kset_create_and_add("queues", + NULL, &dev->dev.kobj); + if (!dev->queues_kset) return -ENOMEM; - real_rx = net->real_num_rx_queues; + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; - error = net_rx_queue_update_kobjects(net, 0, real_rx); + error = net_rx_queue_update_kobjects(dev, 0, real_rx); if (error) goto error; rxq = real_rx; - error = netdev_queue_update_kobjects(net, 0, real_tx); + error = netdev_queue_update_kobjects(dev, 0, real_tx); if (error) goto error; txq = real_tx; @@ -1239,24 +1239,24 @@ static int register_queue_kobjects(struct net_device *net) return 0; error: - netdev_queue_update_kobjects(net, txq, 0); - net_rx_queue_update_kobjects(net, rxq, 0); + netdev_queue_update_kobjects(dev, txq, 0); + net_rx_queue_update_kobjects(dev, rxq, 0); return error; } -static void remove_queue_kobjects(struct net_device *net) +static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - real_rx = net->real_num_rx_queues; + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; - net_rx_queue_update_kobjects(net, real_rx, 0); - netdev_queue_update_kobjects(net, real_tx, 0); + net_rx_queue_update_kobjects(dev, real_rx, 0); + netdev_queue_update_kobjects(dev, real_tx, 0); #ifdef CONFIG_SYSFS - kset_unregister(net->queues_kset); + kset_unregister(dev->queues_kset); #endif } @@ -1349,13 +1349,13 @@ static struct class net_class = { /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ -void netdev_unregister_kobject(struct net_device * net) +void netdev_unregister_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); + struct device *dev = &(ndev->dev); kobject_get(&dev->kobj); - remove_queue_kobjects(net); + remove_queue_kobjects(ndev); pm_runtime_set_memalloc_noio(dev, false); @@ -1363,18 +1363,18 @@ void netdev_unregister_kobject(struct net_device * net) } /* Create sysfs entries for network device. 
*/ -int netdev_register_kobject(struct net_device *net) +int netdev_register_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); - const struct attribute_group **groups = net->sysfs_groups; + struct device *dev = &(ndev->dev); + const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; device_initialize(dev); dev->class = &net_class; - dev->platform_data = net; + dev->platform_data = ndev; dev->groups = groups; - dev_set_name(dev, "%s", net->name); + dev_set_name(dev, "%s", ndev->name); #ifdef CONFIG_SYSFS /* Allow for a device specific group */ @@ -1384,10 +1384,10 @@ int netdev_register_kobject(struct net_device *net) *groups++ = &netstat_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) - if (net->ieee80211_ptr) + if (ndev->ieee80211_ptr) *groups++ = &wireless_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) - else if (net->wireless_handlers) + else if (ndev->wireless_handlers) *groups++ = &wireless_group; #endif #endif @@ -1397,7 +1397,7 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; - error = register_queue_kobjects(net); + error = register_queue_kobjects(ndev); if (error) { device_del(dev); return error; -- cgit From 80019d310f9fb4f8c9eeda0a5d76144ad3132fdf Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 30 Jul 2014 02:31:08 +0200 Subject: net: Remove unlikely() for WARN_ON() conditions No need for the unlikely(): WARN_ON() and BUG_ON() internally use unlikely() on the condition.
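For context, the generic fallback definition (sketched from include/asm-generic/bug.h of this era; the exact expansion varies by architecture and config) already wraps the test in unlikely(), so an outer unlikely(WARN_ON(...)) is redundant:

	#define WARN_ON(condition) ({					\
		int __ret_warn_on = !!(condition);			\
		if (unlikely(__ret_warn_on))				\
			__WARN();					\
		unlikely(__ret_warn_on);				\
	})

Signed-off-by: Thomas Graf Signed-off-by: David S.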
Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e1b7cfaccd65..b370230fe1d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2326,7 +2326,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) */ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { if (vlan_depth) { - if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) + if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; vlan_depth -= VLAN_HLEN; } else { -- cgit From 34c5bd66e5ed2268bcb917b4cbdd6317023eada4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 29 Jul 2014 17:36:28 +0200 Subject: net: filter: don't release unattached filter through call_rcu() sk_unattached_filter_destroy() does not always need to release the filter object via rcu. Since this filter is never attached to the socket, the caller should be responsible for releasing the filter in a safe way, which may not necessarily imply rcu. This is a short summary of clients of this function: 1) xt_bpf.c and cls_bpf.c use the bpf matchers from rules; these rules are removed from the packet path before the filter is released. Thus, the framework makes sure the filter is safely removed. 2) In the ppp driver, the ppp_lock ensures serialization between the xmit and filter attachment/detachment path. This doesn't use rcu so deferred release via rcu makes no sense. 3) In the isdn/ppp driver, it is called from isdn_ppp_release() and isdn_ppp_ioctl(). This driver uses mutexes and spinlocks, no rcu. Thus, deferred rcu makes no sense to me either; the deferred releases may just be masking the effects of a wrong locking strategy, which should be fixed in the driver itself. 4) In the team driver, this is the only place where the rcu synchronization with unattached filter is used. Therefore, this patch introduces synchronize_rcu() which is called from the genetlink path to make sure the filter doesn't go away while packets are still walking over it. I think we can revisit this once struct bpf_prog (that only wraps specific bpf code bits) is in place, then add some specific struct rcu_head in the scope of the team driver if Jiri thinks this is needed. Deferred rcu release for unattached filters was originally introduced in 302d663 ("filter: Allow to create sk-unattached filters").
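The resulting teardown pattern for case 4 then looks roughly like the sketch below (the priv->fp pointer and priv->lock are hypothetical stand-ins, not lifted verbatim from the team driver):

	old_fp = rcu_dereference_protected(priv->fp,
					   lockdep_is_held(&priv->lock));
	rcu_assign_pointer(priv->fp, new_fp);
	synchronize_rcu();	/* no reader can still be running old_fp */
	if (old_fp)
		sk_unattached_filter_destroy(old_fp);

Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S.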
Miller --- net/core/filter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f3b2d5e9fe5f..42c1944b0c63 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -841,6 +841,12 @@ static void sk_release_orig_filter(struct sk_filter *fp) } } +static void __sk_filter_release(struct sk_filter *fp) +{ + sk_release_orig_filter(fp); + sk_filter_free(fp); +} + /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free @@ -849,8 +855,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - sk_release_orig_filter(fp); - sk_filter_free(fp); + __sk_filter_release(fp); } /** @@ -1050,7 +1055,7 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_create); void sk_unattached_filter_destroy(struct sk_filter *fp) { - sk_filter_release(fp); + __sk_filter_release(fp); } EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); -- cgit From fcdfe3a7fa4cb74391d42b6a26dc07c20dab1d82 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Thu, 31 Jul 2014 10:33:06 -0400 Subject: net: Correctly set segment mac_len in skb_segment(). When performing segmentation, the mac_len value is copied right out of the original skb. However, this value is not always set correctly (like when the packet is VLAN-tagged) and we'll end up copying a bad value. One way to demonstrate this is to configure a VM which tags packets internally and turn off VLAN acceleration on the forwarding bridge port. The packets show up corrupt like this: 16:18:24.985548 52:54:00:ab:be:25 > 52:54:00:26:ce:a3, ethertype 802.1Q (0x8100), length 1518: vlan 100, p 0, ethertype 0x05e0, 0x0000: 8cdb 1c7c 8cdb 0064 4006 b59d 0a00 6402 ...|...d@.....d. 0x0010: 0a00 6401 9e0d b441 0a5e 64ec 0330 14fa ..d....A.^d..0.. 0x0020: 29e3 01c9 f871 0000 0101 080a 000a e833)....q.........3 0x0030: 000f 8c75 6e65 7470 6572 6600 6e65 7470 ...unetperf.netp 0x0040: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp 0x0050: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp 0x0060: 6572 6600 6e65 7470 6572 6600 6e65 7470 erf.netperf.netp ... This also leads to awful throughput as GSO packets are dropped and cause retransmissions. The solution is to set the mac_len using the values already available in the new skb. We've already adjusted all of the header offsets, so we might as well correctly figure out the mac_len using skb_reset_mac_len(). After this change, packets are segmented correctly and performance is restored. CC: Eric Dumazet
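For reference, skb_reset_mac_len() (include/linux/skbuff.h) simply rederives the length from the offsets that skb_headers_offset_update() has just fixed up:

	static inline void skb_reset_mac_len(struct sk_buff *skb)
	{
		skb->mac_len = skb->network_header - skb->mac_header;
	}

Signed-off-by: Vlad Yasevich Signed-off-by: David S.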
Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a33033cbe2..58ff88edbefd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2976,9 +2976,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, tail = nskb; __copy_skb_header(nskb, head_skb); - nskb->mac_len = head_skb->mac_len; skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); + skb_reset_mac_len(nskb); skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, -- cgit From 278571baca2aecf5fb5cb5c8b002dbfa0a6c524c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:12 -0700 Subject: net: filter: simplify socket charging Attaching a bpf program to a socket involves several rounds of socket memory accounting, since the size of 'sk_filter' changes when classic BPF is converted to eBPF. Also, the common path of program creation has to deal with two ways of freeing the memory. Simplify the code by delaying socket charging until the program is ready and its size is known. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 87 +++++++++++++++++++++++-------------------------------- net/core/sock.c | 9 ++++-- 2 files changed, 44 insertions(+), 52 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 42c1944b0c63..5a6aeb1d40b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -872,41 +872,30 @@ static void sk_filter_release(struct sk_filter *fp) void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { - atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); - sk_filter_release(fp); -} + u32 filter_size = sk_filter_size(fp->len); -void sk_filter_charge(struct sock *sk, struct sk_filter *fp) -{ - atomic_inc(&fp->refcnt); - atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); + atomic_sub(filter_size, &sk->sk_omem_alloc); + sk_filter_release(fp); } -static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, - struct sock *sk, - unsigned int len) +/* try to charge the socket memory if there is space available * return true on success */ +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - struct sk_filter *fp_new; - - if (sk == NULL) - return krealloc(fp, len, GFP_KERNEL); - - fp_new = sock_kmalloc(sk, len, GFP_KERNEL); - if (fp_new) { - *fp_new = *fp; - /* As we're keeping orig_prog in fp_new along, - * we need to make sure we're not evicting it - * from the old fp. - */ - fp->orig_prog = NULL; - sk_filter_uncharge(sk, fp); + u32 filter_size = sk_filter_size(fp->len); + + /* same check as in sock_kmalloc() */ + if (filter_size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { + atomic_inc(&fp->refcnt); + atomic_add(filter_size, &sk->sk_omem_alloc); + return true; } - - return fp_new; + return false; } -static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, - struct sock *sk) +static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) { struct sock_filter *old_prog; struct sk_filter *old_fp; @@ -938,7 +927,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len)); + fp = krealloc(old_fp, sk_filter_size(new_len), GFP_KERNEL); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one.
@@ -956,7 +945,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, /* 2nd sk_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released - * by __sk_migrate_realloc(). + * by krealloc(). */ goto out_err_free; @@ -968,16 +957,11 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, out_err_free: kfree(old_prog); out_err: - /* Rollback filter setup. */ - if (sk != NULL) - sk_filter_uncharge(sk, fp); - else - kfree(fp); + __sk_filter_release(fp); return ERR_PTR(err); } -static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, - struct sock *sk) +static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) { int err; @@ -986,10 +970,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, err = sk_chk_filter(fp->insns, fp->len); if (err) { - if (sk != NULL) - sk_filter_uncharge(sk, fp); - else - kfree(fp); + __sk_filter_release(fp); return ERR_PTR(err); } @@ -1002,7 +983,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, * internal BPF translation for the optimized interpreter. */ if (!fp->jited) - fp = __sk_migrate_filter(fp, sk); + fp = __sk_migrate_filter(fp); return fp; } @@ -1041,10 +1022,10 @@ int sk_unattached_filter_create(struct sk_filter **pfp, */ fp->orig_prog = NULL; - /* __sk_prepare_filter() already takes care of uncharging + /* __sk_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = __sk_prepare_filter(fp, NULL); + fp = __sk_prepare_filter(fp); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -1083,31 +1064,37 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL); + fp = kmalloc(sk_fsize, GFP_KERNEL); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { - sock_kfree_s(sk, fp, sk_fsize); + kfree(fp); return -EFAULT; } - atomic_set(&fp->refcnt, 1); fp->len = fprog->len; err = sk_store_orig_filter(fp, fprog); if (err) { - sk_filter_uncharge(sk, fp); + kfree(fp); return -ENOMEM; } - /* __sk_prepare_filter() already takes care of uncharging + /* __sk_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
*/ - fp = __sk_prepare_filter(fp, sk); + fp = __sk_prepare_filter(fp); if (IS_ERR(fp)) return PTR_ERR(fp); + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + __sk_filter_release(fp); + return -ENOMEM; + } + old_fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); diff --git a/net/core/sock.c b/net/core/sock.c index 134291d73fcd..a741163568fa 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1474,6 +1474,7 @@ static void sk_update_clone(const struct sock *sk, struct sock *newsk) struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk; + bool is_charged = true; newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); if (newsk != NULL) { @@ -1518,9 +1519,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) - sk_filter_charge(newsk, filter); + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning + */ + is_charged = sk_filter_charge(newsk, filter); - if (unlikely(xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; -- cgit From 009937e78a45553a86d26654f192b2fd9ebe289d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:13 -0700 Subject: net: filter: rename sk_filter_proglen -> bpf_classic_proglen trivial rename to better match semantics of macro Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 8 ++++---- net/core/sock_diag.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 5a6aeb1d40b8..d6cb287e4f59 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -813,7 +813,7 @@ EXPORT_SYMBOL(sk_chk_filter); static int sk_store_orig_filter(struct sk_filter *fp, const struct sock_fprog *fprog) { - unsigned int fsize = sk_filter_proglen(fprog); + unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); @@ -1001,7 +1001,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) int sk_unattached_filter_create(struct sk_filter **pfp, struct sock_fprog_kern *fprog) { - unsigned int fsize = sk_filter_proglen(fprog); + unsigned int fsize = bpf_classic_proglen(fprog); struct sk_filter *fp; /* Make sure new filter is there and in the right amounts. 
*/ @@ -1053,7 +1053,7 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; - unsigned int fsize = sk_filter_proglen(fprog); + unsigned int fsize = bpf_classic_proglen(fprog); unsigned int sk_fsize = sk_filter_size(fprog->len); int err; @@ -1154,7 +1154,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; ret = -EFAULT; - if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog))) + if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a4216a4c9572..57d922320c59 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -69,7 +69,7 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, goto out; fprog = filter->orig_prog; - flen = sk_filter_proglen(fprog); + flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); if (attr == NULL) { -- cgit From 4df95ff488eb796aab9566652c250330179def17 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:14 -0700 Subject: net: filter: rename sk_chk_filter() -> bpf_check_classic() trivial rename to indicate that this function performs classic BPF checking Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index d6cb287e4f59..5740ea08a3ad 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,7 +18,7 @@ * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races. - * Kris Katterjohn - Added many additional checks in sk_chk_filter() + * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include @@ -721,7 +721,7 @@ static bool chk_code_allowed(u16 code_to_probe) } /** - * sk_chk_filter - verify socket filter code + * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(const struct sock_filter *filter, unsigned int flen) +int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; @@ -808,7 +808,7 @@ int sk_chk_filter(const struct sock_filter *filter, unsigned int flen) return -EINVAL; } -EXPORT_SYMBOL(sk_chk_filter); +EXPORT_SYMBOL(bpf_check_classic); static int sk_store_orig_filter(struct sk_filter *fp, const struct sock_fprog *fprog) @@ -968,7 +968,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) fp->bpf_func = NULL; fp->jited = 0; - err = sk_chk_filter(fp->insns, fp->len); + err = bpf_check_classic(fp->insns, fp->len); if (err) { __sk_filter_release(fp); return ERR_PTR(err); -- cgit From 8fb575ca396bc31d9fa99c26336e2432b41d1bfc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:15 -0700 Subject: net: filter: rename sk_convert_filter() -> bpf_convert_filter() to indicate that this function is converting classic BPF into eBPF and not related to sockets Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- net/core/filter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 5740ea08a3ad..6ac901613bee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -312,7 +312,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, } /** - * sk_convert_filter - convert filter program + * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: buffer where converted program will be stored @@ -322,12 +322,12 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * Conversion workflow: * * 1) First pass for calculating the new program length: - * sk_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); - * sk_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); * * User BPF's register A is mapped to our BPF register 6, user BPF * register X is mapped to BPF register 7; frame pointer is always @@ -335,8 +335,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * for socket filters: ctx == 'struct sk_buff *', for seccomp: * ctx == 'struct seccomp_data *'. */ -int sk_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) +int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; struct bpf_insn *new_insn; @@ -921,7 +921,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) } /* 1st pass: calculate the new program length. */ - err = sk_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); if (err) goto out_err_free; @@ -940,9 +940,9 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); if (err) - /* 2nd sk_convert_filter() can fail only if it fails + /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). 
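The hunks above keep bpf_convert_filter()'s documented two-pass contract intact: a first call with a NULL buffer computes the required eBPF length, then the caller allocates and calls again to do the actual remapping. A minimal sketch of a caller following that contract (the helper name is invented here, and error unwinding is trimmed):

#include <linux/filter.h>
#include <linux/slab.h>

static struct bpf_insn *convert_classic(struct sock_filter *old_prog, int old_len)
{
	struct bpf_insn *new_prog;
	int new_len = 0;

	/* 1st pass: only computes the required eBPF program length */
	if (bpf_convert_filter(old_prog, old_len, NULL, &new_len))
		return NULL;

	new_prog = kmalloc(sizeof(struct bpf_insn) * new_len, GFP_KERNEL);
	if (!new_prog)
		return NULL;

	/* 2nd pass: remaps instructions and recomputes jump offsets */
	if (bpf_convert_filter(old_prog, old_len, new_prog, &new_len)) {
		kfree(new_prog);
		return NULL;
	}
	return new_prog;
}

This is the same sequence that the migrate path in filter.c performs internally, except that it expands the existing program in place with krealloc().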
-- cgit From 7ae457c1e5b45a1b826fad9d62b32191d2bdcfdb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jul 2014 20:34:16 -0700 Subject: net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets, which cleans up the 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains a __bpf_prog_release(struct bpf_prog *) helper function. Also perform related renames for the functions that work with 'struct bpf_prog *', since they're along the same lines: sk_filter_size -> bpf_prog_size sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller
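Before the diff, here is a rough end-to-end sketch of the renamed 'unattached' API, mirroring what the ptp_classifier hunk further down does; the demo_* identifiers are invented for illustration:

#include <linux/filter.h>
#include <linux/kernel.h>

static struct bpf_prog *demo_prog;

static int demo_filter_init(void)
{
	static struct sock_filter demo_insns[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	/* accept every packet */
	};
	struct sock_fprog_kern demo_fprog = {
		.len = ARRAY_SIZE(demo_insns),
		.filter = demo_insns,
	};

	/* checks, converts to eBPF if not JITed, and selects a runtime */
	return bpf_prog_create(&demo_prog, &demo_fprog);
}

/* per packet:  unsigned int verdict = BPF_PROG_RUN(demo_prog, skb);
 * on teardown: bpf_prog_destroy(demo_prog);
 */

SK_RUN_FILTER remains the entry point for the socket-attached case; it operates on struct sk_filter and dereferences its prog member internally.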
--- net/core/filter.c | 92 ++++++++++++++++++++++++++--------------------- net/core/ptp_classifier.c | 6 ++-- net/core/sock_diag.c | 2 +- 3 files changed, 56 insertions(+), 44 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6ac901613bee..d814b8a89d0f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -810,8 +810,8 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) } EXPORT_SYMBOL(bpf_check_classic); -static int sk_store_orig_filter(struct sk_filter *fp, - const struct sock_fprog *fprog) +static int bpf_prog_store_orig_filter(struct bpf_prog *fp, + const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; @@ -831,7 +831,7 @@ static int sk_store_orig_filter(struct sk_filter *fp, return 0; } -static void sk_release_orig_filter(struct sk_filter *fp) +static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; @@ -841,10 +841,16 @@ static void sk_release_orig_filter(struct sk_filter *fp) } } +static void __bpf_prog_release(struct bpf_prog *prog) +{ + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + static void __sk_filter_release(struct sk_filter *fp) { - sk_release_orig_filter(fp); - sk_filter_free(fp); + __bpf_prog_release(fp->prog); + kfree(fp); } /** @@ -872,7 +878,7 @@ static void sk_filter_release(struct sk_filter *fp) void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { - u32 filter_size = sk_filter_size(fp->len); + u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); @@ -883,7 +889,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) */ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - u32 filter_size = sk_filter_size(fp->len); + u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && @@ -895,10 +901,10 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) return false; } -static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) +static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; - struct sk_filter *old_fp; + struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; /* We are free to overwrite insns et al right here as it @@ -927,7 +933,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, sk_filter_size(new_len), GFP_KERNEL); + fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one.
@@ -949,7 +955,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) */ goto out_err_free; - sk_filter_select_runtime(fp); + bpf_prog_select_runtime(fp); kfree(old_prog); return fp; @@ -957,11 +963,11 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp) out_err_free: kfree(old_prog); out_err: - __sk_filter_release(fp); + __bpf_prog_release(fp); return ERR_PTR(err); } -static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) { int err; @@ -970,7 +976,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) err = bpf_check_classic(fp->insns, fp->len); if (err) { - __sk_filter_release(fp); + __bpf_prog_release(fp); return ERR_PTR(err); } @@ -983,13 +989,13 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) * internal BPF translation for the optimized interpreter. */ if (!fp->jited) - fp = __sk_migrate_filter(fp); + fp = bpf_migrate_filter(fp); return fp; } /** - * sk_unattached_filter_create - create an unattached filter + * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * @@ -998,23 +1004,21 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp) * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ -int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog_kern *fprog) +int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); - struct sk_filter *fp; + struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); + fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); - atomic_set(&fp->refcnt, 1); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold @@ -1022,23 +1026,23 @@ int sk_unattached_filter_create(struct sk_filter **pfp, */ fp->orig_prog = NULL; - /* __sk_prepare_filter() already takes care of freeing + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
*/ - fp = __sk_prepare_filter(fp); + fp = bpf_prepare_filter(fp); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } -EXPORT_SYMBOL_GPL(sk_unattached_filter_create); +EXPORT_SYMBOL_GPL(bpf_prog_create); -void sk_unattached_filter_destroy(struct sk_filter *fp) +void bpf_prog_destroy(struct bpf_prog *fp) { - __sk_filter_release(fp); + __bpf_prog_release(fp); } -EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); +EXPORT_SYMBOL_GPL(bpf_prog_destroy); /** * sk_attach_filter - attach a socket filter @@ -1054,7 +1058,8 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; unsigned int fsize = bpf_classic_proglen(fprog); - unsigned int sk_fsize = sk_filter_size(fprog->len); + unsigned int bpf_fsize = bpf_prog_size(fprog->len); + struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) @@ -1064,29 +1069,36 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(sk_fsize, GFP_KERNEL); - if (!fp) + prog = kmalloc(bpf_fsize, GFP_KERNEL); + if (!prog) return -ENOMEM; - if (copy_from_user(fp->insns, fprog->filter, fsize)) { - kfree(fp); + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + kfree(prog); return -EFAULT; } - fp->len = fprog->len; + prog->len = fprog->len; - err = sk_store_orig_filter(fp, fprog); + err = bpf_prog_store_orig_filter(prog, fprog); if (err) { - kfree(fp); + kfree(prog); return -ENOMEM; } - /* __sk_prepare_filter() already takes care of freeing + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = __sk_prepare_filter(fp); - if (IS_ERR(fp)) - return PTR_ERR(fp); + prog = bpf_prepare_filter(prog); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + __bpf_prog_release(prog); + return -ENOMEM; + } + fp->prog = prog; atomic_set(&fp->refcnt, 0); @@ -1142,7 +1154,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, /* We're copying the filter that has been originally attached, * so no conversion/decode needed anymore. 
*/ - fprog = filter->orig_prog; + fprog = filter->prog->orig_prog; ret = fprog->len; if (!len) diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index 12ab7b4be609..4eab4a94a59d 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -107,11 +107,11 @@ #include #include -static struct sk_filter *ptp_insns __read_mostly; +static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return SK_RUN_FILTER(ptp_insns, skb); + return BPF_PROG_RUN(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); @@ -189,5 +189,5 @@ void __init ptp_classifier_init(void) .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, }; - BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog)); + BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog)); } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 57d922320c59..ad704c757bb4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -68,7 +68,7 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, if (!filter) goto out; - fprog = filter->orig_prog; + fprog = filter->prog->orig_prog; flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); -- cgit From f24b9be5957b38bb420b838115040dc2031b7d0c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:45 -0400 Subject: net-timestamp: extend SCM_TIMESTAMPING ancillary data struct Applications that request kernel tx timestamps with SO_TIMESTAMPING read timestamps as recvmsg() ancillary data. The response is defined implicitly as timespec[3]. 1) define struct scm_timestamping explicitly and 2) add support for new tstamp types. On tx, scm_timestamping always accompanies a sock_extended_err. Define previously unused field ee_info to signal the type of ts[0]. Introduce SCM_TSTAMP_SND to define the existing behavior. The reception path is not modified. On rx, no struct similar to sock_extended_err is passed along with SCM_TIMESTAMPING. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a33033cbe2..c9f68802e992 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3521,6 +3521,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = SCM_TSTAMP_SND; err = sock_queue_err_skb(sk, skb); -- cgit From b9f40e21ef4298650ab33e35740fa85bd57706d5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:46 -0400 Subject: net-timestamp: move timestamp flags out of sk_flags sk_flags is reaching its limit. New timestamping options will not fit. Move all of them into a new field sk->sk_tsflags. An added benefit is that this removes the boilerplate code to convert between SOF_TIMESTAMPING_.. and SOCK_TIMESTAMPING_.. in getsockopt/setsockopt. SOCK_TIMESTAMPING_RX_SOFTWARE is also used to toggle the receive timestamp logic (netstamp_needed). That can be simplified and this last key removed, but that is left for a separate patch. Signed-off-by: Willem de Bruijn ---- The u16 in sock can be moved into a 16-bit hole below sk_gso_max_segs, though that scatters tstamp fields throughout the struct. Signed-off-by: David S. Miller
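The effect on the option handlers is visible in the diff below: SO_TIMESTAMPING now stores and returns sk->sk_tsflags directly, so the flags a process sets are exactly the flags it reads back. A minimal userspace round-trip sketch (error handling trimmed; the helper name is invented):

#include <sys/socket.h>
#include <linux/net_tstamp.h>

int enable_tx_timestamps(int fd)
{
	int val = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	int out = 0;
	socklen_t len = sizeof(out);

	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)))
		return -1;
	if (getsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &out, &len))
		return -1;
	return out == val ? 0 : -1;	/* flags read back exactly as stored */
}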
--- net/core/sock.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index a741163568fa..47c9377e14b9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -848,22 +848,13 @@ set_rcvbuf: ret = -EINVAL; break; } - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, - val & SOF_TIMESTAMPING_TX_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, - val & SOF_TIMESTAMPING_TX_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, - val & SOF_TIMESTAMPING_RX_HARDWARE); + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, - val & SOF_TIMESTAMPING_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, - val & SOF_TIMESTAMPING_RAW_HARDWARE); break; case SO_RCVLOWAT: @@ -1089,19 +1080,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_TIMESTAMPING: - v.val = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_TX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + v.val = sk->sk_tsflags; break; case SO_RCVTIMEO: -- cgit From 09c2d251b70723650ba47e83571ff49281320f7c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:47 -0400 Subject: net-timestamp: add key to disambiguate concurrent datagrams Datagrams timestamped on transmission can coexist in the kernel stack and be reordered in packet scheduling. When reading looped datagrams from the socket error queue it is not always possible to uniquely correlate looped data with the original send() call (for application level retransmits). Even if possible, it may be expensive and complex, requiring packet inspection. Introduce a data-independent ID mechanism to associate timestamps with send calls. Pass an ID alongside the timestamp in field ee_data of sock_extended_err. The ID is a simple 32 bit unsigned int that is associated with the socket and incremented on each send() call for which software tx timestamp generation is enabled. The feature is enabled only if SOF_TIMESTAMPING_OPT_ID is set, to avoid changing ee_data for existing applications that expect it to be 0. The counter is reset each time the flag is reenabled. Reenabling does not change the ID of already submitted data. It is possible to receive out of order IDs if the timestamp stream is not quiesced first. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller
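One subtlety visible in the sock.c hunk below: sk_tskey is cleared only on an off-to-on transition of SOF_TIMESTAMPING_OPT_ID, so re-sending the same flags does not restart the numbering. A hedged userspace sketch of deliberately restarting the counter (hypothetical helper; return values ignored for brevity):

#include <sys/socket.h>
#include <linux/net_tstamp.h>

void restart_tskey(int fd, int flags)
{
	int off = flags & ~SOF_TIMESTAMPING_OPT_ID;
	int on = flags | SOF_TIMESTAMPING_OPT_ID;

	/* toggling the flag off and back on is what resets the counter */
	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &off, sizeof(off));
	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &on, sizeof(on));
	/* the next send() is tagged with ee_data == 0 again */
}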
--- net/core/skbuff.c | 2 ++ net/core/sock.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c9f68802e992..0df4f1d14c5a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3522,6 +3522,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SND; + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + serr->ee.ee_data = skb_shinfo(skb)->tskey; err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 47c9377e14b9..1e0f1c63ad6b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -848,6 +848,9 @@ set_rcvbuf: ret = -EINVAL; break; } + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) + sk->sk_tskey = 0; sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, -- cgit From e7fd2885385157d46c85f282fc6d7d297db43e1f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:48 -0400 Subject: net-timestamp: SCHED timestamp on entering packet scheduler Kernel transmit latency is often incurred in the packet scheduler. Introduce a new timestamp on transmission just before entering the scheduler. When data travels through multiple devices (bonding, tunneling, ...), each device will export an individual timestamp. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ net/core/skbuff.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b370230fe1d3..1c15b189c52b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2876,6 +2877,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_reset_mac_header(skb); + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU.
*/ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0df4f1d14c5a..9705c0732aab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3490,10 +3490,10 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); -void skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps) +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) { - struct sock *sk = orig_skb->sk; struct sock_exterr_skb *serr; struct sk_buff *skb; int err; @@ -3521,7 +3521,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - serr->ee.ee_info = SCM_TSTAMP_SND; + serr->ee.ee_info = tstype; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = skb_shinfo(skb)->tskey; @@ -3530,6 +3530,14 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, if (err) kfree_skb(skb); } +EXPORT_SYMBOL_GPL(__skb_tstamp_tx); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + SCM_TSTAMP_SND); +} EXPORT_SYMBOL_GPL(skb_tstamp_tx); void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) -- cgit From 4ed2d765dfaccff5ebdac68e2064b59125033a3b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:49 -0400 Subject: net-timestamp: TCP timestamping TCP timestamping extends SO_TIMESTAMPING to bytestreams. Bytestreams do not have a 1:1 relationship between send() buffers and network packets. The feature interprets a send call on a bytestream as a request for a timestamp for the last byte in that send() buffer. The choice corresponds to a request for a timestamp when all bytes in the buffer have been sent. That assumption depends on in-order kernel transmission. This is the common case. That said, it is possible to construct a traffic shaping tree that would result in reordering. The guarantee is strong, then, but not ironclad. This implementation supports send and sendpages (splice). GSO replaces one large packet with multiple smaller packets. This patch also copies the option into the correct smaller packet. This patch does not yet support timestamping on data in an initial TCP Fast Open SYN, because that takes a very different data path. If ID generation in ee_data is enabled, bytestream timestamps return a byte offset instead of the packet counter used for datagrams. The implementation supports a single timestamp per packet. It silently replaces requests for previous timestamps. To avoid missing tstamps, flush the tcp queue by disabling Nagle, cork and autocork. Missing tstamps can be detected by offset when the ee_data ID is enabled. Implementation details: - On GSO, the timestamping code can be included in the main loop. I moved it into its own loop to reduce the impact on the common case to a single branch. - To avoid leaking the absolute seqno to userspace, the offset returned in ee_data must always be relative. It is an offset between an skb and sk field. The first is always set (also for GSO & ACK). The second must also never be uninitialized. Only allow the ID option on sockets in the ESTABLISHED state, for which the seqno is available. Never reset it to zero (instead, move it to the current seqno when reenabling the option). Signed-off-by: Willem de Bruijn Signed-off-by: David S.
Miller --- net/core/skbuff.c | 5 ++++- net/core/sock.c | 13 +++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9705c0732aab..3dec0293a7c5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3522,8 +3522,11 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; + if (sk->sk_protocol == IPPROTO_TCP) + serr->ee.ee_data -= sk->sk_tskey; + } err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 1e0f1c63ad6b..2714811afbd8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -849,8 +849,17 @@ set_rcvbuf: break; } if (val & SOF_TIMESTAMPING_OPT_ID && - !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) - sk->sk_tskey = 0; + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_ESTABLISHED) { + ret = -EINVAL; + break; + } + sk->sk_tskey = tcp_sk(sk)->snd_una; + } else { + sk->sk_tskey = 0; + } + } sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, -- cgit
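Taken together, the series gives userspace a self-describing error-queue record: struct scm_timestamping carries the timestamps, while the accompanying sock_extended_err identifies what was stamped (ee_info) and which bytes it covers (ee_data). The sketch below is an illustrative composite of the patches above rather than code from any one of them; it assumes an already-connected IPv4 TCP socket, and that SOF_TIMESTAMPING_TX_SCHED is the uapi flag added alongside the SCHED patch (the diffs shown here are limited to net/core, so the uapi side is not visible). Error handling is trimmed:

#include <stdio.h>
#include <time.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

/* Enable after connect(): the kernel only accepts OPT_ID on an
 * ESTABLISHED TCP socket, seeding sk_tskey from snd_una so that
 * ee_data becomes a byte offset rather than a send() counter.
 */
static int tcp_tx_timestamps_on(int fd)
{
	int val = SOF_TIMESTAMPING_TX_SCHED |		/* SCM_TSTAMP_SCHED */
		  SOF_TIMESTAMPING_TX_SOFTWARE |	/* SCM_TSTAMP_SND */
		  SOF_TIMESTAMPING_SOFTWARE |		/* report sw stamps */
		  SOF_TIMESTAMPING_OPT_ID;		/* tag with offsets */

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}

/* Drain one looped-back timestamp from the error queue. */
static void read_one_tstamp(int fd)
{
	char ctrl[512];
	struct msghdr msg = {
		.msg_control = ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	struct scm_timestamping *tss = NULL;
	struct sock_extended_err *serr = NULL;
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_TIMESTAMPING)
			tss = (struct scm_timestamping *)CMSG_DATA(cm);
		else if (cm->cmsg_level == SOL_IP &&
			 cm->cmsg_type == IP_RECVERR)
			serr = (struct sock_extended_err *)CMSG_DATA(cm);
	}

	if (!tss || !serr || serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING)
		return;

	/* ts[0] is the software stamp; ee_data is the offset of the last
	 * byte of the timestamped send() within the bytestream.
	 */
	printf("%s at %lld.%09ld, byte offset %u\n",
	       serr->ee_info == SCM_TSTAMP_SCHED ? "sched" : "snd",
	       (long long)tss->ts[0].tv_sec, tss->ts[0].tv_nsec,
	       serr->ee_data);
}

As the TCP commit message advises, disabling Nagle, cork, and autocork keeps sends from being coalesced, so each send() reliably produces its own timestamped packet.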