Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile         |   3
-rw-r--r--  net/core/datagram.c       |  55
-rw-r--r--  net/core/dev.c            |  21
-rw-r--r--  net/core/dst.c            |   6
-rw-r--r--  net/core/fib_notifier.c   | 164
-rw-r--r--  net/core/fib_rules.c      |  69
-rw-r--r--  net/core/filter.c         | 280
-rw-r--r--  net/core/flow_dissector.c |  59
-rw-r--r--  net/core/lwtunnel.c       |  28
-rw-r--r--  net/core/neighbour.c      |  10
-rw-r--r--  net/core/net-sysfs.c      | 222
-rw-r--r--  net/core/net-traces.c     |   1
-rw-r--r--  net/core/net_namespace.c  |   5
-rw-r--r--  net/core/rtnetlink.c      | 249
-rw-r--r--  net/core/skbuff.c         | 364
-rw-r--r--  net/core/sock.c           |  47
16 files changed, 1197 insertions, 386 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index d501c4278015..56d771a887b6 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
- sock_diag.o dev_ioctl.o tso.o sock_reuseport.o
+ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
+ fib_notifier.o
obj-y += net-sysfs.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647bd91b3..2f3277945d35 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -573,27 +573,12 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-/**
- * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
- * @skb: buffer to copy
- * @from: the source to copy from
- *
- * The function will first copy up to headlen, and then pin the userspace
- * pages and build frags through them.
- *
- * Returns 0, -EFAULT or -EMSGSIZE.
- */
-int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
{
- int len = iov_iter_count(from);
- int copy = min_t(int, skb_headlen(skb), len);
- int frag = 0;
+ int frag = skb_shinfo(skb)->nr_frags;
- /* copy up to skb headlen */
- if (skb_copy_datagram_from_iter(skb, 0, from, copy))
- return -EFAULT;
-
- while (iov_iter_count(from)) {
+ while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
size_t start;
ssize_t copied;
@@ -603,18 +588,24 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
- copied = iov_iter_get_pages(from, pages, ~0U,
+ copied = iov_iter_get_pages(from, pages, length,
MAX_SKB_FRAGS - frag, &start);
if (copied < 0)
return -EFAULT;
iov_iter_advance(from, copied);
+ length -= copied;
truesize = PAGE_ALIGN(copied + start);
skb->data_len += copied;
skb->len += copied;
skb->truesize += truesize;
- refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk->sk_wmem_queued += truesize;
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
while (copied) {
int size = min_t(int, copied, PAGE_SIZE - start);
skb_fill_page_desc(skb, frag++, pages[n], start, size);
@@ -625,6 +616,28 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
}
return 0;
}
+EXPORT_SYMBOL(__zerocopy_sg_from_iter);
+
+/**
+ * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
+ * @skb: buffer to copy
+ * @from: the source to copy from
+ *
+ * The function will first copy up to headlen, and then pin the userspace
+ * pages and build frags through them.
+ *
+ * Returns 0, -EFAULT or -EMSGSIZE.
+ */
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+{
+ int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
+
+ /* copy up to skb headlen */
+ if (skb_copy_datagram_from_iter(skb, 0, from, copy))
+ return -EFAULT;
+
+ return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
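A minimal sketch of how the new __zerocopy_sg_from_iter() entry point could be used by a stream-socket caller; the wrapper function and its length handling are hypothetical, only the two helpers and the SOCK_STREAM accounting behaviour come from the hunk above:

/* Hypothetical stream-socket caller: copy the linear part as before, then
 * pin up to `frag_len` more bytes of user pages as page frags.  For
 * SOCK_STREAM sockets __zerocopy_sg_from_iter() charges truesize to
 * sk->sk_wmem_queued and sk_mem_charge() rather than to sk_wmem_alloc.
 */
static int example_skb_zerocopy_fill(struct sock *sk, struct sk_buff *skb,
				     struct iov_iter *from, size_t frag_len)
{
	int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));

	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
		return -EFAULT;

	return __zerocopy_sg_from_iter(sk, skb, from, frag_len);
}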
diff --git a/net/core/dev.c b/net/core/dev.c
index 8ea6b4b42611..40b28e417072 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1853,7 +1853,7 @@ static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
- if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
return -ENOMEM;
refcount_inc(&skb->users);
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
@@ -2731,8 +2731,7 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
if (tx_path)
- return skb->ip_summed != CHECKSUM_PARTIAL &&
- skb->ip_summed != CHECKSUM_NONE;
+ return skb->ip_summed != CHECKSUM_PARTIAL;
return skb->ip_summed == CHECKSUM_NONE;
}
@@ -3920,7 +3919,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* When doing generic XDP we have to bypass the qdisc layer and the
* network taps in order to match in-driver-XDP behavior.
*/
-static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -3941,13 +3940,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
kfree_skb(skb);
}
}
+EXPORT_SYMBOL_GPL(generic_xdp_tx);
static struct static_key generic_xdp_needed __read_mostly;
-static int do_xdp_generic(struct sk_buff *skb)
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
- struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog);
-
if (xdp_prog) {
u32 act = netif_receive_generic_xdp(skb, xdp_prog);
int err;
@@ -3972,6 +3970,7 @@ out_redir:
kfree_skb(skb);
return XDP_DROP;
}
+EXPORT_SYMBOL_GPL(do_xdp_generic);
static int netif_rx_internal(struct sk_buff *skb)
{
@@ -3982,7 +3981,8 @@ static int netif_rx_internal(struct sk_buff *skb)
trace_netif_rx(skb);
if (static_key_false(&generic_xdp_needed)) {
- int ret = do_xdp_generic(skb);
+ int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ skb);
/* Consider XDP consuming the packet a success from
* the netdev point of view we do not want to count
@@ -4412,7 +4412,7 @@ skip_classify:
}
if (pt_prev) {
- if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
else
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
@@ -4503,7 +4503,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
rcu_read_lock();
if (static_key_false(&generic_xdp_needed)) {
- int ret = do_xdp_generic(skb);
+ int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ skb);
if (ret != XDP_PASS) {
rcu_read_unlock();
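Since do_xdp_generic() and generic_xdp_tx() are now exported, a driver that injects packets outside the normal netif_rx()/netif_receive_skb() path can run the generic XDP hook itself. A rough sketch, assuming a hypothetical driver helper (only the exported functions and the rcu_dereference of dev->xdp_prog follow the code above):

/* Hypothetical helper: run the device's generic XDP program on an skb the
 * driver built itself.  On anything but XDP_PASS the skb has been consumed:
 * freed on XDP_DROP/XDP_ABORTED, sent out via generic_xdp_tx() on XDP_TX,
 * or forwarded on XDP_REDIRECT.
 */
static u32 example_run_generic_xdp(struct sk_buff *skb)
{
	struct bpf_prog *xdp_prog;
	u32 act = XDP_PASS;

	rcu_read_lock();
	xdp_prog = rcu_dereference(skb->dev->xdp_prog);
	if (xdp_prog)
		act = do_xdp_generic(xdp_prog, skb);
	rcu_read_unlock();

	return act;
}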
diff --git a/net/core/dst.c b/net/core/dst.c
index 00aa972ad1a1..d6ead757c258 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -55,7 +55,7 @@ const struct dst_metrics dst_default_metrics = {
* We really want to avoid false sharing on this variable, and catch
* any writes on it.
*/
- .refcnt = ATOMIC_INIT(1),
+ .refcnt = REFCOUNT_INIT(1),
};
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
@@ -213,7 +213,7 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
unsigned long prev, new;
- atomic_set(&p->refcnt, 1);
+ refcount_set(&p->refcnt, 1);
memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));
new = (unsigned long) p;
@@ -225,7 +225,7 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
if (prev & DST_METRICS_READ_ONLY)
p = NULL;
} else if (prev & DST_METRICS_REFCOUNTED) {
- if (atomic_dec_and_test(&old_p->refcnt))
+ if (refcount_dec_and_test(&old_p->refcnt))
kfree(old_p);
}
}
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
new file mode 100644
index 000000000000..292aab83702f
--- /dev/null
+++ b/net/core/fib_notifier.c
@@ -0,0 +1,164 @@
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/fib_notifier.h>
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
+
+int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return nb->notifier_call(nb, event_type, info);
+}
+EXPORT_SYMBOL(call_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return atomic_notifier_call_chain(&fib_chain, event_type, info);
+}
+EXPORT_SYMBOL(call_fib_notifiers);
+
+static unsigned int fib_seq_sum(void)
+{
+ struct fib_notifier_ops *ops;
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net) {
+ list_for_each_entry(ops, &net->fib_notifier_ops, list)
+ fib_seq += ops->fib_seq_read(net);
+ }
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static int fib_net_dump(struct net *net, struct notifier_block *nb)
+{
+ struct fib_notifier_ops *ops;
+
+ list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
+ int err = ops->fib_dump(net, nb);
+
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
+{
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+ int err;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ err = fib_net_dump(net, nb);
+ if (err)
+ goto err_fib_net_dump;
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
+
+err_fib_net_dump:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+static int __fib_notifier_ops_register(struct fib_notifier_ops *ops,
+ struct net *net)
+{
+ struct fib_notifier_ops *o;
+
+ list_for_each_entry(o, &net->fib_notifier_ops, list)
+ if (ops->family == o->family)
+ return -EEXIST;
+ list_add_tail_rcu(&ops->list, &net->fib_notifier_ops);
+ return 0;
+}
+
+struct fib_notifier_ops *
+fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net)
+{
+ struct fib_notifier_ops *ops;
+ int err;
+
+ ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return ERR_PTR(-ENOMEM);
+
+ err = __fib_notifier_ops_register(ops, net);
+ if (err)
+ goto err_register;
+
+ return ops;
+
+err_register:
+ kfree(ops);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(fib_notifier_ops_register);
+
+void fib_notifier_ops_unregister(struct fib_notifier_ops *ops)
+{
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+}
+EXPORT_SYMBOL(fib_notifier_ops_unregister);
+
+static int __net_init fib_notifier_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->fib_notifier_ops);
+ return 0;
+}
+
+static struct pernet_operations fib_notifier_net_ops = {
+ .init = fib_notifier_net_init,
+};
+
+static int __init fib_notifier_init(void)
+{
+ return register_pernet_subsys(&fib_notifier_net_ops);
+}
+
+subsys_initcall(fib_notifier_init);
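The new file gives each address family a per-netns hook point for FIB notifications. A rough sketch of how a family might plug in; all names below are illustrative, only the ops callbacks and the register/unregister calls are taken from the code above:

static unsigned int example_fib_seq_read(struct net *net)
{
	/* would return this family's FIB change sequence counter */
	return 0;
}

static int example_fib_dump(struct net *net, struct notifier_block *nb)
{
	/* would replay existing entries as notifications to @nb */
	return 0;
}

static const struct fib_notifier_ops example_fib_notifier_ops_template = {
	.family		= AF_INET,		/* hypothetical family */
	.fib_seq_read	= example_fib_seq_read,
	.fib_dump	= example_fib_dump,
};

static int __net_init example_fib_net_init(struct net *net)
{
	struct fib_notifier_ops *ops;

	ops = fib_notifier_ops_register(&example_fib_notifier_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);
	/* stash ops in per-net data; tear down later with
	 * fib_notifier_ops_unregister(ops)
	 */
	return 0;
}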
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index fdcb1bcd2afa..9a6d97c1d810 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -299,6 +299,67 @@ out:
}
EXPORT_SYMBOL_GPL(fib_rules_lookup);
+static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_rule *rule, int family)
+{
+ struct fib_rule_notifier_info info = {
+ .info.family = family,
+ .rule = rule,
+ };
+
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_fib_rule_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib_rule *rule,
+ struct fib_rules_ops *ops)
+{
+ struct fib_rule_notifier_info info = {
+ .info.family = ops->family,
+ .rule = rule,
+ };
+
+ ops->fib_rules_seq++;
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
+/* Called with rcu_read_lock() */
+int fib_rules_dump(struct net *net, struct notifier_block *nb, int family)
+{
+ struct fib_rules_ops *ops;
+ struct fib_rule *rule;
+
+ ops = lookup_rules_ops(net, family);
+ if (!ops)
+ return -EAFNOSUPPORT;
+ list_for_each_entry_rcu(rule, &ops->rules_list, list)
+ call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule,
+ family);
+ rules_ops_put(ops);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fib_rules_dump);
+
+unsigned int fib_rules_seq_read(struct net *net, int family)
+{
+ unsigned int fib_rules_seq;
+ struct fib_rules_ops *ops;
+
+ ASSERT_RTNL();
+
+ ops = lookup_rules_ops(net, family);
+ if (!ops)
+ return 0;
+ fib_rules_seq = ops->fib_rules_seq;
+ rules_ops_put(ops);
+
+ return fib_rules_seq;
+}
+EXPORT_SYMBOL_GPL(fib_rules_seq_read);
+
static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
struct fib_rules_ops *ops)
{
@@ -548,6 +609,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rule->tun_id)
ip_tunnel_need_metadata();
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops);
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
flush_route_cache(ops);
rules_ops_put(ops);
@@ -687,6 +749,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops);
notify_rule_change(RTM_DELRULE, rule, ops, nlh,
NETLINK_CB(skb).portid);
fib_rule_put(rule);
@@ -963,9 +1026,9 @@ static struct pernet_operations fib_rules_net_ops = {
static int __init fib_rules_init(void)
{
int err;
- rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0);
err = register_pernet_subsys(&fib_rules_net_ops);
if (err < 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index 7e9708653c6f..fa2115695037 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -514,14 +514,27 @@ do_pass:
break;
}
- /* Convert JEQ into JNE when 'jump_true' is next insn. */
- if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
- insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ /* Convert some jumps when 'jump_true' is next insn. */
+ if (fp->jt == 0) {
+ switch (BPF_OP(fp->code)) {
+ case BPF_JEQ:
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ break;
+ case BPF_JGT:
+ insn->code = BPF_JMP | BPF_JLE | bpf_src;
+ break;
+ case BPF_JGE:
+ insn->code = BPF_JMP | BPF_JLT | bpf_src;
+ break;
+ default:
+ goto jmp_rest;
+ }
+
target = i + fp->jf + 1;
BPF_EMIT_JMP;
break;
}
-
+jmp_rest:
/* Other jumps are mapped into two insns: Jxx and JA. */
target = i + fp->jt + 1;
insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
@@ -1845,6 +1858,45 @@ static const struct bpf_func_proto bpf_redirect_map_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+ if (unlikely(flags))
+ return SK_ABORTED;
+
+ ri->ifindex = key;
+ ri->flags = flags;
+ ri->map = map;
+
+ return SK_REDIRECT;
+}
+
+struct sock *do_sk_redirect_map(void)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct sock *sk = NULL;
+
+ if (ri->map) {
+ sk = __sock_map_lookup_elem(ri->map, ri->ifindex);
+
+ ri->ifindex = 0;
+ ri->map = NULL;
+ /* we do not clear flags for future lookup */
+ }
+
+ return sk;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+ .func = bpf_sk_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -2483,14 +2535,16 @@ int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_map *map = ri->map;
u32 index = ri->ifindex;
struct net_device *fwd;
- int err = -EINVAL;
+ int err;
ri->ifindex = 0;
ri->map = NULL;
fwd = __dev_map_lookup_elem(map, index);
- if (!fwd)
+ if (!fwd) {
+ err = -EINVAL;
goto out;
+ }
if (ri->map_to_flush && (ri->map_to_flush != map))
xdp_do_flush_map();
@@ -2500,7 +2554,7 @@ int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
ri->map_to_flush = map;
out:
- trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
+ trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err);
return err;
}
@@ -2509,21 +2563,24 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
struct net_device *fwd;
+ u32 index = ri->ifindex;
+ int err;
if (ri->map)
return xdp_do_redirect_map(dev, xdp, xdp_prog);
- fwd = dev_get_by_index_rcu(dev_net(dev), ri->ifindex);
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
ri->ifindex = 0;
- ri->map = NULL;
if (unlikely(!fwd)) {
- bpf_warn_invalid_xdp_redirect(ri->ifindex);
- return -EINVAL;
+ bpf_warn_invalid_xdp_redirect(index);
+ err = -EINVAL;
+ goto out;
}
- trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT);
-
- return __bpf_tx_xdp(fwd, NULL, xdp, 0);
+ err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
+out:
+ trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err);
+ return err;
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
@@ -2531,11 +2588,12 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
unsigned int len;
+ u32 index = ri->ifindex;
- dev = dev_get_by_index_rcu(dev_net(dev), ri->ifindex);
+ dev = dev_get_by_index_rcu(dev_net(dev), index);
ri->ifindex = 0;
if (unlikely(!dev)) {
- bpf_warn_invalid_xdp_redirect(ri->ifindex);
+ bpf_warn_invalid_xdp_redirect(index);
goto err;
}
@@ -3214,6 +3272,32 @@ static const struct bpf_func_proto *
switch (func_id) {
case BPF_FUNC_setsockopt:
return &bpf_setsockopt_proto;
+ case BPF_FUNC_sock_map_update:
+ return &bpf_sock_map_update_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_sk_redirect_map:
+ return &bpf_sk_redirect_map_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -3271,6 +3355,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (off + size > offsetofend(struct __sk_buff, cb[4]))
return false;
break;
+ case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
+ case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
if (size != size_default)
@@ -3299,6 +3387,7 @@ static bool sk_filter_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
@@ -3320,6 +3409,7 @@ static bool lwt_is_valid_access(int off, int size,
{
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
@@ -3370,8 +3460,8 @@ static bool sock_filter_is_valid_access(int off, int size,
return true;
}
-static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
- const struct bpf_prog *prog)
+static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog, int drop_verdict)
{
struct bpf_insn *insn = insn_buf;
@@ -3398,7 +3488,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
* return TC_ACT_SHOT;
*/
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
- *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
*insn++ = BPF_EXIT_INSN();
/* restore: */
@@ -3409,6 +3499,12 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
return insn - insn_buf;
}
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
+}
+
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
@@ -3433,6 +3529,8 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ return false;
}
return bpf_skb_is_valid_access(off, size, type, info);
@@ -3510,6 +3608,41 @@ static bool sock_ops_is_valid_access(int off, int size,
return __is_valid_sock_ops_access(off, size);
}
+static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
+}
+
+static bool sk_skb_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, tc_index):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ return false;
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, info);
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -3675,6 +3808,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(struct sk_buff, tc_index, 2,
target_size));
#else
+ *target_size = 2;
if (type == BPF_WRITE)
*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
else
@@ -3690,9 +3824,110 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
+ *target_size = 4;
*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
break;
+ case offsetof(struct __sk_buff, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_family,
+ 2, target_size));
+ break;
+ case offsetof(struct __sk_buff, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_daddr,
+ 4, target_size));
+ break;
+ case offsetof(struct __sk_buff, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_rcv_saddr,
+ 4, target_size));
+ break;
+ case offsetof(struct __sk_buff, remote_ip6[0]) ...
+ offsetof(struct __sk_buff, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, remote_ip6[0]);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ case offsetof(struct __sk_buff, local_ip6[0]) ...
+ offsetof(struct __sk_buff, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, local_ip6[0]);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_dport,
+ 2, target_size));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct __sk_buff, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct sock_common,
+ skc_num, 2, target_size));
+ break;
}
return insn - insn_buf;
@@ -3977,6 +4212,13 @@ const struct bpf_verifier_ops sock_ops_prog_ops = {
.convert_ctx_access = sock_ops_convert_ctx_access,
};
+const struct bpf_verifier_ops sk_skb_prog_ops = {
+ .get_func_proto = sk_skb_func_proto,
+ .is_valid_access = sk_skb_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+ .gen_prologue = sk_skb_prologue,
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
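For illustration only, a sketch of a BPF verdict program using the new __sk_buff socket fields and the bpf_sk_redirect_map() helper added above; the SEC()/map-definition conventions assume the sample-style bpf_helpers of this kernel tree, and the map, ports and section name are hypothetical:

struct bpf_map_def SEC("maps") sock_map = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 2,
};

SEC("sk_skb")
int example_verdict(struct __sk_buff *skb)
{
	if (skb->family != 2 /* AF_INET */)
		return SK_DROP;

	/* local_port is host byte order, remote_port network byte order */
	if (skb->local_port == 8080)
		return bpf_sk_redirect_map(&sock_map, 0, 0);

	return SK_DROP;
}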
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index fc5fc4594c90..e2eaa1ff948d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -4,6 +4,7 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
+#include <net/dsa.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/gre.h>
@@ -440,6 +441,19 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
skb->vlan_proto : skb->protocol;
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
+#if IS_ENABLED(CONFIG_NET_DSA)
+ if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) {
+ const struct dsa_device_ops *ops;
+ int offset;
+
+ ops = skb->dev->dsa_ptr->tag_ops;
+ if (ops->flow_dissect &&
+ !ops->flow_dissect(skb, &proto, &offset)) {
+ hlen -= offset;
+ nhoff += offset;
+ }
+ }
+#endif
}
/* It is ensured by skb_flow_dissector_init() that control key will
@@ -998,51 +1012,6 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
}
EXPORT_SYMBOL(skb_get_hash_perturb);
-__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
-{
- struct flow_keys keys;
-
- memset(&keys, 0, sizeof(keys));
-
- memcpy(&keys.addrs.v6addrs.src, &fl6->saddr,
- sizeof(keys.addrs.v6addrs.src));
- memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr,
- sizeof(keys.addrs.v6addrs.dst));
- keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
- keys.ports.src = fl6->fl6_sport;
- keys.ports.dst = fl6->fl6_dport;
- keys.keyid.keyid = fl6->fl6_gre_key;
- keys.tags.flow_label = (__force u32)fl6->flowlabel;
- keys.basic.ip_proto = fl6->flowi6_proto;
-
- __skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
- flow_keys_have_l4(&keys));
-
- return skb->hash;
-}
-EXPORT_SYMBOL(__skb_get_hash_flowi6);
-
-__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
-{
- struct flow_keys keys;
-
- memset(&keys, 0, sizeof(keys));
-
- keys.addrs.v4addrs.src = fl4->saddr;
- keys.addrs.v4addrs.dst = fl4->daddr;
- keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
- keys.ports.src = fl4->fl4_sport;
- keys.ports.dst = fl4->fl4_dport;
- keys.keyid.keyid = fl4->fl4_gre_key;
- keys.basic.ip_proto = fl4->flowi4_proto;
-
- __skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
- flow_keys_have_l4(&keys));
-
- return skb->hash;
-}
-EXPORT_SYMBOL(__skb_get_hash_flowi4);
-
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
const struct flow_keys *keys, int hlen)
{
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index d9cb3532f1dd..0b171756453c 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -44,6 +44,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "SEG6";
case LWTUNNEL_ENCAP_BPF:
return "BPF";
+ case LWTUNNEL_ENCAP_SEG6_LOCAL:
+ return "SEG6LOCAL";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -65,7 +67,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
return lws;
}
-EXPORT_SYMBOL(lwtunnel_state_alloc);
+EXPORT_SYMBOL_GPL(lwtunnel_state_alloc);
static const struct lwtunnel_encap_ops __rcu *
lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
@@ -80,7 +82,7 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
&lwtun_encaps[num],
NULL, ops) ? 0 : -1;
}
-EXPORT_SYMBOL(lwtunnel_encap_add_ops);
+EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops);
int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
unsigned int encap_type)
@@ -99,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
return ret;
}
-EXPORT_SYMBOL(lwtunnel_encap_del_ops);
+EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
int lwtunnel_build_state(u16 encap_type,
struct nlattr *encap, unsigned int family,
@@ -138,7 +140,7 @@ int lwtunnel_build_state(u16 encap_type,
return ret;
}
-EXPORT_SYMBOL(lwtunnel_build_state);
+EXPORT_SYMBOL_GPL(lwtunnel_build_state);
int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
{
@@ -175,7 +177,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
return ret;
}
-EXPORT_SYMBOL(lwtunnel_valid_encap_type);
+EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type);
int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
struct netlink_ext_ack *extack)
@@ -205,7 +207,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
return 0;
}
-EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr);
+EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr);
void lwtstate_free(struct lwtunnel_state *lws)
{
@@ -219,7 +221,7 @@ void lwtstate_free(struct lwtunnel_state *lws)
}
module_put(ops->owner);
}
-EXPORT_SYMBOL(lwtstate_free);
+EXPORT_SYMBOL_GPL(lwtstate_free);
int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
{
@@ -259,7 +261,7 @@ nla_put_failure:
return (ret == -EOPNOTSUPP ? 0 : ret);
}
-EXPORT_SYMBOL(lwtunnel_fill_encap);
+EXPORT_SYMBOL_GPL(lwtunnel_fill_encap);
int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
{
@@ -281,7 +283,7 @@ int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
return ret;
}
-EXPORT_SYMBOL(lwtunnel_get_encap_size);
+EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size);
int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
@@ -309,7 +311,7 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
return ret;
}
-EXPORT_SYMBOL(lwtunnel_cmp_encap);
+EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
@@ -343,7 +345,7 @@ drop:
return ret;
}
-EXPORT_SYMBOL(lwtunnel_output);
+EXPORT_SYMBOL_GPL(lwtunnel_output);
int lwtunnel_xmit(struct sk_buff *skb)
{
@@ -378,7 +380,7 @@ drop:
return ret;
}
-EXPORT_SYMBOL(lwtunnel_xmit);
+EXPORT_SYMBOL_GPL(lwtunnel_xmit);
int lwtunnel_input(struct sk_buff *skb)
{
@@ -412,4 +414,4 @@ drop:
return ret;
}
-EXPORT_SYMBOL(lwtunnel_input);
+EXPORT_SYMBOL_GPL(lwtunnel_input);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d0713627deb6..16a1a4c4eb57 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3261,13 +3261,13 @@ EXPORT_SYMBOL(neigh_sysctl_unregister);
static int __init neigh_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0);
rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
- NULL);
- rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL);
+ 0);
+ rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0);
return 0;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b4f9922b6f23..927a6dcbad96 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -97,7 +97,8 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
return restart_syscall();
if (dev_isalive(netdev)) {
- if ((ret = (*set)(netdev, new)) == 0)
+ ret = (*set)(netdev, new);
+ if (ret == 0)
ret = len;
}
rtnl_unlock();
@@ -160,6 +161,7 @@ static ssize_t broadcast_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *ndev = to_net_dev(dev);
+
if (dev_isalive(ndev))
return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
return -EINVAL;
@@ -170,7 +172,7 @@ static int change_carrier(struct net_device *dev, unsigned long new_carrier)
{
if (!netif_running(dev))
return -EINVAL;
- return dev_change_carrier(dev, (bool) new_carrier);
+ return dev_change_carrier(dev, (bool)new_carrier);
}
static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
@@ -183,9 +185,10 @@ static ssize_t carrier_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- if (netif_running(netdev)) {
+
+ if (netif_running(netdev))
return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
- }
+
return -EINVAL;
}
static DEVICE_ATTR_RW(carrier);
@@ -290,6 +293,7 @@ static ssize_t carrier_changes_show(struct device *dev,
char *buf)
{
struct net_device *netdev = to_net_dev(dev);
+
return sprintf(buf, fmt_dec,
atomic_read(&netdev->carrier_changes));
}
@@ -299,7 +303,7 @@ static DEVICE_ATTR_RO(carrier_changes);
static int change_mtu(struct net_device *dev, unsigned long new_mtu)
{
- return dev_set_mtu(dev, (int) new_mtu);
+ return dev_set_mtu(dev, (int)new_mtu);
}
static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
@@ -311,7 +315,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec);
static int change_flags(struct net_device *dev, unsigned long new_flags)
{
- return dev_change_flags(dev, (unsigned int) new_flags);
+ return dev_change_flags(dev, (unsigned int)new_flags);
}
static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
@@ -362,8 +366,8 @@ static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
}
static ssize_t gro_flush_timeout_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t len)
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -412,7 +416,7 @@ static DEVICE_ATTR_RW(ifalias);
static int change_group(struct net_device *dev, unsigned long new_group)
{
- dev_set_group(dev, (int) new_group);
+ dev_set_group(dev, (int)new_group);
return 0;
}
@@ -426,7 +430,7 @@ static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
static int change_proto_down(struct net_device *dev, unsigned long proto_down)
{
- return dev_change_proto_down(dev, (bool) proto_down);
+ return dev_change_proto_down(dev, (bool)proto_down);
}
static ssize_t proto_down_store(struct device *dev,
@@ -508,7 +512,7 @@ static ssize_t phys_switch_id_show(struct device *dev,
}
static DEVICE_ATTR_RO(phys_switch_id);
-static struct attribute *net_class_attrs[] = {
+static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
&dev_attr_dev_id.attr,
@@ -549,14 +553,14 @@ static ssize_t netstat_show(const struct device *d,
ssize_t ret = -EINVAL;
WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
- offset % sizeof(u64) != 0);
+ offset % sizeof(u64) != 0);
read_lock(&dev_base_lock);
if (dev_isalive(dev)) {
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
- ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset));
+ ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
}
read_unlock(&dev_base_lock);
return ret;
@@ -565,7 +569,7 @@ static ssize_t netstat_show(const struct device *d,
/* generate a read-only statistics attribute */
#define NETSTAT_ENTRY(name) \
static ssize_t name##_show(struct device *d, \
- struct device_attribute *attr, char *buf) \
+ struct device_attribute *attr, char *buf) \
{ \
return netstat_show(d, attr, buf, \
offsetof(struct rtnl_link_stats64, name)); \
@@ -597,7 +601,7 @@ NETSTAT_ENTRY(rx_compressed);
NETSTAT_ENTRY(tx_compressed);
NETSTAT_ENTRY(rx_nohandler);
-static struct attribute *netstat_attrs[] = {
+static struct attribute *netstat_attrs[] __ro_after_init = {
&dev_attr_rx_packets.attr,
&dev_attr_tx_packets.attr,
&dev_attr_rx_bytes.attr,
@@ -625,7 +629,6 @@ static struct attribute *netstat_attrs[] = {
NULL
};
-
static const struct attribute_group netstat_group = {
.name = "statistics",
.attrs = netstat_attrs,
@@ -647,33 +650,33 @@ static const struct attribute_group wireless_group = {
#endif /* CONFIG_SYSFS */
#ifdef CONFIG_SYSFS
-#define to_rx_queue_attr(_attr) container_of(_attr, \
- struct rx_queue_attribute, attr)
+#define to_rx_queue_attr(_attr) \
+ container_of(_attr, struct rx_queue_attribute, attr)
#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
- struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
struct netdev_rx_queue *queue = to_rx_queue(kobj);
if (!attribute->show)
return -EIO;
- return attribute->show(queue, attribute, buf);
+ return attribute->show(queue, buf);
}
static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
- struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
struct netdev_rx_queue *queue = to_rx_queue(kobj);
if (!attribute->store)
return -EIO;
- return attribute->store(queue, attribute, buf, count);
+ return attribute->store(queue, buf, count);
}
static const struct sysfs_ops rx_queue_sysfs_ops = {
@@ -682,8 +685,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = {
};
#ifdef CONFIG_RPS
-static ssize_t show_rps_map(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attribute, char *buf)
+static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
{
struct rps_map *map;
cpumask_var_t mask;
@@ -706,8 +708,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue,
}
static ssize_t store_rps_map(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attribute,
- const char *buf, size_t len)
+ const char *buf, size_t len)
{
struct rps_map *old_map, *map;
cpumask_var_t mask;
@@ -727,8 +728,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
}
map = kzalloc(max_t(unsigned int,
- RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
- GFP_KERNEL);
+ RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+ GFP_KERNEL);
if (!map) {
free_cpumask_var(mask);
return -ENOMEM;
@@ -738,9 +739,9 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
for_each_cpu_and(cpu, mask, cpu_online_mask)
map->cpus[i++] = cpu;
- if (i)
+ if (i) {
map->len = i;
- else {
+ } else {
kfree(map);
map = NULL;
}
@@ -765,7 +766,6 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
}
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attr,
char *buf)
{
struct rps_dev_flow_table *flow_table;
@@ -788,8 +788,7 @@ static void rps_dev_flow_table_release(struct rcu_head *rcu)
}
static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
- struct rx_queue_attribute *attr,
- const char *buf, size_t len)
+ const char *buf, size_t len)
{
unsigned long mask, count;
struct rps_dev_flow_table *table, *old_table;
@@ -831,8 +830,9 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
table->mask = mask;
for (count = 0; count <= mask; count++)
table->flows[count].cpu = RPS_NO_CPU;
- } else
+ } else {
table = NULL;
+ }
spin_lock(&rps_dev_flow_lock);
old_table = rcu_dereference_protected(queue->rps_flow_table,
@@ -846,16 +846,15 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
return len;
}
-static struct rx_queue_attribute rps_cpus_attribute =
- __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
-
+static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
+ = __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
-static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
- __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
- show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
+ = __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+ show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
#endif /* CONFIG_RPS */
-static struct attribute *rx_queue_default_attrs[] = {
+static struct attribute *rx_queue_default_attrs[] __ro_after_init = {
#ifdef CONFIG_RPS
&rps_cpus_attribute.attr,
&rps_dev_flow_table_cnt_attribute.attr,
@@ -870,7 +869,6 @@ static void rx_queue_release(struct kobject *kobj)
struct rps_map *map;
struct rps_dev_flow_table *flow_table;
-
map = rcu_dereference_protected(queue->rps_map, 1);
if (map) {
RCU_INIT_POINTER(queue->rps_map, NULL);
@@ -900,7 +898,7 @@ static const void *rx_queue_namespace(struct kobject *kobj)
return ns;
}
-static struct kobj_type rx_queue_ktype = {
+static struct kobj_type rx_queue_ktype __ro_after_init = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
.default_attrs = rx_queue_default_attrs,
@@ -915,23 +913,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
- "rx-%u", index);
+ "rx-%u", index);
if (error)
- goto exit;
+ return error;
if (dev->sysfs_rx_queue_group) {
error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
- if (error)
- goto exit;
+ if (error) {
+ kobject_put(kobj);
+ return error;
+ }
}
kobject_uevent(kobj, KOBJ_ADD);
dev_hold(queue->dev);
return error;
-exit:
- kobject_put(kobj);
- return error;
}
#endif /* CONFIG_SYSFS */
@@ -976,39 +973,40 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
*/
struct netdev_queue_attribute {
struct attribute attr;
- ssize_t (*show)(struct netdev_queue *queue,
- struct netdev_queue_attribute *attr, char *buf);
+ ssize_t (*show)(struct netdev_queue *queue, char *buf);
ssize_t (*store)(struct netdev_queue *queue,
- struct netdev_queue_attribute *attr, const char *buf, size_t len);
+ const char *buf, size_t len);
};
-#define to_netdev_queue_attr(_attr) container_of(_attr, \
- struct netdev_queue_attribute, attr)
+#define to_netdev_queue_attr(_attr) \
+ container_of(_attr, struct netdev_queue_attribute, attr)
#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
static ssize_t netdev_queue_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
- struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+ const struct netdev_queue_attribute *attribute
+ = to_netdev_queue_attr(attr);
struct netdev_queue *queue = to_netdev_queue(kobj);
if (!attribute->show)
return -EIO;
- return attribute->show(queue, attribute, buf);
+ return attribute->show(queue, buf);
}
static ssize_t netdev_queue_attr_store(struct kobject *kobj,
struct attribute *attr,
const char *buf, size_t count)
{
- struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+ const struct netdev_queue_attribute *attribute
+ = to_netdev_queue_attr(attr);
struct netdev_queue *queue = to_netdev_queue(kobj);
if (!attribute->store)
return -EIO;
- return attribute->store(queue, attribute, buf, count);
+ return attribute->store(queue, buf, count);
}
static const struct sysfs_ops netdev_queue_sysfs_ops = {
@@ -1016,9 +1014,7 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
-static ssize_t show_trans_timeout(struct netdev_queue *queue,
- struct netdev_queue_attribute *attribute,
- char *buf)
+static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
{
unsigned long trans_timeout;
@@ -1040,8 +1036,7 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
-static ssize_t show_traffic_class(struct netdev_queue *queue,
- struct netdev_queue_attribute *attribute,
+static ssize_t traffic_class_show(struct netdev_queue *queue,
char *buf)
{
struct net_device *dev = queue->dev;
@@ -1055,16 +1050,14 @@ static ssize_t show_traffic_class(struct netdev_queue *queue,
}
#ifdef CONFIG_XPS
-static ssize_t show_tx_maxrate(struct netdev_queue *queue,
- struct netdev_queue_attribute *attribute,
+static ssize_t tx_maxrate_show(struct netdev_queue *queue,
char *buf)
{
return sprintf(buf, "%lu\n", queue->tx_maxrate);
}
-static ssize_t set_tx_maxrate(struct netdev_queue *queue,
- struct netdev_queue_attribute *attribute,
- const char *buf, size_t len)
+static ssize_t tx_maxrate_store(struct netdev_queue *queue,
+ const char *buf, size_t len)
{
struct net_device *dev = queue->dev;
int err, index = get_netdev_queue_index(queue);
@@ -1089,16 +1082,15 @@ static ssize_t set_tx_maxrate(struct netdev_queue *queue,
return err;
}
-static struct netdev_queue_attribute queue_tx_maxrate =
- __ATTR(tx_maxrate, S_IRUGO | S_IWUSR,
- show_tx_maxrate, set_tx_maxrate);
+static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init
+ = __ATTR_RW(tx_maxrate);
#endif
-static struct netdev_queue_attribute queue_trans_timeout =
- __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+static struct netdev_queue_attribute queue_trans_timeout __ro_after_init
+ = __ATTR_RO(tx_timeout);
-static struct netdev_queue_attribute queue_traffic_class =
- __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL);
+static struct netdev_queue_attribute queue_traffic_class __ro_after_init
+ = __ATTR_RO(traffic_class);
#ifdef CONFIG_BQL
/*
@@ -1115,9 +1107,9 @@ static ssize_t bql_set(const char *buf, const size_t count,
unsigned int value;
int err;
- if (!strcmp(buf, "max") || !strcmp(buf, "max\n"))
+ if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) {
value = DQL_MAX_LIMIT;
- else {
+ } else {
err = kstrtouint(buf, 10, &value);
if (err < 0)
return err;
@@ -1131,7 +1123,6 @@ static ssize_t bql_set(const char *buf, const size_t count,
}
static ssize_t bql_show_hold_time(struct netdev_queue *queue,
- struct netdev_queue_attribute *attr,
char *buf)
{
struct dql *dql = &queue->dql;
@@ -1140,7 +1131,6 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue,
}
static ssize_t bql_set_hold_time(struct netdev_queue *queue,
- struct netdev_queue_attribute *attribute,
const char *buf, size_t len)
{
struct dql *dql = &queue->dql;
@@ -1156,12 +1146,11 @@ static ssize_t bql_set_hold_time(struct netdev_queue *queue,
return len;
}
-static struct netdev_queue_attribute bql_hold_time_attribute =
- __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time,
- bql_set_hold_time);
+static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
+ = __ATTR(hold_time, S_IRUGO | S_IWUSR,
+ bql_show_hold_time, bql_set_hold_time);
static ssize_t bql_show_inflight(struct netdev_queue *queue,
- struct netdev_queue_attribute *attr,
char *buf)
{
struct dql *dql = &queue->dql;
@@ -1169,33 +1158,31 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue,
return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
}
-static struct netdev_queue_attribute bql_inflight_attribute =
+static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
__ATTR(inflight, S_IRUGO, bql_show_inflight, NULL);
#define BQL_ATTR(NAME, FIELD) \
static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
- struct netdev_queue_attribute *attr, \
char *buf) \
{ \
return bql_show(buf, queue->dql.FIELD); \
} \
\
static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
- struct netdev_queue_attribute *attr, \
const char *buf, size_t len) \
{ \
return bql_set(buf, len, &queue->dql.FIELD); \
} \
\
-static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \
- __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \
- bql_set_ ## NAME);
+static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \
+ = __ATTR(NAME, S_IRUGO | S_IWUSR, \
+ bql_show_ ## NAME, bql_set_ ## NAME)
-BQL_ATTR(limit, limit)
-BQL_ATTR(limit_max, max_limit)
-BQL_ATTR(limit_min, min_limit)
+BQL_ATTR(limit, limit);
+BQL_ATTR(limit_max, max_limit);
+BQL_ATTR(limit_min, min_limit);
-static struct attribute *dql_attrs[] = {
+static struct attribute *dql_attrs[] __ro_after_init = {
&bql_limit_attribute.attr,
&bql_limit_max_attribute.attr,
&bql_limit_min_attribute.attr,
@@ -1211,8 +1198,8 @@ static const struct attribute_group dql_group = {
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static ssize_t show_xps_map(struct netdev_queue *queue,
- struct netdev_queue_attribute *attribute, char *buf)
+static ssize_t xps_cpus_show(struct netdev_queue *queue,
+ char *buf)
{
struct net_device *dev = queue->dev;
int cpu, len, num_tc = 1, tc = 0;
@@ -1258,9 +1245,8 @@ static ssize_t show_xps_map(struct netdev_queue *queue,
return len < PAGE_SIZE ? len : -EINVAL;
}
-static ssize_t store_xps_map(struct netdev_queue *queue,
- struct netdev_queue_attribute *attribute,
- const char *buf, size_t len)
+static ssize_t xps_cpus_store(struct netdev_queue *queue,
+ const char *buf, size_t len)
{
struct net_device *dev = queue->dev;
unsigned long index;
@@ -1288,11 +1274,11 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
return err ? : len;
}
-static struct netdev_queue_attribute xps_cpus_attribute =
- __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
+ = __ATTR_RW(xps_cpus);
#endif /* CONFIG_XPS */
-static struct attribute *netdev_queue_default_attrs[] = {
+static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
&queue_trans_timeout.attr,
&queue_traffic_class.attr,
#ifdef CONFIG_XPS
@@ -1322,7 +1308,7 @@ static const void *netdev_queue_namespace(struct kobject *kobj)
return ns;
}
-static struct kobj_type netdev_queue_ktype = {
+static struct kobj_type netdev_queue_ktype __ro_after_init = {
.sysfs_ops = &netdev_queue_sysfs_ops,
.release = netdev_queue_release,
.default_attrs = netdev_queue_default_attrs,
@@ -1337,23 +1323,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
- "tx-%u", index);
+ "tx-%u", index);
if (error)
- goto exit;
+ return error;
#ifdef CONFIG_BQL
error = sysfs_create_group(kobj, &dql_group);
- if (error)
- goto exit;
+ if (error) {
+ kobject_put(kobj);
+ return error;
+ }
#endif
kobject_uevent(kobj, KOBJ_ADD);
dev_hold(queue->dev);
return 0;
-exit:
- kobject_put(kobj);
- return error;
}
#endif /* CONFIG_SYSFS */
@@ -1395,7 +1380,7 @@ static int register_queue_kobjects(struct net_device *dev)
#ifdef CONFIG_SYSFS
dev->queues_kset = kset_create_and_add("queues",
- NULL, &dev->dev.kobj);
+ NULL, &dev->dev.kobj);
if (!dev->queues_kset)
return -ENOMEM;
real_rx = dev->real_num_rx_queues;
@@ -1463,7 +1448,7 @@ static const void *net_netlink_ns(struct sock *sk)
return sock_net(sk);
}
-struct kobj_ns_type_operations net_ns_type_operations = {
+const struct kobj_ns_type_operations net_ns_type_operations = {
.type = KOBJ_NS_TYPE_NET,
.current_may_mount = net_current_may_mount,
.grab_current_ns = net_grab_current_ns,
@@ -1485,7 +1470,8 @@ static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
/* pass ifindex to uevent.
* ifindex is useful as it won't change (interface name may change)
- * and is what RtNetlink uses natively. */
+ * and is what RtNetlink uses natively.
+ */
retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex);
exit:
@@ -1513,7 +1499,7 @@ static const void *net_namespace(struct device *d)
return dev_net(dev);
}
-static struct class net_class = {
+static struct class net_class __ro_after_init = {
.name = "net",
.dev_release = netdev_release,
.dev_groups = net_class_groups,
@@ -1560,7 +1546,7 @@ EXPORT_SYMBOL(of_find_net_device_by_node);
*/
void netdev_unregister_kobject(struct net_device *ndev)
{
- struct device *dev = &(ndev->dev);
+ struct device *dev = &ndev->dev;
if (!atomic_read(&dev_net(ndev)->count))
dev_set_uevent_suppress(dev, 1);
@@ -1577,7 +1563,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
/* Create sysfs entries for network device. */
int netdev_register_kobject(struct net_device *ndev)
{
- struct device *dev = &(ndev->dev);
+ struct device *dev = &ndev->dev;
const struct attribute_group **groups = ndev->sysfs_groups;
int error = 0;
@@ -1620,14 +1606,14 @@ int netdev_register_kobject(struct net_device *ndev)
return error;
}
-int netdev_class_create_file_ns(struct class_attribute *class_attr,
+int netdev_class_create_file_ns(const struct class_attribute *class_attr,
const void *ns)
{
return class_create_file_ns(&net_class, class_attr, ns);
}
EXPORT_SYMBOL(netdev_class_create_file_ns);
-void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
const void *ns)
{
class_remove_file_ns(&net_class, class_attr, ns);
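With the attribute pointer dropped from the per-queue show/store callbacks, a new tx-queue attribute would now be declared roughly as below; the attribute and the value it prints are hypothetical, while the callback signature and the __ATTR_RO()/__ro_after_init usage mirror the conversions above:

static ssize_t example_stat_show(struct netdev_queue *queue, char *buf)
{
	/* hypothetical read-only per-queue statistic */
	return sprintf(buf, "%lu\n", queue->trans_timeout);
}

static struct netdev_queue_attribute example_stat_attribute __ro_after_init
	= __ATTR_RO(example_stat);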
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 92da5e4ceb4f..4f1468ccd056 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -32,6 +32,7 @@
#include <trace/events/sock.h>
#include <trace/events/udp.h>
#include <trace/events/fib.h>
+#include <trace/events/qdisc.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 8726d051f31d..6cfdc7c84c48 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -855,9 +855,10 @@ static int __init net_ns_init(void)
register_pernet_subsys(&net_ns_ops);
- rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
- NULL);
+ RTNL_FLAG_DOIT_UNLOCKED);
return 0;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9201e3621351..a78fd61da0ec 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -62,7 +62,7 @@
struct rtnl_link {
rtnl_doit_func doit;
rtnl_dumpit_func dumpit;
- rtnl_calcit_func calcit;
+ unsigned int flags;
};
static DEFINE_MUTEX(rtnl_mutex);
@@ -127,7 +127,8 @@ bool lockdep_rtnl_is_held(void)
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */
-static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
{
@@ -143,58 +144,13 @@ static inline int rtm_msgindex(int msgtype)
return msgindex;
}
-static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
-{
- struct rtnl_link *tab;
-
- if (protocol <= RTNL_FAMILY_MAX)
- tab = rtnl_msg_handlers[protocol];
- else
- tab = NULL;
-
- if (tab == NULL || tab[msgindex].doit == NULL)
- tab = rtnl_msg_handlers[PF_UNSPEC];
-
- return tab[msgindex].doit;
-}
-
-static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
-{
- struct rtnl_link *tab;
-
- if (protocol <= RTNL_FAMILY_MAX)
- tab = rtnl_msg_handlers[protocol];
- else
- tab = NULL;
-
- if (tab == NULL || tab[msgindex].dumpit == NULL)
- tab = rtnl_msg_handlers[PF_UNSPEC];
-
- return tab[msgindex].dumpit;
-}
-
-static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)
-{
- struct rtnl_link *tab;
-
- if (protocol <= RTNL_FAMILY_MAX)
- tab = rtnl_msg_handlers[protocol];
- else
- tab = NULL;
-
- if (tab == NULL || tab[msgindex].calcit == NULL)
- tab = rtnl_msg_handlers[PF_UNSPEC];
-
- return tab[msgindex].calcit;
-}
-
/**
* __rtnl_register - Register a rtnetlink message type
* @protocol: Protocol family or PF_UNSPEC
* @msgtype: rtnetlink message type
* @doit: Function pointer called for each request message
* @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @calcit: Function pointer to calc size of dump message
+ * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
*
* Registers the specified function pointers (at least one of them has
* to be non-NULL) to be called whenever a request message for the
@@ -208,7 +164,7 @@ static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)
*/
int __rtnl_register(int protocol, int msgtype,
rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- rtnl_calcit_func calcit)
+ unsigned int flags)
{
struct rtnl_link *tab;
int msgindex;
@@ -216,23 +172,20 @@ int __rtnl_register(int protocol, int msgtype,
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
msgindex = rtm_msgindex(msgtype);
- tab = rtnl_msg_handlers[protocol];
+ tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]);
if (tab == NULL) {
tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);
if (tab == NULL)
return -ENOBUFS;
- rtnl_msg_handlers[protocol] = tab;
+ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
}
if (doit)
tab[msgindex].doit = doit;
-
if (dumpit)
tab[msgindex].dumpit = dumpit;
-
- if (calcit)
- tab[msgindex].calcit = calcit;
+ tab[msgindex].flags |= flags;
return 0;
}
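Callers now express locking behaviour through the flags argument instead of supplying a calcit hook. A short sketch, mirroring the net_namespace.c conversion earlier in this patch:

/* doit runs without rtnl_mutex; the handler must take rtnl_lock() itself
 * wherever it still needs it
 */
rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
	      RTNL_FLAG_DOIT_UNLOCKED);

/* flags == 0 keeps the historical behaviour: doit is called under rtnl_lock */
rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);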
@@ -249,9 +202,9 @@ EXPORT_SYMBOL_GPL(__rtnl_register);
*/
void rtnl_register(int protocol, int msgtype,
rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- rtnl_calcit_func calcit)
+ unsigned int flags)
{
- if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0)
+ if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0)
panic("Unable to register rtnetlink message handler, "
"protocol = %d, message type = %d\n",
protocol, msgtype);
@@ -267,17 +220,23 @@ EXPORT_SYMBOL_GPL(rtnl_register);
*/
int rtnl_unregister(int protocol, int msgtype)
{
+ struct rtnl_link *handlers;
int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
msgindex = rtm_msgindex(msgtype);
- if (rtnl_msg_handlers[protocol] == NULL)
+ rtnl_lock();
+ handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
+ if (!handlers) {
+ rtnl_unlock();
return -ENOENT;
+ }
- rtnl_msg_handlers[protocol][msgindex].doit = NULL;
- rtnl_msg_handlers[protocol][msgindex].dumpit = NULL;
- rtnl_msg_handlers[protocol][msgindex].calcit = NULL;
+ handlers[msgindex].doit = NULL;
+ handlers[msgindex].dumpit = NULL;
+ handlers[msgindex].flags = 0;
+ rtnl_unlock();
return 0;
}
@@ -292,10 +251,20 @@ EXPORT_SYMBOL_GPL(rtnl_unregister);
*/
void rtnl_unregister_all(int protocol)
{
+ struct rtnl_link *handlers;
+
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
- kfree(rtnl_msg_handlers[protocol]);
- rtnl_msg_handlers[protocol] = NULL;
+ rtnl_lock();
+ handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
+ RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
+ rtnl_unlock();
+
+ synchronize_net();
+
+ while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1)
+ schedule();
+ kfree(handlers);
}
EXPORT_SYMBOL_GPL(rtnl_unregister_all);
@@ -433,16 +402,24 @@ static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
{
struct net_device *master_dev;
const struct rtnl_link_ops *ops;
+ size_t size = 0;
- master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ rcu_read_lock();
+
+ master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
if (!master_dev)
- return 0;
+ goto out;
+
ops = master_dev->rtnl_link_ops;
if (!ops || !ops->get_slave_size)
- return 0;
+ goto out;
/* IFLA_INFO_SLAVE_DATA + nested data */
- return nla_total_size(sizeof(struct nlattr)) +
+ size = nla_total_size(sizeof(struct nlattr)) +
ops->get_slave_size(master_dev, dev);
+
+out:
+ rcu_read_unlock();
+ return size;
}
static size_t rtnl_link_get_size(const struct net_device *dev)
@@ -1644,8 +1621,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[0];
s_idx = cb->args[1];
- cb->seq = net->dev_base_seq;
-
/* A hack to preserve kernel<->userspace interface.
* The correct header is ifinfomsg. It is consistent with rtnl_getlink.
* However, before Linux v3.9 the code here assumed rtgenmsg and that's
@@ -1691,8 +1666,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
goto out_err;
}
-
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
idx++;
}
@@ -1702,6 +1675,8 @@ out:
out_err:
cb->args[1] = idx;
cb->args[0] = h;
+ cb->seq = net->dev_base_seq;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
return err;
}
@@ -2831,11 +2806,13 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
* traverse the list of net devices and compute the minimum
* buffer size based upon the filter mask.
*/
- list_for_each_entry(dev, &net->dev_base_head, dev_list) {
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
if_nlmsg_size(dev,
ext_filter_mask));
}
+ rcu_read_unlock();
return nlmsg_total_size(min_ifinfo_dump_size);
}
@@ -2847,19 +2824,29 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
if (s_idx == 0)
s_idx = 1;
+
for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
int type = cb->nlh->nlmsg_type-RTM_BASE;
+ struct rtnl_link *handlers;
+ rtnl_dumpit_func dumpit;
+
if (idx < s_idx || idx == PF_PACKET)
continue;
- if (rtnl_msg_handlers[idx] == NULL ||
- rtnl_msg_handlers[idx][type].dumpit == NULL)
+
+ handlers = rtnl_dereference(rtnl_msg_handlers[idx]);
+ if (!handlers)
continue;
+
+ dumpit = READ_ONCE(handlers[type].dumpit);
+ if (!dumpit)
+ continue;
+
if (idx > s_idx) {
memset(&cb->args[0], 0, sizeof(cb->args));
cb->prev_seq = 0;
cb->seq = 0;
}
- if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
+ if (dumpit(skb, cb))
break;
}
cb->family = idx;
@@ -4162,11 +4149,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ struct rtnl_link *handlers;
+ int err = -EOPNOTSUPP;
rtnl_doit_func doit;
+ unsigned int flags;
int kind;
int family;
int type;
- int err;
type = nlh->nlmsg_type;
if (type > RTM_MAX)
@@ -4184,20 +4173,40 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;
+ if (family >= ARRAY_SIZE(rtnl_msg_handlers))
+ family = PF_UNSPEC;
+
+ rcu_read_lock();
+ handlers = rcu_dereference(rtnl_msg_handlers[family]);
+ if (!handlers) {
+ family = PF_UNSPEC;
+ handlers = rcu_dereference(rtnl_msg_handlers[family]);
+ }
+
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
- rtnl_calcit_func calcit;
u16 min_dump_alloc = 0;
- dumpit = rtnl_get_dumpit(family, type);
- if (dumpit == NULL)
- return -EOPNOTSUPP;
- calcit = rtnl_get_calcit(family, type);
- if (calcit)
- min_dump_alloc = calcit(skb, nlh);
+ dumpit = READ_ONCE(handlers[type].dumpit);
+ if (!dumpit) {
+ family = PF_UNSPEC;
+ handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]);
+ if (!handlers)
+ goto err_unlock;
+
+ dumpit = READ_ONCE(handlers[type].dumpit);
+ if (!dumpit)
+ goto err_unlock;
+ }
+
+ refcount_inc(&rtnl_msg_handlers_ref[family]);
+
+ if (type == RTM_GETLINK - RTM_BASE)
+ min_dump_alloc = rtnl_calcit(skb, nlh);
+
+ rcu_read_unlock();
- __rtnl_unlock();
rtnl = net->rtnl;
{
struct netlink_dump_control c = {
@@ -4206,22 +4215,47 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
};
err = netlink_dump_start(rtnl, skb, nlh, &c);
}
- rtnl_lock();
+ refcount_dec(&rtnl_msg_handlers_ref[family]);
return err;
}
- doit = rtnl_get_doit(family, type);
- if (doit == NULL)
- return -EOPNOTSUPP;
+ doit = READ_ONCE(handlers[type].doit);
+ if (!doit) {
+ family = PF_UNSPEC;
+ handlers = rcu_dereference(rtnl_msg_handlers[family]);
+ }
+
+ flags = READ_ONCE(handlers[type].flags);
+ if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
+ refcount_inc(&rtnl_msg_handlers_ref[family]);
+ doit = READ_ONCE(handlers[type].doit);
+ rcu_read_unlock();
+ if (doit)
+ err = doit(skb, nlh, extack);
+ refcount_dec(&rtnl_msg_handlers_ref[family]);
+ return err;
+ }
- return doit(skb, nlh, extack);
+ rcu_read_unlock();
+
+ rtnl_lock();
+ handlers = rtnl_dereference(rtnl_msg_handlers[family]);
+ if (handlers) {
+ doit = READ_ONCE(handlers[type].doit);
+ if (doit)
+ err = doit(skb, nlh, extack);
+ }
+ rtnl_unlock();
+ return err;
+
+err_unlock:
+ rcu_read_unlock();
+ return -EOPNOTSUPP;
}
static void rtnetlink_rcv(struct sk_buff *skb)
{
- rtnl_lock();
netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
- rtnl_unlock();
}
static int rtnetlink_bind(struct net *net, int group)
@@ -4294,29 +4328,34 @@ static struct pernet_operations rtnetlink_net_ops = {
void __init rtnetlink_init(void)
{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++)
+ refcount_set(&rtnl_msg_handlers_ref[i], 1);
+
if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");
register_netdevice_notifier(&rtnetlink_dev_notifier);
rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
- rtnl_dump_ifinfo, rtnl_calcit);
- rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL);
+ rtnl_dump_ifinfo, 0);
+ rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);
- rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
- rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
- rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
+ rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
+ rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0);
+ rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0);
- rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
- rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
- rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
+ rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
+ rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
- NULL);
+ 0);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0f0933b338d7..917da73d3ab3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -567,21 +567,10 @@ static void skb_release_data(struct sk_buff *skb)
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i]);
- /*
- * If skb buf is from userspace, we need to notify the caller
- * the lower device DMA has done;
- */
- if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
- struct ubuf_info *uarg;
-
- uarg = shinfo->destructor_arg;
- if (uarg->callback)
- uarg->callback(uarg, true);
- }
-
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
+ skb_zcopy_clear(skb, true);
skb_free_head(skb);
}
@@ -695,14 +684,7 @@ EXPORT_SYMBOL(kfree_skb_list);
*/
void skb_tx_error(struct sk_buff *skb)
{
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- struct ubuf_info *uarg;
-
- uarg = skb_shinfo(skb)->destructor_arg;
- if (uarg->callback)
- uarg->callback(uarg, false);
- skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
- }
+ skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);
@@ -915,6 +897,273 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
}
EXPORT_SYMBOL_GPL(skb_morph);
+static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+{
+ unsigned long max_pg, num_pg, new_pg, old_pg;
+ struct user_struct *user;
+
+ if (capable(CAP_IPC_LOCK) || !size)
+ return 0;
+
+ num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
+ max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ user = mmp->user ? : current_user();
+
+ do {
+ old_pg = atomic_long_read(&user->locked_vm);
+ new_pg = old_pg + num_pg;
+ if (new_pg > max_pg)
+ return -ENOBUFS;
+ } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
+ old_pg);
+
+ if (!mmp->user) {
+ mmp->user = get_uid(user);
+ mmp->num_pg = num_pg;
+ } else {
+ mmp->num_pg += num_pg;
+ }
+
+ return 0;
+}
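For intuition, a worked example of the worst-case accounting (hypothetical numbers, not taken from this patch):

/* A 64 kB zerocopy request with 4 kB pages charges
 *	num_pg = (65536 >> 12) + 2 = 18 pages
 * against the user's RLIMIT_MEMLOCK; the "+ 2" absorbs the truncating
 * shift and a buffer that is not page-aligned at either end.  Callers
 * with CAP_IPC_LOCK skip the accounting entirely.
 */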
+
+static void mm_unaccount_pinned_pages(struct mmpin *mmp)
+{
+ if (mmp->user) {
+ atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
+ free_uid(mmp->user);
+ }
+}
+
+struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
+{
+ struct ubuf_info *uarg;
+ struct sk_buff *skb;
+
+ WARN_ON_ONCE(!in_task());
+
+ if (!sock_flag(sk, SOCK_ZEROCOPY))
+ return NULL;
+
+ skb = sock_omalloc(sk, 0, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
+ uarg = (void *)skb->cb;
+ uarg->mmp.user = NULL;
+
+ if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ uarg->callback = sock_zerocopy_callback;
+ uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
+ uarg->len = 1;
+ uarg->bytelen = size;
+ uarg->zerocopy = 1;
+ atomic_set(&uarg->refcnt, 0);
+ sock_hold(sk);
+
+ return uarg;
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
+
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+{
+ return container_of((void *)uarg, struct sk_buff, cb);
+}
+
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg)
+{
+ if (uarg) {
+ const u32 byte_limit = 1 << 19; /* limit to a few TSO packets */
+ u32 bytelen, next;
+
+ /* realloc only when socket is locked (TCP, UDP cork),
+ * so uarg->len and sk_zckey access is serialized
+ */
+ if (!sock_owned_by_user(sk)) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ bytelen = uarg->bytelen + size;
+ if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ /* TCP can create new skb to attach new uarg */
+ if (sk->sk_type == SOCK_STREAM)
+ goto new_alloc;
+ return NULL;
+ }
+
+ next = (u32)atomic_read(&sk->sk_zckey);
+ if ((u32)(uarg->id + uarg->len) == next) {
+ if (mm_account_pinned_pages(&uarg->mmp, size))
+ return NULL;
+ uarg->len++;
+ uarg->bytelen = bytelen;
+ atomic_set(&sk->sk_zckey, ++next);
+ return uarg;
+ }
+ }
+
+new_alloc:
+ return sock_zerocopy_alloc(sk, size);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+
+static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
+{
+ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+ u32 old_lo, old_hi;
+ u64 sum_len;
+
+ old_lo = serr->ee.ee_info;
+ old_hi = serr->ee.ee_data;
+ sum_len = old_hi - old_lo + 1ULL + len;
+
+ if (sum_len >= (1ULL << 32))
+ return false;
+
+ if (lo != old_hi + 1)
+ return false;
+
+ serr->ee.ee_data += len;
+ return true;
+}
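The extension keeps the error queue compact when completions arrive in order; a hypothetical sequence:

/* Queued notification covering send ids 0..3: ee_info == 0, ee_data == 3.
 * A completion for ids 4..7 extends it in place to ee_info == 0,
 * ee_data == 7.  A completion for ids 9..10 fails the lo != old_hi + 1
 * check and is queued as a separate notification.
 */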
+
+void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+{
+ struct sk_buff *tail, *skb = skb_from_uarg(uarg);
+ struct sock_exterr_skb *serr;
+ struct sock *sk = skb->sk;
+ struct sk_buff_head *q;
+ unsigned long flags;
+ u32 lo, hi;
+ u16 len;
+
+ mm_unaccount_pinned_pages(&uarg->mmp);
+
+ /* if !len, there was only 1 call, and it was aborted
+ * so do not queue a completion notification
+ */
+ if (!uarg->len || sock_flag(sk, SOCK_DEAD))
+ goto release;
+
+ len = uarg->len;
+ lo = uarg->id;
+ hi = uarg->id + len - 1;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = 0;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
+ serr->ee.ee_data = hi;
+ serr->ee.ee_info = lo;
+ if (!success)
+ serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
+
+ q = &sk->sk_error_queue;
+ spin_lock_irqsave(&q->lock, flags);
+ tail = skb_peek_tail(q);
+ if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
+ !skb_zerocopy_notify_extend(tail, lo, len)) {
+ __skb_queue_tail(q, skb);
+ skb = NULL;
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ sk->sk_error_report(sk);
+
+release:
+ consume_skb(skb);
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
+
+void sock_zerocopy_put(struct ubuf_info *uarg)
+{
+ if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
+ if (uarg->callback)
+ uarg->callback(uarg, uarg->zerocopy);
+ else
+ consume_skb(skb_from_uarg(uarg));
+ }
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put);
+
+void sock_zerocopy_put_abort(struct ubuf_info *uarg)
+{
+ if (uarg) {
+ struct sock *sk = skb_from_uarg(uarg)->sk;
+
+ atomic_dec(&sk->sk_zckey);
+ uarg->len--;
+
+ /* sock_zerocopy_put expects a ref. Most sockets take one per
+ * skb, which is zero on abort. tcp_sendmsg holds one extra, to
+ * avoid an skb send inside the main loop triggering uarg free.
+ */
+ if (sk->sk_type != SOCK_STREAM)
+ atomic_inc(&uarg->refcnt);
+
+ sock_zerocopy_put(uarg);
+ }
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+
+extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length);
+
+int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
+ struct msghdr *msg, int len,
+ struct ubuf_info *uarg)
+{
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
+ struct iov_iter orig_iter = msg->msg_iter;
+ int err, orig_len = skb->len;
+
+ /* An skb can only point to one uarg. This edge case happens when
+ * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+
+ err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+ if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
+ /* Streams do not free skb on error. Reset to prev state. */
+ msg->msg_iter = orig_iter;
+ ___pskb_trim(skb, orig_len);
+ return err;
+ }
+
+ skb_zcopy_set(skb, uarg);
+ return skb->len - orig_len;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
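Taken together, a stream protocol's sendmsg path is expected to use the new helpers roughly as below. This is a simplified sketch, not the actual tcp_sendmsg() code; skb selection, queuing and the extra reference handling of the real caller are elided:

	struct ubuf_info *uarg = NULL;
	int copied;

	if ((msg->msg_flags & MSG_ZEROCOPY) && len && sock_flag(sk, SOCK_ZEROCOPY)) {
		/* extend the pending notification if possible, else allocate one */
		uarg = sock_zerocopy_realloc(sk, len, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
	}

	/* pin the user pages and attach them as frags of skb */
	copied = skb_zerocopy_iter_stream(sk, skb, msg, len, uarg);
	if (copied < 0) {
		sock_zerocopy_put_abort(uarg);	/* aborted-only uarg queues no notification */
		return copied;
	}

	/* ... queue skb for transmission ... */

	sock_zerocopy_put(uarg);	/* completion later runs sock_zerocopy_callback() */
	return copied;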
+
+static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
+ gfp_t gfp_mask)
+{
+ if (skb_zcopy(orig)) {
+ if (skb_zcopy(nskb)) {
+ /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
+ if (!gfp_mask) {
+ WARN_ON_ONCE(1);
+ return -ENOMEM;
+ }
+ if (skb_uarg(nskb) == skb_uarg(orig))
+ return 0;
+ if (skb_copy_ubufs(nskb, GFP_ATOMIC))
+ return -EIO;
+ }
+ skb_zcopy_set(nskb, skb_uarg(orig));
+ }
+ return 0;
+}
+
/**
* skb_copy_ubufs - copy userspace skb frags buffers to kernel
* @skb: the skb to modify
@@ -932,17 +1181,19 @@ EXPORT_SYMBOL_GPL(skb_morph);
*/
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
- int i;
int num_frags = skb_shinfo(skb)->nr_frags;
struct page *page, *head = NULL;
- struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
+ int i, new_frags;
+ u32 d_off;
- for (i = 0; i < num_frags; i++) {
- skb_frag_t *f = &skb_shinfo(skb)->frags[i];
- u32 p_off, p_len, copied;
- struct page *p;
- u8 *vaddr;
+ if (!num_frags)
+ return 0;
+ if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
+ return -EINVAL;
+
+ new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < new_frags; i++) {
page = alloc_page(gfp_mask);
if (!page) {
while (head) {
@@ -952,33 +1203,51 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
}
return -ENOMEM;
}
+ set_page_private(page, (unsigned long)head);
+ head = page;
+ }
+
+ page = head;
+ d_off = 0;
+ for (i = 0; i < num_frags; i++) {
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f),
p, p_off, p_len, copied) {
+ u32 copy, done = 0;
vaddr = kmap_atomic(p);
- memcpy(page_address(page) + copied, vaddr + p_off,
- p_len);
+
+ while (done < p_len) {
+ if (d_off == PAGE_SIZE) {
+ d_off = 0;
+ page = (struct page *)page_private(page);
+ }
+ copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
+ memcpy(page_address(page) + d_off,
+ vaddr + p_off + done, copy);
+ done += copy;
+ d_off += copy;
+ }
kunmap_atomic(vaddr);
}
-
- set_page_private(page, (unsigned long)head);
- head = page;
}
/* skb frags release userspace buffers */
for (i = 0; i < num_frags; i++)
skb_frag_unref(skb, i);
- uarg->callback(uarg, false);
-
/* skb frags point to kernel buffers */
- for (i = num_frags - 1; i >= 0; i--) {
- __skb_fill_page_desc(skb, i, head, 0,
- skb_shinfo(skb)->frags[i].size);
+ for (i = 0; i < new_frags - 1; i++) {
+ __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
head = (struct page *)page_private(head);
}
+ __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
+ skb_shinfo(skb)->nr_frags = new_frags;
- skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+ skb_zcopy_clear(skb, false);
return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);
@@ -1139,7 +1408,8 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
if (skb_shinfo(skb)->nr_frags) {
int i;
- if (skb_orphan_frags(skb, gfp_mask)) {
+ if (skb_orphan_frags(skb, gfp_mask) ||
+ skb_zerocopy_clone(n, skb, gfp_mask)) {
kfree_skb(n);
n = NULL;
goto out;
@@ -1216,9 +1486,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
* be since all we did is relocate the values
*/
if (skb_cloned(skb)) {
- /* copy this zero copy skb frags */
if (skb_orphan_frags(skb, gfp_mask))
goto nofrags;
+ if (skb_zcopy(skb))
+ atomic_inc(&skb_uarg(skb)->refcnt);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
@@ -1713,6 +1984,9 @@ end:
skb->tail += delta;
skb->data_len -= delta;
+ if (!skb->data_len)
+ skb_zcopy_clear(skb, false);
+
return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);
@@ -2011,7 +2285,7 @@ do_frag_list:
slen = min_t(int, len, skb_headlen(skb) - offset);
kv.iov_base = skb->data + offset;
- kv.iov_len = len;
+ kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
@@ -2468,6 +2742,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
skb_tx_error(from);
return -ENOMEM;
}
+ skb_zerocopy_clone(to, from, GFP_ATOMIC);
for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
if (!len)
@@ -2765,6 +3040,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
SKBTX_SHARED_FRAG;
+ skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. */
@@ -2808,6 +3084,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
if (skb_headlen(skb))
return 0;
+ if (skb_zcopy(tgt) || skb_zcopy(skb))
+ return 0;
todo = shiftlen;
from = 0;
@@ -3381,6 +3659,8 @@ normal:
skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
SKBTX_SHARED_FRAG;
+ if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
+ goto err;
while (pos < offset + len) {
if (i >= nfrags) {
@@ -4504,6 +4784,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (skb_has_frag_list(to) || skb_has_frag_list(from))
return false;
+ if (skb_zcopy(to) || skb_zcopy(from))
+ return false;
if (skb_headlen(from) != 0) {
struct page *page;
diff --git a/net/core/sock.c b/net/core/sock.c
index 564f835f408a..0f04d8bff607 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1055,6 +1055,20 @@ set_rcvbuf:
if (val == 1)
dst_negative_advice(sk);
break;
+
+ case SO_ZEROCOPY:
+ if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+ ret = -ENOTSUPP;
+ else if (sk->sk_protocol != IPPROTO_TCP)
+ ret = -ENOTSUPP;
+ else if (sk->sk_state != TCP_CLOSE)
+ ret = -EBUSY;
+ else if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
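From userspace the feature is opt-in per TCP socket; a hypothetical usage sketch (helper name and buffer handling are illustrative; the constants come from the matching UAPI headers):

#include <errno.h>
#include <error.h>
#include <sys/socket.h>
#include <linux/errqueue.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY	0x4000000
#endif

static void send_one_zerocopy(int fd, const char *buf, size_t len)
{
	struct sock_extended_err *serr;
	struct msghdr msg = {0};
	char control[128];
	struct cmsghdr *cm;
	int one = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
		error(1, errno, "setsockopt SO_ZEROCOPY");

	if (send(fd, buf, len, MSG_ZEROCOPY) != (ssize_t)len)
		error(1, errno, "send MSG_ZEROCOPY");

	/* buf must remain untouched until the completion is reaped;
	 * real code would poll for POLLERR and retry on EAGAIN
	 */
	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);
	if (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1)
		error(1, errno, "recvmsg MSG_ERRQUEUE");

	cm = CMSG_FIRSTHDR(&msg);
	serr = (struct sock_extended_err *)CMSG_DATA(cm);
	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
		error(1, 0, "unexpected ee_origin");
	/* [ee_info, ee_data] is the range of completed sends;
	 * SO_EE_CODE_ZEROCOPY_COPIED in ee_code means the kernel copied
	 * the data instead of sending it zerocopy.
	 */
}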
@@ -1383,6 +1397,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val64 = sock_gen_cookie(sk);
break;
+ case SO_ZEROCOPY:
+ v.val = sock_flag(sk, SOCK_ZEROCOPY);
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -1670,6 +1688,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
atomic_set(&newsk->sk_drops, 0);
newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE);
@@ -1923,6 +1942,33 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
}
EXPORT_SYMBOL(sock_wmalloc);
+static void sock_ofree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_omem_alloc);
+}
+
+struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
+ gfp_t priority)
+{
+ struct sk_buff *skb;
+
+ /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
+ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
+ sysctl_optmem_max)
+ return NULL;
+
+ skb = alloc_skb(size, priority);
+ if (!skb)
+ return NULL;
+
+ atomic_add(skb->truesize, &sk->sk_omem_alloc);
+ skb->sk = sk;
+ skb->destructor = sock_ofree;
+ return skb;
+}
+
/*
* Allocate a memory block from the socket's option memory buffer.
*/
@@ -2695,6 +2741,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = SK_DEFAULT_STAMP;
+ atomic_set(&sk->sk_zckey, 0);
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;