Diffstat (limited to 'net/core')
-rw-r--r--  net/core/datagram.c          104
-rw-r--r--  net/core/dev.c               254
-rw-r--r--  net/core/dev_ioctl.c          20
-rw-r--r--  net/core/devlink.c             8
-rw-r--r--  net/core/dst.c               300
-rw-r--r--  net/core/fib_rules.c          27
-rw-r--r--  net/core/filter.c            685
-rw-r--r--  net/core/flow_dissector.c     69
-rw-r--r--  net/core/lwt_bpf.c             5
-rw-r--r--  net/core/lwtunnel.c           38
-rw-r--r--  net/core/neighbour.c          94
-rw-r--r--  net/core/net-procfs.c         13
-rw-r--r--  net/core/net-sysfs.c          16
-rw-r--r--  net/core/net_namespace.c      83
-rw-r--r--  net/core/netpoll.c            14
-rw-r--r--  net/core/pktgen.c             58
-rw-r--r--  net/core/rtnetlink.c         199
-rw-r--r--  net/core/secure_seq.c          9
-rw-r--r--  net/core/skbuff.c            193
-rw-r--r--  net/core/sock.c              115
-rw-r--r--  net/core/sysctl_net_core.c     2
21 files changed, 1580 insertions, 726 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c
index db1866f2ffcf..6877c43cc92d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk)
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
-static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
+static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
void *key)
{
unsigned long bits = (unsigned long)key;
@@ -161,6 +161,45 @@ done:
return skb;
}
+struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
+ struct sk_buff_head *queue,
+ unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
+ int *peeked, int *off, int *err,
+ struct sk_buff **last)
+{
+ struct sk_buff *skb;
+ int _off = *off;
+
+ *last = queue->prev;
+ skb_queue_walk(queue, skb) {
+ if (flags & MSG_PEEK) {
+ if (_off >= skb->len && (skb->len || _off ||
+ skb->peeked)) {
+ _off -= skb->len;
+ continue;
+ }
+ if (!skb->len) {
+ skb = skb_set_peeked(skb);
+ if (unlikely(IS_ERR(skb))) {
+ *err = PTR_ERR(skb);
+ return NULL;
+ }
+ }
+ *peeked = 1;
+ refcount_inc(&skb->users);
+ } else {
+ __skb_unlink(skb, queue);
+ if (destructor)
+ destructor(sk, skb);
+ }
+ *off = _off;
+ return skb;
+ }
+ return NULL;
+}
+
/**
* __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
@@ -181,7 +220,7 @@ done:
*
* This function will lock the socket if a skb is returned, so
* the caller needs to unlock the socket in that case (usually by
- * calling skb_free_datagram). Returns NULL with *err set to
+ * calling skb_free_datagram). Returns NULL with @err set to
* -EAGAIN if no data was available or to some other value if an
* error was detected.
*
@@ -222,40 +261,14 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
* Look at current nfs client by the way...
* However, this function was correct in any case. 8)
*/
- int _off = *off;
-
- *last = (struct sk_buff *)queue;
spin_lock_irqsave(&queue->lock, cpu_flags);
- skb_queue_walk(queue, skb) {
- *last = skb;
- if (flags & MSG_PEEK) {
- if (_off >= skb->len && (skb->len || _off ||
- skb->peeked)) {
- _off -= skb->len;
- continue;
- }
- if (!skb->len) {
- skb = skb_set_peeked(skb);
- if (IS_ERR(skb)) {
- error = PTR_ERR(skb);
- spin_unlock_irqrestore(&queue->lock,
- cpu_flags);
- goto no_packet;
- }
- }
- *peeked = 1;
- atomic_inc(&skb->users);
- } else {
- __skb_unlink(skb, queue);
- if (destructor)
- destructor(sk, skb);
- }
- spin_unlock_irqrestore(&queue->lock, cpu_flags);
- *off = _off;
- return skb;
- }
-
+ skb = __skb_try_recv_from_queue(sk, queue, flags, destructor,
+ peeked, off, &error, last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
+ if (error)
+ goto no_packet;
+ if (skb)
+ return skb;
if (!sk_can_busy_loop(sk))
break;
@@ -317,9 +330,7 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
{
bool slow;
- if (likely(atomic_read(&skb->users) == 1))
- smp_rmb();
- else if (likely(!atomic_dec_and_test(&skb->users))) {
+ if (!skb_unref(skb)) {
sk_peek_offset_bwd(sk, len);
return;
}
@@ -335,8 +346,8 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
}
EXPORT_SYMBOL(__skb_free_datagram_locked);
-int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
- unsigned int flags,
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
+ struct sk_buff *skb, unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb))
{
@@ -344,15 +355,15 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
if (flags & MSG_PEEK) {
err = -ENOENT;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- atomic_dec(&skb->users);
+ spin_lock_bh(&sk_queue->lock);
+ if (skb == skb_peek(sk_queue)) {
+ __skb_unlink(skb, sk_queue);
+ refcount_dec(&skb->users);
if (destructor)
destructor(sk, skb);
err = 0;
}
- spin_unlock_bh(&sk->sk_receive_queue.lock);
+ spin_unlock_bh(&sk_queue->lock);
}
atomic_inc(&sk->sk_drops);
@@ -383,7 +394,8 @@ EXPORT_SYMBOL(__sk_queue_drop_skb);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
- int err = __sk_queue_drop_skb(sk, skb, flags, NULL);
+ int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags,
+ NULL);
kfree_skb(skb);
sk_mem_reclaim_partial(sk);
@@ -602,7 +614,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
skb->data_len += copied;
skb->len += copied;
skb->truesize += truesize;
- atomic_add(truesize, &skb->sk->sk_wmem_alloc);
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
while (copied) {
int size = min_t(int, copied, PAGE_SIZE - start);
skb_fill_page_desc(skb, frag++, pages[n], start, size);
diff --git a/net/core/dev.c b/net/core/dev.c
index 96cf83da0d66..7098fba52be1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -105,6 +105,7 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
@@ -142,6 +143,7 @@
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>
+#include <linux/sctp.h>
#include "net-sysfs.h"
@@ -161,6 +163,7 @@ static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
struct net_device *dev,
struct netdev_notifier_info *info);
+static struct napi_struct *napi_by_id(unsigned int napi_id);
/*
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -865,6 +868,31 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
EXPORT_SYMBOL(dev_get_by_index);
/**
+ * dev_get_by_napi_id - find a device by napi_id
+ * @napi_id: ID of the NAPI struct
+ *
+ * Search for an interface by NAPI ID. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not had
+ * its reference counter increased so the caller must be careful
+ * about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_napi_id(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (napi_id < MIN_NAPI_ID)
+ return NULL;
+
+ napi = napi_by_id(napi_id);
+
+ return napi ? napi->dev : NULL;
+}
+EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/**
* netdev_get_name - get a netdevice name, knowing its ifindex.
* @net: network namespace
* @name: a pointer to the buffer where the name will be stored.
@@ -1253,8 +1281,9 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
if (!new_ifalias)
return -ENOMEM;
dev->ifalias = new_ifalias;
+ memcpy(dev->ifalias, alias, len);
+ dev->ifalias[len] = 0;
- strlcpy(dev->ifalias, alias, len+1);
return len;
}
@@ -1833,7 +1862,7 @@ static inline int deliver_skb(struct sk_buff *skb,
{
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
return -ENOMEM;
- atomic_inc(&skb->users);
+ refcount_inc(&skb->users);
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
@@ -2455,10 +2484,10 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
if (unlikely(!skb))
return;
- if (likely(atomic_read(&skb->users) == 1)) {
+ if (likely(refcount_read(&skb->users) == 1)) {
smp_rmb();
- atomic_set(&skb->users, 0);
- } else if (likely(!atomic_dec_and_test(&skb->users))) {
+ refcount_set(&skb->users, 0);
+ } else if (likely(!refcount_dec_and_test(&skb->users))) {
return;
}
get_kfree_skb_cb(skb)->reason = reason;
@@ -2611,6 +2640,47 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+int skb_crc32c_csum_help(struct sk_buff *skb)
+{
+ __le32 crc32c_csum;
+ int ret = 0, offset, start;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ goto out;
+
+ if (unlikely(skb_is_gso(skb)))
+ goto out;
+
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity : checksum could be wrong.
+ */
+ if (unlikely(skb_has_shared_frag(skb))) {
+ ret = __skb_linearize(skb);
+ if (ret)
+ goto out;
+ }
+ start = skb_checksum_start_offset(skb);
+ offset = start + offsetof(struct sctphdr, checksum);
+ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + sizeof(__le32))) {
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (ret)
+ goto out;
+ }
+ crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
+ skb->len - start, ~(__u32)0,
+ crc32c_csum_stub));
+ *(__le32 *)(skb->data + offset) = crc32c_csum;
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->csum_not_inet = 0;
+out:
+ return ret;
+}
+
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
@@ -2953,6 +3023,17 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
return skb;
}
+int skb_csum_hwoffload_help(struct sk_buff *skb,
+ const netdev_features_t features)
+{
+ if (unlikely(skb->csum_not_inet))
+ return !!(features & NETIF_F_SCTP_CRC) ? 0 :
+ skb_crc32c_csum_help(skb);
+
+ return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
+}
+EXPORT_SYMBOL(skb_csum_hwoffload_help);
+
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
netdev_features_t features;
@@ -2991,8 +3072,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
- if (!(features & NETIF_F_CSUM_MASK) &&
- skb_checksum_help(skb))
+ if (skb_csum_hwoffload_help(skb, features))
goto out_kfree_skb;
}
}
@@ -3178,7 +3258,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
qdisc_bstats_cpu_update(cl->q, skb);
- switch (tc_classify(skb, cl, &cl_res, false)) {
+ switch (tcf_classify(skb, cl, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -3190,6 +3270,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
*ret = NET_XMIT_SUCCESS;
consume_skb(skb);
return NULL;
@@ -3874,7 +3955,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
clist = clist->next;
- WARN_ON(atomic_read(&skb->users));
+ WARN_ON(refcount_read(&skb->users));
if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
trace_consume_skb(skb);
else
@@ -3948,7 +4029,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
skb->tc_at_ingress = 1;
qdisc_bstats_cpu_update(cl->q, skb);
- switch (tc_classify(skb, cl, &cl_res, false)) {
+ switch (tcf_classify(skb, cl, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -3959,6 +4040,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
@@ -4260,13 +4342,12 @@ static struct static_key generic_xdp_needed __read_mostly;
static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp)
{
+ struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
struct bpf_prog *new = xdp->prog;
int ret = 0;
switch (xdp->command) {
- case XDP_SETUP_PROG: {
- struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
-
+ case XDP_SETUP_PROG:
rcu_assign_pointer(dev->xdp_prog, new);
if (old)
bpf_prog_put(old);
@@ -4278,10 +4359,10 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp)
dev_disable_lro(dev);
}
break;
- }
case XDP_QUERY_PROG:
- xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog);
+ xdp->prog_attached = !!old;
+ xdp->prog_id = old ? old->aux->id : 0;
break;
default:
@@ -4636,9 +4717,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (netif_elide_gro(skb->dev))
goto normal;
- if (skb->csum_bad)
- goto normal;
-
gro_list_prepare(napi, skb);
rcu_read_lock();
@@ -4766,6 +4844,13 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
}
EXPORT_SYMBOL(gro_find_complete_by_type);
+static void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ secpath_reset(skb);
+ kmem_cache_free(skbuff_head_cache, skb);
+}
+
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
@@ -4779,13 +4864,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
break;
case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
- skb_dst_drop(skb);
- secpath_reset(skb);
- kmem_cache_free(skbuff_head_cache, skb);
- } else {
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
__kfree_skb(skb);
- }
break;
case GRO_HELD:
@@ -4857,10 +4939,16 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
break;
case GRO_DROP:
- case GRO_MERGED_FREE:
napi_reuse_skb(napi, skb);
break;
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ napi_reuse_skb(napi, skb);
+ break;
+
case GRO_MERGED:
case GRO_CONSUMED:
break;
@@ -4948,6 +5036,19 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
+static void net_rps_send_ipi(struct softnet_data *remsd)
+{
+#ifdef CONFIG_RPS
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu))
+ smp_call_function_single_async(remsd->cpu, &remsd->csd);
+ remsd = next;
+ }
+#endif
+}
+
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
@@ -4963,14 +5064,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
local_irq_enable();
/* Send pending IPI's to kick RPS processing on remote cpus. */
- while (remsd) {
- struct softnet_data *next = remsd->rps_ipi_next;
-
- if (cpu_online(remsd->cpu))
- smp_call_function_single_async(remsd->cpu,
- &remsd->csd);
- remsd = next;
- }
+ net_rps_send_ipi(remsd);
} else
#endif
local_irq_enable();
@@ -5199,8 +5293,6 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
if (rc == BUSY_POLL_BUDGET)
__napi_schedule(napi);
local_bh_enable();
- if (local_softirq_pending())
- do_softirq();
}
void napi_busy_loop(unsigned int napi_id,
@@ -6852,6 +6944,39 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
EXPORT_SYMBOL(dev_change_proto_down);
+u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id)
+{
+ struct netdev_xdp xdp;
+
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+
+ /* Query must always succeed. */
+ WARN_ON(xdp_op(dev, &xdp) < 0);
+ if (prog_id)
+ *prog_id = xdp.prog_id;
+
+ return xdp.prog_attached;
+}
+
+static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op,
+ struct netlink_ext_ack *extack, u32 flags,
+ struct bpf_prog *prog)
+{
+ struct netdev_xdp xdp;
+
+ memset(&xdp, 0, sizeof(xdp));
+ if (flags & XDP_FLAGS_HW_MODE)
+ xdp.command = XDP_SETUP_PROG_HW;
+ else
+ xdp.command = XDP_SETUP_PROG;
+ xdp.extack = extack;
+ xdp.flags = flags;
+ xdp.prog = prog;
+
+ return xdp_op(dev, &xdp);
+}
+
/**
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
@@ -6864,41 +6989,34 @@ EXPORT_SYMBOL(dev_change_proto_down);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, u32 flags)
{
- int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
- struct netdev_xdp xdp;
+ xdp_op_t xdp_op, xdp_chk;
int err;
ASSERT_RTNL();
- xdp_op = ops->ndo_xdp;
+ xdp_op = xdp_chk = ops->ndo_xdp;
+ if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
+ return -EOPNOTSUPP;
if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
xdp_op = generic_xdp_install;
+ if (xdp_op == xdp_chk)
+ xdp_chk = generic_xdp_install;
if (fd >= 0) {
- if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG;
-
- err = xdp_op(dev, &xdp);
- if (err < 0)
- return err;
- if (xdp.prog_attached)
- return -EBUSY;
- }
+ if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL))
+ return -EEXIST;
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
+ __dev_xdp_attached(dev, xdp_op, NULL))
+ return -EBUSY;
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_SETUP_PROG;
- xdp.extack = extack;
- xdp.prog = prog;
-
- err = xdp_op(dev, &xdp);
+ err = dev_xdp_install(dev, xdp_op, extack, flags, prog);
if (err < 0 && prog)
bpf_prog_put(prog);
@@ -6989,7 +7107,7 @@ static void rollback_registered_many(struct list_head *head)
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
GFP_KERNEL);
/*
@@ -7482,6 +7600,8 @@ out:
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
goto out;
}
EXPORT_SYMBOL(register_netdevice);
@@ -7689,8 +7809,10 @@ void netdev_run_todo(void)
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
WARN_ON(dev->dn_ptr);
- if (dev->destructor)
- dev->destructor(dev);
+ if (dev->priv_destructor)
+ dev->priv_destructor(dev);
+ if (dev->needs_free_netdev)
+ free_netdev(dev);
/* Report a network device has been unregistered */
rtnl_lock();
@@ -7713,7 +7835,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
{
#if BITS_PER_LONG == 64
BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
- memcpy(stats64, netdev_stats, sizeof(*stats64));
+ memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
/* zero out counters that only exist in rtnl_link_stats64 */
memset((char *)stats64 + sizeof(*netdev_stats), 0,
sizeof(*stats64) - sizeof(*netdev_stats));
@@ -7755,9 +7877,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
- storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
- storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
- storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
+ storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
+ storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -8173,7 +8295,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
struct sk_buff **list_skb;
struct sk_buff *skb;
unsigned int cpu;
- struct softnet_data *sd, *oldsd;
+ struct softnet_data *sd, *oldsd, *remsd = NULL;
local_irq_disable();
cpu = smp_processor_id();
@@ -8214,6 +8336,13 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+#ifdef CONFIG_RPS
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
+#endif
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx_ni(skb);
@@ -8563,7 +8692,6 @@ static int __init net_dev_init(void)
rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
NULL, dev_cpu_dead);
WARN_ON(rc < 0);
- dst_subsys_init();
rc = 0;
out:
return rc;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b94b1d293506..82fd4c9c4a1b 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -225,6 +225,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_FILTER_PTP_V2_EVENT:
case HWTSTAMP_FILTER_PTP_V2_SYNC:
case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+ case HWTSTAMP_FILTER_NTP_ALL:
rx_filter_valid = 1;
break;
}
@@ -410,6 +411,22 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (cmd == SIOCGIFNAME)
return dev_ifname(net, (struct ifreq __user *)arg);
+ /*
+ * Take care of Wireless Extensions. Unfortunately struct iwreq
+ * isn't a proper subset of struct ifreq (it's 8 byte shorter)
+ * so we need to treat it specially, otherwise applications may
+ * fault if the struct they're passing happens to land at the
+ * end of a mapped page.
+ */
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
+ struct iwreq iwr;
+
+ if (copy_from_user(&iwr, arg, sizeof(iwr)))
+ return -EFAULT;
+
+ return wext_handle_ioctl(net, &iwr, cmd, arg);
+ }
+
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
return -EFAULT;
@@ -559,9 +576,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ret = -EFAULT;
return ret;
}
- /* Take care of Wireless Extensions */
- if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
- return wext_handle_ioctl(net, &ifr, cmd, arg);
return -ENOTTY;
}
}
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b0b87a292e7c..a0adfc31a3fe 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1680,8 +1680,10 @@ start_again:
hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
&devlink_nl_family, NLM_F_MULTI, cmd);
- if (!hdr)
+ if (!hdr) {
+ nlmsg_free(skb);
return -EMSGSIZE;
+ }
if (devlink_nl_put_handle(skb, devlink))
goto nla_put_failure;
@@ -2098,8 +2100,10 @@ start_again:
hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
&devlink_nl_family, NLM_F_MULTI, cmd);
- if (!hdr)
+ if (!hdr) {
+ nlmsg_free(skb);
return -EMSGSIZE;
+ }
if (devlink_nl_put_handle(skb, devlink))
goto nla_put_failure;
diff --git a/net/core/dst.c b/net/core/dst.c
index 960e503b5a52..00aa972ad1a1 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -42,108 +42,6 @@
* to dirty as few cache lines as possible in __dst_free().
* As this is not a very strong hint, we dont force an alignment on SMP.
*/
-static struct {
- spinlock_t lock;
- struct dst_entry *list;
- unsigned long timer_inc;
- unsigned long timer_expires;
-} dst_garbage = {
- .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
- .timer_inc = DST_GC_MAX,
-};
-static void dst_gc_task(struct work_struct *work);
-static void ___dst_free(struct dst_entry *dst);
-
-static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
-
-static DEFINE_MUTEX(dst_gc_mutex);
-/*
- * long lived entries are maintained in this list, guarded by dst_gc_mutex
- */
-static struct dst_entry *dst_busy_list;
-
-static void dst_gc_task(struct work_struct *work)
-{
- int delayed = 0;
- int work_performed = 0;
- unsigned long expires = ~0L;
- struct dst_entry *dst, *next, head;
- struct dst_entry *last = &head;
-
- mutex_lock(&dst_gc_mutex);
- next = dst_busy_list;
-
-loop:
- while ((dst = next) != NULL) {
- next = dst->next;
- prefetch(&next->next);
- cond_resched();
- if (likely(atomic_read(&dst->__refcnt))) {
- last->next = dst;
- last = dst;
- delayed++;
- continue;
- }
- work_performed++;
-
- dst = dst_destroy(dst);
- if (dst) {
- /* NOHASH and still referenced. Unless it is already
- * on gc list, invalidate it and add to gc list.
- *
- * Note: this is temporary. Actually, NOHASH dst's
- * must be obsoleted when parent is obsoleted.
- * But we do not have state "obsoleted, but
- * referenced by parent", so it is right.
- */
- if (dst->obsolete > 0)
- continue;
-
- ___dst_free(dst);
- dst->next = next;
- next = dst;
- }
- }
-
- spin_lock_bh(&dst_garbage.lock);
- next = dst_garbage.list;
- if (next) {
- dst_garbage.list = NULL;
- spin_unlock_bh(&dst_garbage.lock);
- goto loop;
- }
- last->next = NULL;
- dst_busy_list = head.next;
- if (!dst_busy_list)
- dst_garbage.timer_inc = DST_GC_MAX;
- else {
- /*
- * if we freed less than 1/10 of delayed entries,
- * we can sleep longer.
- */
- if (work_performed <= delayed/10) {
- dst_garbage.timer_expires += dst_garbage.timer_inc;
- if (dst_garbage.timer_expires > DST_GC_MAX)
- dst_garbage.timer_expires = DST_GC_MAX;
- dst_garbage.timer_inc += DST_GC_INC;
- } else {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- }
- expires = dst_garbage.timer_expires;
- /*
- * if the next desired timer is more than 4 seconds in the
- * future then round the timer to whole seconds
- */
- if (expires > 4*HZ)
- expires = round_jiffies_relative(expires);
- schedule_delayed_work(&dst_gc_work, expires);
- }
-
- spin_unlock_bh(&dst_garbage.lock);
- mutex_unlock(&dst_gc_mutex);
-}
-
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
@@ -151,13 +49,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(dst_discard_out);
-const u32 dst_default_metrics[RTAX_MAX + 1] = {
+const struct dst_metrics dst_default_metrics = {
/* This initializer is needed to force linker to place this variable
* into const section. Otherwise it might end into bss section.
* We really want to avoid false sharing on this variable, and catch
* any writes on it.
*/
- [RTAX_MAX] = 0xdeadbeef,
+ .refcnt = ATOMIC_INIT(1),
};
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
@@ -169,7 +67,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
if (dev)
dev_hold(dev);
dst->ops = ops;
- dst_init_metrics(dst, dst_default_metrics, true);
+ dst_init_metrics(dst, dst_default_metrics.metrics, true);
dst->expires = 0UL;
dst->path = dst;
dst->from = NULL;
@@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
}
EXPORT_SYMBOL(dst_alloc);
-static void ___dst_free(struct dst_entry *dst)
-{
- /* The first case (dev==NULL) is required, when
- protocol module is unloaded.
- */
- if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- }
- dst->obsolete = DST_OBSOLETE_DEAD;
-}
-
-void __dst_free(struct dst_entry *dst)
-{
- spin_lock_bh(&dst_garbage.lock);
- ___dst_free(dst);
- dst->next = dst_garbage.list;
- dst_garbage.list = dst;
- if (dst_garbage.timer_inc > DST_GC_INC) {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- mod_delayed_work(system_wq, &dst_gc_work,
- dst_garbage.timer_expires);
- }
- spin_unlock_bh(&dst_garbage.lock);
-}
-EXPORT_SYMBOL(__dst_free);
-
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct dst_entry *child;
smp_rmb();
-again:
child = dst->child;
if (!(dst->flags & DST_NOCOUNT))
@@ -269,20 +138,8 @@ again:
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
- if (dst) {
- int nohash = dst->flags & DST_NOHASH;
-
- if (atomic_dec_and_test(&dst->__refcnt)) {
- /* We were real parent of this dst, so kill child. */
- if (nohash)
- goto again;
- } else {
- /* Child is still referenced, return it for freeing. */
- if (nohash)
- return dst;
- /* Child is still in his hash table */
- }
- }
+ if (dst)
+ dst_release_immediate(dst);
return NULL;
}
EXPORT_SYMBOL(dst_destroy);
@@ -292,47 +149,88 @@ static void dst_destroy_rcu(struct rcu_head *head)
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
dst = dst_destroy(dst);
- if (dst)
- __dst_free(dst);
}
+/* Operations to mark dst as DEAD and clean up the net device referenced
+ * by dst:
+ * 1. put the dst under loopback interface and discard all tx/rx packets
+ * on this route.
+ * 2. release the net_device
+ * This function should be called when removing routes from the fib tree
+ * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
+ * make the next dst_ops->check() fail.
+ */
+void dst_dev_put(struct dst_entry *dst)
+{
+ struct net_device *dev = dst->dev;
+
+ dst->obsolete = DST_OBSOLETE_DEAD;
+ if (dst->ops->ifdown)
+ dst->ops->ifdown(dst, dev, true);
+ dst->input = dst_discard;
+ dst->output = dst_discard_out;
+ dst->dev = dev_net(dst->dev)->loopback_dev;
+ dev_hold(dst->dev);
+ dev_put(dev);
+}
+EXPORT_SYMBOL(dst_dev_put);
+
void dst_release(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
- unsigned short nocache = dst->flags & DST_NOCACHE;
newrefcnt = atomic_dec_return(&dst->__refcnt);
if (unlikely(newrefcnt < 0))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
- if (!newrefcnt && unlikely(nocache))
+ if (!newrefcnt)
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
+void dst_release_immediate(struct dst_entry *dst)
+{
+ if (dst) {
+ int newrefcnt;
+
+ newrefcnt = atomic_dec_return(&dst->__refcnt);
+ if (unlikely(newrefcnt < 0))
+ net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+ __func__, dst, newrefcnt);
+ if (!newrefcnt)
+ dst_destroy(dst);
+ }
+}
+EXPORT_SYMBOL(dst_release_immediate);
+
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
- u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+ struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
if (p) {
- u32 *old_p = __DST_METRICS_PTR(old);
+ struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
unsigned long prev, new;
- memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+ atomic_set(&p->refcnt, 1);
+ memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));
new = (unsigned long) p;
prev = cmpxchg(&dst->_metrics, old, new);
if (prev != old) {
kfree(p);
- p = __DST_METRICS_PTR(prev);
+ p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
if (prev & DST_METRICS_READ_ONLY)
p = NULL;
+ } else if (prev & DST_METRICS_REFCOUNTED) {
+ if (atomic_dec_and_test(&old_p->refcnt))
+ kfree(old_p);
}
}
- return p;
+ BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
+ return (u32 *)p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);
@@ -341,7 +239,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
unsigned long prev, new;
- new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
+ new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
prev = cmpxchg(&dst->_metrics, old, new);
if (prev == old)
kfree(__DST_METRICS_PTR(old));
@@ -366,21 +264,25 @@ static int dst_md_discard(struct sk_buff *skb)
return 0;
}
-static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
+static void __metadata_dst_init(struct metadata_dst *md_dst,
+ enum metadata_type type, u8 optslen)
+
{
struct dst_entry *dst;
dst = &md_dst->dst;
dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
- DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
+ DST_METADATA | DST_NOCOUNT);
dst->input = dst_md_discard;
dst->output = dst_md_discard_out;
memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+ md_dst->type = type;
}
-struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
+ gfp_t flags)
{
struct metadata_dst *md_dst;
@@ -388,7 +290,7 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
if (!md_dst)
return NULL;
- __metadata_dst_init(md_dst, optslen);
+ __metadata_dst_init(md_dst, type, optslen);
return md_dst;
}
@@ -402,7 +304,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)
kfree(md_dst);
}
-struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
+struct metadata_dst __percpu *
+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
int cpu;
struct metadata_dst __percpu *md_dst;
@@ -413,77 +316,8 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
return NULL;
for_each_possible_cpu(cpu)
- __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen);
+ __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);
return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
-
-/* Dirty hack. We did it in 2.2 (in __dst_free),
- * we have _very_ good reasons not to repeat
- * this mistake in 2.3, but we have no choice
- * now. _It_ _is_ _explicit_ _deliberate_
- * _race_ _condition_.
- *
- * Commented and originally written by Alexey.
- */
-static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
-{
- if (dst->ops->ifdown)
- dst->ops->ifdown(dst, dev, unregister);
-
- if (dev != dst->dev)
- return;
-
- if (!unregister) {
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- } else {
- dst->dev = dev_net(dst->dev)->loopback_dev;
- dev_hold(dst->dev);
- dev_put(dev);
- }
-}
-
-static int dst_dev_event(struct notifier_block *this, unsigned long event,
- void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct dst_entry *dst, *last = NULL;
-
- switch (event) {
- case NETDEV_UNREGISTER_FINAL:
- case NETDEV_DOWN:
- mutex_lock(&dst_gc_mutex);
- for (dst = dst_busy_list; dst; dst = dst->next) {
- last = dst;
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- }
-
- spin_lock_bh(&dst_garbage.lock);
- dst = dst_garbage.list;
- dst_garbage.list = NULL;
- spin_unlock_bh(&dst_garbage.lock);
-
- if (last)
- last->next = dst;
- else
- dst_busy_list = dst;
- for (; dst; dst = dst->next)
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- mutex_unlock(&dst_gc_mutex);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block dst_dev_notifier = {
- .notifier_call = dst_dev_event,
- .priority = -10, /* must be called after other network notifiers */
-};
-
-void __init dst_subsys_init(void)
-{
- register_netdevice_notifier(&dst_dev_notifier);
-}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index f21c4d3aeae0..a0093e1b0235 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -46,7 +46,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
if (r == NULL)
return -ENOMEM;
- atomic_set(&r->refcnt, 1);
+ refcount_set(&r->refcnt, 1);
r->action = FR_ACT_TO_TBL;
r->pref = pref;
r->table = table;
@@ -283,7 +283,7 @@ jumped:
if (err != -EAGAIN) {
if ((arg->flags & FIB_LOOKUP_NOREF) ||
- likely(atomic_inc_not_zero(&rule->refcnt))) {
+ likely(refcount_inc_not_zero(&rule->refcnt))) {
arg->rule = rule;
goto out;
}
@@ -517,7 +517,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
last = r;
}
- fib_rule_get(rule);
+ refcount_set(&rule->refcnt, 1);
if (last)
list_add_rcu(&rule->list, &last->list);
@@ -568,7 +568,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *tmp;
+ struct fib_rule *rule, *r;
struct nlattr *tb[FRA_MAX+1];
struct fib_kuid_range range;
int err = -EINVAL;
@@ -668,16 +668,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
/*
* Check if this rule is a target to any of them. If so,
+ * adjust to the next one with the same preference or
* disable them. As this operation is eventually very
- * expensive, it is only performed if goto rules have
- * actually been added.
+ * expensive, it is only performed if goto rules, except
+ * current if it is goto rule, have actually been added.
*/
if (ops->nr_goto_rules > 0) {
- list_for_each_entry(tmp, &ops->rules_list, list) {
- if (rtnl_dereference(tmp->ctarget) == rule) {
- RCU_INIT_POINTER(tmp->ctarget, NULL);
+ struct fib_rule *n;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
ops->unresolved_rules++;
- }
}
}
diff --git a/net/core/filter.c b/net/core/filter.c
index a253a6197e6b..c7f737058d89 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -54,6 +54,7 @@
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
+#include <net/tcp.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -352,7 +353,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
* bpf_convert_filter - convert filter program
* @prog: the user passed filter program
* @len: the length of the user passed filter program
- * @new_prog: buffer where converted program will be stored
+ * @new_prog: allocated 'struct bpf_prog' or NULL
* @new_len: pointer to store length of converted program
*
* Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
@@ -364,14 +365,13 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
*
* 2) 2nd pass to remap in two passes: 1st pass finds new
* jump offsets, 2nd pass remapping:
- * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
* bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
*/
static int bpf_convert_filter(struct sock_filter *prog, int len,
- struct bpf_insn *new_prog, int *new_len)
+ struct bpf_prog *new_prog, int *new_len)
{
- int new_flen = 0, pass = 0, target, i;
- struct bpf_insn *new_insn;
+ int new_flen = 0, pass = 0, target, i, stack_off;
+ struct bpf_insn *new_insn, *first_insn = NULL;
struct sock_filter *fp;
int *addrs = NULL;
u8 bpf_src;
@@ -383,6 +383,7 @@ static int bpf_convert_filter(struct sock_filter *prog, int len,
return -EINVAL;
if (new_prog) {
+ first_insn = new_prog->insnsi;
addrs = kcalloc(len, sizeof(*addrs),
GFP_KERNEL | __GFP_NOWARN);
if (!addrs)
@@ -390,11 +391,11 @@ static int bpf_convert_filter(struct sock_filter *prog, int len,
}
do_pass:
- new_insn = new_prog;
+ new_insn = first_insn;
fp = prog;
/* Classic BPF related prologue emission. */
- if (new_insn) {
+ if (new_prog) {
/* Classic BPF expects A and X to be reset first. These need
* to be guaranteed to be the first two instructions.
*/
@@ -415,7 +416,7 @@ do_pass:
struct bpf_insn *insn = tmp_insns;
if (addrs)
- addrs[i] = new_insn - new_prog;
+ addrs[i] = new_insn - first_insn;
switch (fp->code) {
/* All arithmetic insns and skb loads map as-is. */
@@ -561,17 +562,25 @@ do_pass:
/* Store to stack. */
case BPF_ST:
case BPF_STX:
+ stack_off = fp->k * 4 + 4;
*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
BPF_ST ? BPF_REG_A : BPF_REG_X,
- -(BPF_MEMWORDS - fp->k) * 4);
+ -stack_off);
+ /* check_load_and_stores() verifies that classic BPF can
+ * load from stack only after write, so tracking
+ * stack_depth for ST|STX insns is enough
+ */
+ if (new_prog && new_prog->aux->stack_depth < stack_off)
+ new_prog->aux->stack_depth = stack_off;
break;
/* Load from stack. */
case BPF_LD | BPF_MEM:
case BPF_LDX | BPF_MEM:
+ stack_off = fp->k * 4 + 4;
*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
BPF_REG_A : BPF_REG_X, BPF_REG_FP,
- -(BPF_MEMWORDS - fp->k) * 4);
+ -stack_off);
break;
/* A = K or X = K */
@@ -619,13 +628,13 @@ do_pass:
if (!new_prog) {
/* Only calculating new length. */
- *new_len = new_insn - new_prog;
+ *new_len = new_insn - first_insn;
return 0;
}
pass++;
- if (new_flen != new_insn - new_prog) {
- new_flen = new_insn - new_prog;
+ if (new_flen != new_insn - first_insn) {
+ new_flen = new_insn - first_insn;
if (pass > 2)
goto err;
goto do_pass;
@@ -1017,7 +1026,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
fp->len = new_len;
/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
- err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
+ err = bpf_convert_filter(old_prog, old_len, fp, &new_len);
if (err)
/* 2nd bpf_convert_filter() can fail only if it fails
* to allocate memory, remapping must succeed. Note,
@@ -1866,6 +1875,24 @@ static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
+{
+ /* Set user specified hash as L4(+), so that it gets returned
+ * on skb_get_hash() call unless BPF prog later on triggers a
+ * skb_clear_hash().
+ */
+ __skb_set_sw_hash(skb, hash, true);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_proto = {
+ .func = bpf_set_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
u16, vlan_tci)
{
@@ -1985,7 +2012,7 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
- u32 off = skb->network_header - skb->mac_header;
+ u32 off = skb_mac_header_len(skb);
int ret;
ret = skb_cow(skb, len_diff);
@@ -2021,7 +2048,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
- u32 off = skb->network_header - skb->mac_header;
+ u32 off = skb_mac_header_len(skb);
int ret;
ret = skb_unclone(skb, GFP_ATOMIC);
@@ -2127,6 +2154,124 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = {
.arg2_type = ARG_ANYTHING,
};
+static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return sizeof(struct iphdr);
+ case htons(ETH_P_IPV6):
+ return sizeof(struct ipv6hdr);
+ default:
+ return ~0U;
+ }
+}
+
+static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
+{
+ u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
+ int ret;
+
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ /* Due to header grow, MSS needs to be downgraded. */
+ skb_shinfo(skb)->gso_size -= len_diff;
+ /* Header must be checked, and gso_segs recomputed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
+ return 0;
+}
+
+static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
+{
+ u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
+ int ret;
+
+ ret = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (skb_is_gso(skb)) {
+ /* Due to header shrink, MSS can be upgraded. */
+ skb_shinfo(skb)->gso_size += len_diff;
+ /* Header must be checked, and gso_segs recomputed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
+ return 0;
+}
+
+static u32 __bpf_skb_max_len(const struct sk_buff *skb)
+{
+ return skb->dev->mtu + skb->dev->hard_header_len;
+}
+
+static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
+{
+ bool trans_same = skb->transport_header == skb->network_header;
+ u32 len_cur, len_diff_abs = abs(len_diff);
+ u32 len_min = bpf_skb_net_base_len(skb);
+ u32 len_max = __bpf_skb_max_len(skb);
+ __be16 proto = skb->protocol;
+ bool shrink = len_diff < 0;
+ int ret;
+
+ if (unlikely(len_diff_abs > 0xfffU))
+ return -EFAULT;
+ if (unlikely(proto != htons(ETH_P_IP) &&
+ proto != htons(ETH_P_IPV6)))
+ return -ENOTSUPP;
+
+ len_cur = skb->len - skb_network_offset(skb);
+ if (skb_transport_header_was_set(skb) && !trans_same)
+ len_cur = skb_network_header_len(skb);
+ if ((shrink && (len_diff_abs >= len_cur ||
+ len_cur - len_diff_abs < len_min)) ||
+ (!shrink && (skb->len + len_diff_abs > len_max &&
+ !skb_is_gso(skb))))
+ return -ENOTSUPP;
+
+ ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
+ bpf_skb_net_grow(skb, len_diff_abs);
+
+ bpf_compute_data_end(skb);
+ return 0;
+}
+
+BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+ u32, mode, u64, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ if (likely(mode == BPF_ADJ_ROOM_NET))
+ return bpf_skb_adjust_net(skb, len_diff);
+
+ return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
+ .func = bpf_skb_adjust_room,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
u32 min_len = skb_network_offset(skb);
@@ -2139,11 +2284,6 @@ static u32 __bpf_skb_min_len(const struct sk_buff *skb)
return min_len;
}
-static u32 __bpf_skb_max_len(const struct sk_buff *skb)
-{
- return skb->dev->mtu + skb->dev->hard_header_len;
-}
-
static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
{
unsigned int old_len = skb->len;
@@ -2280,7 +2420,9 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_skb_change_proto ||
func == bpf_skb_change_head ||
func == bpf_skb_change_tail ||
+ func == bpf_skb_adjust_room ||
func == bpf_skb_pull_data ||
+ func == bpf_clone_redirect ||
func == bpf_l3_csum_replace ||
func == bpf_l4_csum_replace ||
func == bpf_xdp_adjust_head)
@@ -2538,6 +2680,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
* that is holding verifier mutex.
*/
md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+ METADATA_IP_TUNNEL,
GFP_KERNEL);
if (!md_dst)
return NULL;
@@ -2644,6 +2787,110 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ struct sock *sk = bpf_sock->sk;
+ int ret = 0;
+ int val;
+
+ if (!sk_fullsock(sk))
+ return -EINVAL;
+
+ if (level == SOL_SOCKET) {
+ if (optlen != sizeof(int))
+ return -EINVAL;
+ val = *((int *)optval);
+
+ /* Only some socketops are supported */
+ switch (optname) {
+ case SO_RCVBUF:
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ break;
+ case SO_SNDBUF:
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ break;
+ case SO_MAX_PACING_RATE:
+ sk->sk_max_pacing_rate = val;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+ sk->sk_max_pacing_rate);
+ break;
+ case SO_PRIORITY:
+ sk->sk_priority = val;
+ break;
+ case SO_RCVLOWAT:
+ if (val < 0)
+ val = INT_MAX;
+ sk->sk_rcvlowat = val ? : 1;
+ break;
+ case SO_MARK:
+ sk->sk_mark = val;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+#ifdef CONFIG_INET
+ } else if (level == SOL_TCP &&
+ sk->sk_prot->setsockopt == tcp_setsockopt) {
+ if (optname == TCP_CONGESTION) {
+ char name[TCP_CA_NAME_MAX];
+
+ strncpy(name, optval, min_t(long, optlen,
+ TCP_CA_NAME_MAX-1));
+ name[TCP_CA_NAME_MAX-1] = 0;
+ ret = tcp_set_congestion_control(sk, name, false);
+ if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
+ /* replacing an existing ca */
+ tcp_reinit_congestion_control(sk,
+ inet_csk(sk)->icsk_ca_ops);
+ } else {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ val = *((int *)optval);
+ /* Only some options are supported */
+ switch (optname) {
+ case TCP_BPF_IW:
+ if (val <= 0 || tp->data_segs_out > 0)
+ ret = -EINVAL;
+ else
+ tp->snd_cwnd = val;
+ break;
+ case TCP_BPF_SNDCWND_CLAMP:
+ if (val <= 0) {
+ ret = -EINVAL;
+ } else {
+ tp->snd_cwnd_clamp = val;
+ tp->snd_ssthresh = val;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ }
+ ret = -EINVAL;
+#endif
+ } else {
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_setsockopt_proto = {
+ .func = bpf_setsockopt,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
static const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
@@ -2717,6 +2964,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_change_proto_proto;
case BPF_FUNC_skb_change_type:
return &bpf_skb_change_type_proto;
+ case BPF_FUNC_skb_adjust_room:
+ return &bpf_skb_adjust_room_proto;
case BPF_FUNC_skb_change_tail:
return &bpf_skb_change_tail_proto;
case BPF_FUNC_skb_get_tunnel_key:
@@ -2735,6 +2984,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_get_hash_recalc_proto;
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
+ case BPF_FUNC_set_hash:
+ return &bpf_set_hash_proto;
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
@@ -2766,12 +3017,6 @@ xdp_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
-cg_skb_func_proto(enum bpf_func_id func_id)
-{
- return sk_filter_func_proto(func_id);
-}
-
-static const struct bpf_func_proto *
lwt_inout_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -2799,6 +3044,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
+ sock_ops_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_setsockopt:
+ return &bpf_setsockopt_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -2833,8 +3089,11 @@ lwt_xmit_func_proto(enum bpf_func_id func_id)
}
}
-static bool __is_valid_access(int off, int size)
+static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
{
+ const int size_default = sizeof(__u32);
+
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
@@ -2843,15 +3102,25 @@ static bool __is_valid_access(int off, int size)
return false;
switch (off) {
- case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
- if (off + size >
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ if (off + size > offsetofend(struct __sk_buff, cb[4]))
return false;
break;
- default:
- if (size != sizeof(__u32))
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (size != size_default)
return false;
+ break;
+ default:
+ /* Only narrow read access allowed for now. */
+ if (type == BPF_WRITE) {
+ if (size != size_default)
+ return false;
+ } else {
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ }
}
return true;
@@ -2859,43 +3128,41 @@ static bool __is_valid_access(int off, int size)
static bool sk_filter_is_valid_access(int off, int size,
enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
switch (off) {
- case offsetof(struct __sk_buff, tc_classid):
- case offsetof(struct __sk_buff, data):
- case offsetof(struct __sk_buff, data_end):
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_end):
return false;
}
if (type == BPF_WRITE) {
switch (off) {
- case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
default:
return false;
}
}
- return __is_valid_access(off, size);
+ return bpf_skb_is_valid_access(off, size, type, info);
}
static bool lwt_is_valid_access(int off, int size,
enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
switch (off) {
- case offsetof(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
return false;
}
if (type == BPF_WRITE) {
switch (off) {
- case offsetof(struct __sk_buff, mark):
- case offsetof(struct __sk_buff, priority):
- case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
default:
return false;
@@ -2903,20 +3170,20 @@ static bool lwt_is_valid_access(int off, int size,
}
switch (off) {
- case offsetof(struct __sk_buff, data):
- *reg_type = PTR_TO_PACKET;
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
break;
- case offsetof(struct __sk_buff, data_end):
- *reg_type = PTR_TO_PACKET_END;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
break;
}
- return __is_valid_access(off, size);
+ return bpf_skb_is_valid_access(off, size, type, info);
}
static bool sock_filter_is_valid_access(int off, int size,
enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (type == BPF_WRITE) {
switch (off) {
@@ -2979,16 +3246,15 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (type == BPF_WRITE) {
switch (off) {
- case offsetof(struct __sk_buff, mark):
- case offsetof(struct __sk_buff, tc_index):
- case offsetof(struct __sk_buff, priority):
- case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, tc_index):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
default:
return false;
@@ -2996,15 +3262,15 @@ static bool tc_cls_act_is_valid_access(int off, int size,
}
switch (off) {
- case offsetof(struct __sk_buff, data):
- *reg_type = PTR_TO_PACKET;
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
break;
- case offsetof(struct __sk_buff, data_end):
- *reg_type = PTR_TO_PACKET_END;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
break;
}
- return __is_valid_access(off, size);
+ return bpf_skb_is_valid_access(off, size, type, info);
}
static bool __is_valid_xdp_access(int off, int size)
@@ -3021,17 +3287,17 @@ static bool __is_valid_xdp_access(int off, int size)
static bool xdp_is_valid_access(int off, int size,
enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (type == BPF_WRITE)
return false;
switch (off) {
case offsetof(struct xdp_md, data):
- *reg_type = PTR_TO_PACKET;
+ info->reg_type = PTR_TO_PACKET;
break;
case offsetof(struct xdp_md, data_end):
- *reg_type = PTR_TO_PACKET_END;
+ info->reg_type = PTR_TO_PACKET_END;
break;
}
@@ -3044,101 +3310,141 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+static bool __is_valid_sock_ops_access(int off, int size)
+{
+ if (off < 0 || off >= sizeof(struct bpf_sock_ops))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
+}
+
+static bool sock_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sock_ops, op) ...
+ offsetof(struct bpf_sock_ops, replylong[3]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return __is_valid_sock_ops_access(off, size);
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
int off;
switch (si->off) {
case offsetof(struct __sk_buff, len):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, len));
+ bpf_target_off(struct sk_buff, len, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, protocol):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
-
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, protocol));
+ bpf_target_off(struct sk_buff, protocol, 2,
+ target_size));
break;
case offsetof(struct __sk_buff, vlan_proto):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
-
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, vlan_proto));
+ bpf_target_off(struct sk_buff, vlan_proto, 2,
+ target_size));
break;
case offsetof(struct __sk_buff, priority):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
-
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, priority));
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, priority));
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, ingress_ifindex):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, skb_iif));
+ bpf_target_off(struct sk_buff, skb_iif, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, ifindex):
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
si->dst_reg, si->src_reg,
offsetof(struct sk_buff, dev));
*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
- offsetof(struct net_device, ifindex));
+ bpf_target_off(struct net_device, ifindex, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, hash):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, hash));
+ bpf_target_off(struct sk_buff, hash, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, mark):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
-
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, mark));
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, mark));
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, pkt_type):
- return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
- si->src_reg, insn);
+ *target_size = 1;
+ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
+ PKT_TYPE_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
+#endif
+ break;
case offsetof(struct __sk_buff, queue_mapping):
- return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
- si->src_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, queue_mapping, 2,
+ target_size));
+ break;
case offsetof(struct __sk_buff, vlan_present):
- return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
- si->dst_reg, si->src_reg, insn);
-
case offsetof(struct __sk_buff, vlan_tci):
- return convert_skb_access(SKF_AD_VLAN_TAG,
- si->dst_reg, si->src_reg, insn);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, vlan_tci, 2,
+ target_size));
+ if (si->off == offsetof(struct __sk_buff, vlan_tci)) {
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
+ }
+ break;
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ offsetofend(struct __sk_buff, cb[4]) - 1:
BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
offsetof(struct qdisc_skb_cb, data)) %
@@ -3164,6 +3470,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
off -= offsetof(struct __sk_buff, tc_classid);
off += offsetof(struct sk_buff, cb);
off += offsetof(struct qdisc_skb_cb, tc_classid);
+ *target_size = 2;
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
si->src_reg, off);
@@ -3189,14 +3496,14 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
-
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, tc_index));
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, tc_index));
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
#else
if (type == BPF_WRITE)
*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
@@ -3207,10 +3514,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, napi_id):
#if defined(CONFIG_NET_RX_BUSY_POLL)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, napi_id));
+ bpf_target_off(struct sk_buff, napi_id, 4,
+ target_size));
*insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
@@ -3225,7 +3531,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
@@ -3269,22 +3575,22 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct __sk_buff, ifindex):
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
si->dst_reg, si->src_reg,
offsetof(struct sk_buff, dev));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
- offsetof(struct net_device, ifindex));
+ bpf_target_off(struct net_device, ifindex, 4,
+ target_size));
break;
default:
- return bpf_convert_ctx_access(type, si, insn_buf, prog);
+ return bpf_convert_ctx_access(type, si, insn_buf, prog,
+ target_size);
}
return insn - insn_buf;
@@ -3293,7 +3599,7 @@ static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
static u32 xdp_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
@@ -3313,6 +3619,139 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock_ops, op) ...
+ offsetof(struct bpf_sock_ops, replylong[3]):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, op));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, reply));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong));
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, op);
+ off += offsetof(struct bpf_sock_ops_kern, op);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ off);
+ break;
+
+ case offsetof(struct bpf_sock_ops, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
+ offsetof(struct bpf_sock_ops, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
+ offsetof(struct bpf_sock_ops, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
+ }
+ return insn - insn_buf;
+}
+
const struct bpf_verifier_ops sk_filter_prog_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
@@ -3335,7 +3774,7 @@ const struct bpf_verifier_ops xdp_prog_ops = {
};
const struct bpf_verifier_ops cg_skb_prog_ops = {
- .get_func_proto = cg_skb_func_proto,
+ .get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
.test_run = bpf_prog_test_run_skb,
@@ -3362,6 +3801,12 @@ const struct bpf_verifier_ops cg_sock_prog_ops = {
.convert_ctx_access = sock_filter_convert_ctx_access,
};
+const struct bpf_verifier_ops sock_ops_prog_ops = {
+ .get_func_proto = sock_ops_func_proto,
+ .is_valid_access = sock_ops_is_valid_access,
+ .convert_ctx_access = sock_ops_convert_ctx_access,
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
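
Illustrative aside, not part of the patch: the filter.c hunks above replace the per-field BUILD_BUG_ON() size checks with bpf_target_off(), which appears to return the real kernel field offset while recording the field's width through *target_size so narrower loads can be rewritten later. A minimal user-space sketch of that idea; all names here (ctx_view, kernel_obj, view_to_kernel_off) are invented for illustration.

/* Illustrative only -- not part of the patch. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct kernel_obj {            /* stands in for struct sk_buff   */
	uint64_t cookie;
	uint16_t protocol;     /* real width: 2 bytes            */
	uint32_t mark;         /* real width: 4 bytes            */
};

struct ctx_view {              /* stands in for struct __sk_buff */
	uint32_t protocol;     /* always exposed as 4 bytes      */
	uint32_t mark;
};

static int view_to_kernel_off(size_t view_off, size_t *target_size)
{
	switch (view_off) {
	case offsetof(struct ctx_view, protocol):
		*target_size = sizeof(uint16_t);
		return offsetof(struct kernel_obj, protocol);
	case offsetof(struct ctx_view, mark):
		*target_size = sizeof(uint32_t);
		return offsetof(struct kernel_obj, mark);
	default:
		return -1;
	}
}

int main(void)
{
	size_t width;
	int off = view_to_kernel_off(offsetof(struct ctx_view, protocol),
				     &width);

	/* a rewriter would now emit a load of 'width' bytes from 'off'
	 * instead of a fixed 4-byte load */
	printf("kernel offset %d, load width %zu\n", off, width);
	return 0;
}
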
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 28d94bce4df8..fc5fc4594c90 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -18,6 +18,7 @@
#include <linux/stddef.h>
#include <linux/if_ether.h>
#include <linux/mpls.h>
+#include <linux/tcp.h>
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
@@ -342,6 +343,64 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
return FLOW_DISSECT_RET_OUT_PROTO_AGAIN;
}
+static void
+__skb_flow_dissect_tcp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, int thoff, int hlen)
+{
+ struct flow_dissector_key_tcp *key_tcp;
+ struct tcphdr *th, _th;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP))
+ return;
+
+ th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th);
+ if (!th)
+ return;
+
+ if (unlikely(__tcp_hdrlen(th) < sizeof(_th)))
+ return;
+
+ key_tcp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_TCP,
+ target_container);
+ key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF));
+}
+
+static void
+__skb_flow_dissect_ipv4(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, const struct iphdr *iph)
+{
+ struct flow_dissector_key_ip *key_ip;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP))
+ return;
+
+ key_ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IP,
+ target_container);
+ key_ip->tos = iph->tos;
+ key_ip->ttl = iph->ttl;
+}
+
+static void
+__skb_flow_dissect_ipv6(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, const struct ipv6hdr *iph)
+{
+ struct flow_dissector_key_ip *key_ip;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP))
+ return;
+
+ key_ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IP,
+ target_container);
+ key_ip->tos = ipv6_get_dsfield(iph);
+ key_ip->ttl = iph->hop_limit;
+}
+
/**
* __skb_flow_dissect - extract the flow_keys struct and return it
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -444,6 +503,9 @@ ip:
}
}
+ __skb_flow_dissect_ipv4(skb, flow_dissector,
+ target_container, data, iph);
+
if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
goto out_good;
@@ -489,6 +551,9 @@ ipv6:
goto out_good;
}
+ __skb_flow_dissect_ipv6(skb, flow_dissector,
+ target_container, data, iph);
+
if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
goto out_good;
@@ -683,6 +748,10 @@ ip_proto_again:
case IPPROTO_MPLS:
proto = htons(ETH_P_MPLS_UC);
goto mpls;
+ case IPPROTO_TCP:
+ __skb_flow_dissect_tcp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
default:
break;
}
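
Illustrative aside, not part of the patch: __skb_flow_dissect_tcp() above reads the 16-bit word at TCP header offset 12 and masks it with htons(0x0FFF), dropping the data-offset nibble and keeping the 12 bits that hold the reserved and flag bits. A self-contained sketch of the same masking:

/* Illustrative only. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Bytes 12-13 of a TCP header: 4-bit data offset, 3 reserved
	 * bits, then the 9 flag bits. Example: data offset 5, SYN|ACK. */
	uint8_t hdr_word[2] = { 0x50, 0x12 };
	uint16_t raw;

	memcpy(&raw, hdr_word, sizeof(raw));

	/* same mask as the dissector: keep only the low 12 bits */
	uint16_t flags = ntohs(raw) & 0x0FFF;

	printf("flag bits: 0x%03x\n", (unsigned)flags);	/* 0x012 */
	return 0;
}
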
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index b3bc0a31af9f..1307731ddfe4 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -240,7 +240,8 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
static int bpf_build_state(struct nlattr *nla,
unsigned int family, const void *cfg,
- struct lwtunnel_state **ts)
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
{
struct nlattr *tb[LWT_BPF_MAX + 1];
struct lwtunnel_state *newts;
@@ -250,7 +251,7 @@ static int bpf_build_state(struct nlattr *nla,
if (family != AF_INET && family != AF_INET6)
return -EAFNOSUPPORT;
- ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, NULL);
+ ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
if (ret < 0)
return ret;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index cfae3d5fe11f..d9cb3532f1dd 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -103,37 +103,53 @@ EXPORT_SYMBOL(lwtunnel_encap_del_ops);
int lwtunnel_build_state(u16 encap_type,
struct nlattr *encap, unsigned int family,
- const void *cfg, struct lwtunnel_state **lws)
+ const void *cfg, struct lwtunnel_state **lws,
+ struct netlink_ext_ack *extack)
{
const struct lwtunnel_encap_ops *ops;
+ bool found = false;
int ret = -EINVAL;
if (encap_type == LWTUNNEL_ENCAP_NONE ||
- encap_type > LWTUNNEL_ENCAP_MAX)
+ encap_type > LWTUNNEL_ENCAP_MAX) {
+ NL_SET_ERR_MSG_ATTR(extack, encap,
+ "Unknown LWT encapsulation type");
return ret;
+ }
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
- ret = ops->build_state(encap, family, cfg, lws);
+ found = true;
+ ret = ops->build_state(encap, family, cfg, lws, extack);
if (ret)
module_put(ops->owner);
}
rcu_read_unlock();
+ /* don't rely on -EOPNOTSUPP to detect match as build_state
+ * handlers could return it
+ */
+ if (!found) {
+ NL_SET_ERR_MSG_ATTR(extack, encap,
+ "LWT encapsulation type not supported");
+ }
+
return ret;
}
EXPORT_SYMBOL(lwtunnel_build_state);
-int lwtunnel_valid_encap_type(u16 encap_type)
+int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
{
const struct lwtunnel_encap_ops *ops;
int ret = -EINVAL;
if (encap_type == LWTUNNEL_ENCAP_NONE ||
- encap_type > LWTUNNEL_ENCAP_MAX)
+ encap_type > LWTUNNEL_ENCAP_MAX) {
+ NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type");
return ret;
+ }
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
@@ -153,11 +169,16 @@ int lwtunnel_valid_encap_type(u16 encap_type)
}
}
#endif
- return ops ? 0 : -EOPNOTSUPP;
+ ret = ops ? 0 : -EOPNOTSUPP;
+ if (ret < 0)
+ NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported");
+
+ return ret;
}
EXPORT_SYMBOL(lwtunnel_valid_encap_type);
-int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
+ struct netlink_ext_ack *extack)
{
struct rtnexthop *rtnh = (struct rtnexthop *)attr;
struct nlattr *nla_entype;
@@ -174,7 +195,8 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)
if (nla_entype) {
encap_type = nla_get_u16(nla_entype);
- if (lwtunnel_valid_encap_type(encap_type) != 0)
+ if (lwtunnel_valid_encap_type(encap_type,
+ extack) != 0)
return -EOPNOTSUPP;
}
}
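
Illustrative aside, not part of the patch: the lwt_bpf.c and lwtunnel.c changes mostly thread a struct netlink_ext_ack through build_state() so failures reach user space with a text reason instead of a bare errno. A stripped-down model of that pattern; struct ext_ack and SET_ERR_MSG are invented stand-ins for the real netlink types, which carry more state (bad attribute pointer, cookie).

/* Illustrative only. */
#include <stdio.h>

struct ext_ack {
	const char *msg;
};

#define SET_ERR_MSG(ack, message)		\
	do {					\
		if (ack)			\
			(ack)->msg = (message);	\
	} while (0)

static int build_state(int encap_type, struct ext_ack *ack)
{
	if (encap_type <= 0) {
		SET_ERR_MSG(ack, "Unknown LWT encapsulation type");
		return -1;
	}
	return 0;
}

int main(void)
{
	struct ext_ack ack = { .msg = NULL };

	if (build_state(0, &ack) < 0)
		fprintf(stderr, "error: %s\n", ack.msg);
	return 0;
}
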
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 58b0bcc125b5..e31fc11a8000 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -118,6 +118,50 @@ unsigned long neigh_rand_reach_time(unsigned long base)
EXPORT_SYMBOL(neigh_rand_reach_time);
+static bool neigh_del(struct neighbour *n, __u8 state,
+ struct neighbour __rcu **np, struct neigh_table *tbl)
+{
+ bool retval = false;
+
+ write_lock(&n->lock);
+ if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) {
+ struct neighbour *neigh;
+
+ neigh = rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock));
+ rcu_assign_pointer(*np, neigh);
+ n->dead = 1;
+ retval = true;
+ }
+ write_unlock(&n->lock);
+ if (retval)
+ neigh_cleanup_and_release(n);
+ return retval;
+}
+
+bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
+{
+ struct neigh_hash_table *nht;
+ void *pkey = ndel->primary_key;
+ u32 hash_val;
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
+ hash_val = hash_val >> (32 - nht->hash_shift);
+
+ np = &nht->hash_buckets[hash_val];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock)))) {
+ if (n == ndel)
+ return neigh_del(n, 0, np, tbl);
+ np = &n->next;
+ }
+ return false;
+}
+
static int neigh_forced_gc(struct neigh_table *tbl)
{
int shrunk = 0;
@@ -140,19 +184,10 @@ static int neigh_forced_gc(struct neigh_table *tbl)
* - nobody refers to it.
* - it is not permanent
*/
- write_lock(&n->lock);
- if (atomic_read(&n->refcnt) == 1 &&
- !(n->nud_state & NUD_PERMANENT)) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
- n->dead = 1;
- shrunk = 1;
- write_unlock(&n->lock);
- neigh_cleanup_and_release(n);
+ if (neigh_del(n, NUD_PERMANENT, np, tbl)) {
+ shrunk = 1;
continue;
}
- write_unlock(&n->lock);
np = &n->next;
}
}
@@ -219,7 +254,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
neigh_del_timer(n);
n->dead = 1;
- if (atomic_read(&n->refcnt) != 1) {
+ if (refcount_read(&n->refcnt) != 1) {
/* The most unpleasant situation.
We must destroy neighbour entry,
but someone still uses it.
@@ -300,7 +335,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
- atomic_set(&n->refcnt, 1);
+ refcount_set(&n->refcnt, 1);
n->dead = 1;
out:
return n;
@@ -409,7 +444,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
rcu_read_lock_bh();
n = __neigh_lookup_noref(tbl, pkey, dev);
if (n) {
- if (!atomic_inc_not_zero(&n->refcnt))
+ if (!refcount_inc_not_zero(&n->refcnt))
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
}
@@ -438,7 +473,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
n = rcu_dereference_bh(n->next)) {
if (!memcmp(n->primary_key, pkey, key_len) &&
net_eq(dev_net(n->dev), net)) {
- if (!atomic_inc_not_zero(&n->refcnt))
+ if (!refcount_inc_not_zero(&n->refcnt))
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
@@ -674,7 +709,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms);
static inline void neigh_parms_put(struct neigh_parms *parms)
{
- if (atomic_dec_and_test(&parms->refcnt))
+ if (refcount_dec_and_test(&parms->refcnt))
neigh_parms_destroy(parms);
}
@@ -786,7 +821,7 @@ static void neigh_periodic_work(struct work_struct *work)
if (time_before(n->used, n->confirmed))
n->used = n->confirmed;
- if (atomic_read(&n->refcnt) == 1 &&
+ if (refcount_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next;
@@ -1132,10 +1167,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
lladdr = neigh->ha;
}
- if (new & NUD_CONNECTED)
- neigh->confirmed = jiffies;
- neigh->updated = jiffies;
-
/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
*/
@@ -1157,6 +1188,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
}
}
+ /* Update timestamps only once we know we will make a change to the
+ * neighbour entry. Otherwise we risk moving the locktime window with
+ * noop updates and ignoring relevant ARP updates.
+ */
+ if (new != old || lladdr != neigh->ha) {
+ if (new & NUD_CONNECTED)
+ neigh->confirmed = jiffies;
+ neigh->updated = jiffies;
+ }
+
if (new != old) {
neigh_del_timer(neigh);
if (new & NUD_PROBE)
@@ -1438,7 +1479,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
if (p) {
p->tbl = tbl;
- atomic_set(&p->refcnt, 1);
+ refcount_set(&p->refcnt, 1);
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
dev_hold(dev);
@@ -1501,7 +1542,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
INIT_LIST_HEAD(&tbl->parms_list);
list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
- atomic_set(&tbl->parms.refcnt, 1);
+ refcount_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
@@ -1643,7 +1684,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN,
NETLINK_CB(skb).portid);
+ write_lock_bh(&tbl->lock);
neigh_release(neigh);
+ neigh_remove_one(neigh, tbl);
+ write_unlock_bh(&tbl->lock);
out:
return err;
@@ -1752,7 +1796,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
if ((parms->dev &&
nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
- nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) ||
+ nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
/* approximative value for deprecated QUEUE_LEN (in packets) */
@@ -2190,7 +2234,7 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
- ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
+ ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;
read_unlock_bh(&neigh->lock);
if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
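
Illustrative aside, not part of the patch: the neighbour.c conversion from atomic_t to refcount_t keeps the code shape but tightens the semantics: refcount_inc_not_zero() will not resurrect an entry whose count has already hit zero, and refcount_t saturates instead of wrapping on overflow. A rough user-space model of the "increment unless zero" primitive using C11 atomics:

/* Illustrative only. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_not_zero(atomic_uint *ref)
{
	unsigned int old = atomic_load(ref);

	while (old != 0) {
		/* only bump the count if it is still non-zero */
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;
	}
	return false;	/* object already dead: lookup must fail */
}

int main(void)
{
	atomic_uint live = 1, dead = 0;

	printf("live: %d\n", inc_not_zero(&live));	/* 1: ref taken   */
	printf("dead: %d\n", inc_not_zero(&dead));	/* 0: not revived */
	return 0;
}
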
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 14d09345f00d..4847964931df 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -363,15 +363,10 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
netif_addr_lock_bh(dev);
netdev_for_each_mc_addr(ha, dev) {
- int i;
-
- seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
- dev->name, ha->refcount, ha->global_use);
-
- for (i = 0; i < dev->addr_len; i++)
- seq_printf(seq, "%02x", ha->addr[i]);
-
- seq_putc(seq, '\n');
+ seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n",
+ dev->ifindex, dev->name,
+ ha->refcount, ha->global_use,
+ (int)dev->addr_len, ha->addr);
}
netif_addr_unlock_bh(dev);
return 0;
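
Illustrative aside, not part of the patch: the dev_mc_seq_show() cleanup relies on the kernel's "%*phN" printk extension, which dumps a byte buffer as contiguous hex digits, exactly what the removed per-byte loop did by hand. Standard C has no such conversion, so a user-space equivalent still loops:

/* Illustrative only. */
#include <stdio.h>

static void print_hex(const unsigned char *buf, int len)
{
	for (int i = 0; i < len; i++)
		printf("%02x", buf[i]);
	putchar('\n');
}

int main(void)
{
	unsigned char mac[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb };

	print_hex(mac, sizeof(mac));	/* 01005e0000fb */
	return 0;
}
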
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 65ea0ff4017c..b4f9922b6f23 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -323,7 +323,11 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);
static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
- int res, orig_len = dev->tx_queue_len;
+ unsigned int orig_len = dev->tx_queue_len;
+ int res;
+
+ if (new_len != (unsigned int)new_len)
+ return -ERANGE;
if (new_len != orig_len) {
dev->tx_queue_len = new_len;
@@ -349,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev,
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
-NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
+NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
@@ -622,7 +626,7 @@ static struct attribute *netstat_attrs[] = {
};
-static struct attribute_group netstat_group = {
+static const struct attribute_group netstat_group = {
.name = "statistics",
.attrs = netstat_attrs,
};
@@ -632,7 +636,7 @@ static struct attribute *wireless_attrs[] = {
NULL
};
-static struct attribute_group wireless_group = {
+static const struct attribute_group wireless_group = {
.name = "wireless",
.attrs = wireless_attrs,
};
@@ -1200,7 +1204,7 @@ static struct attribute *dql_attrs[] = {
NULL
};
-static struct attribute_group dql_group = {
+static const struct attribute_group dql_group = {
.name = "byte_queue_limits",
.attrs = dql_attrs,
};
@@ -1444,7 +1448,7 @@ static void *net_grab_current_ns(void)
struct net *ns = current->nsproxy->net_ns;
#ifdef CONFIG_NET_NS
if (ns)
- atomic_inc(&ns->passive);
+ refcount_inc(&ns->passive);
#endif
return ns;
}
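
Illustrative aside, not part of the patch: change_tx_queue_len() now keeps the length in an unsigned int, so the unsigned long value coming from sysfs is rejected with -ERANGE when it does not survive the narrowing cast. The check is a simple round-trip comparison:

/* Illustrative only. */
#include <stdio.h>

static int fits_in_uint(unsigned long v)
{
	return v == (unsigned int)v;
}

int main(void)
{
	printf("%d\n", fits_in_uint(1000UL));		/* 1 */
	printf("%d\n", fits_in_uint(0x100000000UL));	/* 0 on 64-bit */
	return 0;
}
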
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1934efd4a9d4..8726d051f31d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -284,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
LIST_HEAD(net_exit_list);
atomic_set(&net->count, 1);
- atomic_set(&net->passive, 1);
+ refcount_set(&net->passive, 1);
net->dev_base_seq = 1;
net->user_ns = user_ns;
idr_init(&net->netns_ids);
@@ -315,6 +315,25 @@ out_undo:
goto out;
}
+static int __net_init net_defaults_init_net(struct net *net)
+{
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ return 0;
+}
+
+static struct pernet_operations net_defaults_ops = {
+ .init = net_defaults_init_net,
+};
+
+static __init int net_defaults_init(void)
+{
+ if (register_pernet_subsys(&net_defaults_ops))
+ panic("Cannot initialize net default settings");
+
+ return 0;
+}
+
+core_initcall(net_defaults_init);
#ifdef CONFIG_NET_NS
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
@@ -361,7 +380,7 @@ static void net_free(struct net *net)
void net_drop_ns(void *p)
{
struct net *ns = p;
- if (ns && atomic_dec_and_test(&ns->passive))
+ if (ns && refcount_dec_and_test(&ns->passive))
net_free(ns);
}
@@ -482,6 +501,23 @@ static void cleanup_net(struct work_struct *work)
net_drop_ns(net);
}
}
+
+/**
+ * net_ns_barrier - wait until concurrent net_cleanup_work is done
+ *
+ * cleanup_net runs from work queue and will first remove namespaces
+ * from the global list, then run net exit functions.
+ *
+ * Call this in module exit path to make sure that all netns
+ * ->exit ops have been invoked before the function is removed.
+ */
+void net_ns_barrier(void)
+{
+ mutex_lock(&net_mutex);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL(net_ns_barrier);
+
static DECLARE_WORK(net_cleanup_work, cleanup_net);
void __put_net(struct net *net)
@@ -577,6 +613,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
+ struct nlattr *nla;
struct net *peer;
int nsid, err;
@@ -584,23 +621,35 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
rtnl_net_policy, extack);
if (err < 0)
return err;
- if (!tb[NETNSA_NSID])
+ if (!tb[NETNSA_NSID]) {
+ NL_SET_ERR_MSG(extack, "nsid is missing");
return -EINVAL;
+ }
nsid = nla_get_s32(tb[NETNSA_NSID]);
- if (tb[NETNSA_PID])
+ if (tb[NETNSA_PID]) {
peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
- else if (tb[NETNSA_FD])
+ nla = tb[NETNSA_PID];
+ } else if (tb[NETNSA_FD]) {
peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
- else
+ nla = tb[NETNSA_FD];
+ } else {
+ NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
return -EINVAL;
- if (IS_ERR(peer))
+ }
+ if (IS_ERR(peer)) {
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
return PTR_ERR(peer);
+ }
spin_lock_bh(&net->nsid_lock);
if (__peernet2id(net, peer) >= 0) {
spin_unlock_bh(&net->nsid_lock);
err = -EEXIST;
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack,
+ "Peer netns already has a nsid assigned");
goto out;
}
@@ -609,6 +658,10 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err >= 0) {
rtnl_net_notifyid(net, RTM_NEWNSID, err);
err = 0;
+ } else if (err == -ENOSPC && nsid >= 0) {
+ err = -EEXIST;
+ NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
+ NL_SET_ERR_MSG(extack, "The specified nsid is already used");
}
out:
put_net(peer);
@@ -651,6 +704,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
+ struct nlattr *nla;
struct sk_buff *msg;
struct net *peer;
int err, id;
@@ -659,15 +713,22 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
rtnl_net_policy, extack);
if (err < 0)
return err;
- if (tb[NETNSA_PID])
+ if (tb[NETNSA_PID]) {
peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
- else if (tb[NETNSA_FD])
+ nla = tb[NETNSA_PID];
+ } else if (tb[NETNSA_FD]) {
peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
- else
+ nla = tb[NETNSA_FD];
+ } else {
+ NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
return -EINVAL;
+ }
- if (IS_ERR(peer))
+ if (IS_ERR(peer)) {
+ NL_SET_BAD_ATTR(extack, nla);
+ NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
return PTR_ERR(peer);
+ }
msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
if (!msg) {
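
Illustrative aside, not part of the patch: net_ns_barrier() is nothing more than taking and releasing net_mutex, the mutex cleanup_net() holds while it runs, so returning from it means any namespace cleanup that had already started has completed. The same lock/unlock-as-barrier trick, sketched with pthreads:

/* Illustrative only. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t work_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&work_mutex);
	puts("worker: cleaning up");
	pthread_mutex_unlock(&work_mutex);
	return NULL;
}

static void barrier(void)
{
	/* blocks until a concurrently running worker drops the lock */
	pthread_mutex_lock(&work_mutex);
	pthread_mutex_unlock(&work_mutex);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	barrier();
	puts("barrier passed: no cleanup still in flight");
	pthread_join(&t, NULL);
	return 0;
}
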
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 29be2466970c..d3408a693166 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -277,7 +277,7 @@ static void zap_completion_queue(void)
struct sk_buff *skb = clist;
clist = clist->next;
if (!skb_irq_freeable(skb)) {
- atomic_inc(&skb->users);
+ refcount_inc(&skb->users);
dev_kfree_skb_any(skb); /* put this one back */
} else {
__kfree_skb(skb);
@@ -309,7 +309,7 @@ repeat:
return NULL;
}
- atomic_set(&skb->users, 1);
+ refcount_set(&skb->users, 1);
skb_reserve(skb, reserve);
return skb;
}
@@ -441,7 +441,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
ip6h->saddr = np->local_ip.in6;
ip6h->daddr = np->remote_ip.in6;
- eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ eth = skb_push(skb, ETH_HLEN);
skb_reset_mac_header(skb);
skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
} else {
@@ -470,7 +470,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
put_unaligned(np->remote_ip.ip, &(iph->daddr));
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
- eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ eth = skb_push(skb, ETH_HLEN);
skb_reset_mac_header(skb);
skb->protocol = eth->h_proto = htons(ETH_P_IP);
}
@@ -632,7 +632,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
skb_queue_head_init(&npinfo->txq);
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
- atomic_set(&npinfo->refcnt, 1);
+ refcount_set(&npinfo->refcnt, 1);
ops = np->dev->netdev_ops;
if (ops->ndo_netpoll_setup) {
@@ -642,7 +642,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
}
} else {
npinfo = rtnl_dereference(ndev->npinfo);
- atomic_inc(&npinfo->refcnt);
+ refcount_inc(&npinfo->refcnt);
}
npinfo->netpoll = np;
@@ -821,7 +821,7 @@ void __netpoll_cleanup(struct netpoll *np)
synchronize_srcu(&netpoll_srcu);
- if (atomic_dec_and_test(&npinfo->refcnt)) {
+ if (refcount_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
ops = np->dev->netdev_ops;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 96947f5d41e4..6e1e10ff433a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2675,7 +2675,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
goto err;
}
/* restore ll */
- eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+ eth = skb_push(skb, ETH_HLEN);
memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN);
eth->h_proto = protocol;
@@ -2714,11 +2714,11 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
struct timeval timestamp;
struct pktgen_hdr *pgh;
- pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+ pgh = skb_put(skb, sizeof(*pgh));
datalen -= sizeof(*pgh);
if (pkt_dev->nfrags <= 0) {
- memset(skb_put(skb, datalen), 0, datalen);
+ skb_put_zero(skb, datalen);
} else {
int frags = pkt_dev->nfrags;
int i, len;
@@ -2729,7 +2729,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
frags = MAX_SKB_FRAGS;
len = datalen - frags * PAGE_SIZE;
if (len > 0) {
- memset(skb_put(skb, len), 0, len);
+ skb_put_zero(skb, len);
datalen = frags * PAGE_SIZE;
}
@@ -2844,34 +2844,35 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
- eth = (__u8 *) skb_push(skb, 14);
- mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+ eth = skb_push(skb, 14);
+ mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));
if (pkt_dev->nr_labels)
mpls_push(mpls, pkt_dev);
if (pkt_dev->vlan_id != 0xffff) {
if (pkt_dev->svlan_id != 0xffff) {
- svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ svlan_tci = skb_put(skb, sizeof(__be16));
*svlan_tci = build_tci(pkt_dev->svlan_id,
pkt_dev->svlan_cfi,
pkt_dev->svlan_p);
- svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ svlan_encapsulated_proto = skb_put(skb,
+ sizeof(__be16));
*svlan_encapsulated_proto = htons(ETH_P_8021Q);
}
- vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ vlan_tci = skb_put(skb, sizeof(__be16));
*vlan_tci = build_tci(pkt_dev->vlan_id,
pkt_dev->vlan_cfi,
pkt_dev->vlan_p);
- vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));
*vlan_encapsulated_proto = htons(ETH_P_IP);
}
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->len);
- iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
+ iph = skb_put(skb, sizeof(struct iphdr));
skb_set_transport_header(skb, skb->len);
- udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
+ udph = skb_put(skb, sizeof(struct udphdr));
skb_set_queue_mapping(skb, queue_map);
skb->priority = pkt_dev->skb_priority;
@@ -2971,34 +2972,35 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
- eth = (__u8 *) skb_push(skb, 14);
- mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+ eth = skb_push(skb, 14);
+ mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));
if (pkt_dev->nr_labels)
mpls_push(mpls, pkt_dev);
if (pkt_dev->vlan_id != 0xffff) {
if (pkt_dev->svlan_id != 0xffff) {
- svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ svlan_tci = skb_put(skb, sizeof(__be16));
*svlan_tci = build_tci(pkt_dev->svlan_id,
pkt_dev->svlan_cfi,
pkt_dev->svlan_p);
- svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ svlan_encapsulated_proto = skb_put(skb,
+ sizeof(__be16));
*svlan_encapsulated_proto = htons(ETH_P_8021Q);
}
- vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ vlan_tci = skb_put(skb, sizeof(__be16));
*vlan_tci = build_tci(pkt_dev->vlan_id,
pkt_dev->vlan_cfi,
pkt_dev->vlan_p);
- vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));
*vlan_encapsulated_proto = htons(ETH_P_IPV6);
}
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->len);
- iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
+ iph = skb_put(skb, sizeof(struct ipv6hdr));
skb_set_transport_header(skb, skb->len);
- udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
+ udph = skb_put(skb, sizeof(struct udphdr));
skb_set_queue_mapping(skb, queue_map);
skb->priority = pkt_dev->skb_priority;
@@ -3361,7 +3363,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
{
ktime_t idle_start = ktime_get();
- while (atomic_read(&(pkt_dev->skb->users)) != 1) {
+ while (refcount_read(&(pkt_dev->skb->users)) != 1) {
if (signal_pending(current))
break;
@@ -3418,7 +3420,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
skb = pkt_dev->skb;
skb->protocol = eth_type_trans(skb, skb->dev);
- atomic_add(burst, &skb->users);
+ refcount_add(burst, &skb->users);
local_bh_disable();
do {
ret = netif_receive_skb(skb);
@@ -3426,11 +3428,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->errors++;
pkt_dev->sofar++;
pkt_dev->seq_num++;
- if (atomic_read(&skb->users) != burst) {
+ if (refcount_read(&skb->users) != burst) {
/* skb was queued by rps/rfs or taps,
* so cannot reuse this skb
*/
- atomic_sub(burst - 1, &skb->users);
+ WARN_ON(refcount_sub_and_test(burst - 1, &skb->users));
/* get out of the loop and wait
* until skb is consumed
*/
@@ -3444,7 +3446,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
local_bh_disable();
- atomic_inc(&pkt_dev->skb->users);
+ refcount_inc(&pkt_dev->skb->users);
ret = dev_queue_xmit(pkt_dev->skb);
switch (ret) {
@@ -3485,7 +3487,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
goto unlock;
}
- atomic_add(burst, &pkt_dev->skb->users);
+ refcount_add(burst, &pkt_dev->skb->users);
xmit_more:
ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
@@ -3511,11 +3513,11 @@ xmit_more:
/* fallthru */
case NETDEV_TX_BUSY:
/* Retry it next time */
- atomic_dec(&(pkt_dev->skb->users));
+ refcount_dec(&(pkt_dev->skb->users));
pkt_dev->last_ok = 0;
}
if (unlikely(burst))
- atomic_sub(burst, &pkt_dev->skb->users);
+ WARN_ON(refcount_sub_and_test(burst, &pkt_dev->skb->users));
unlock:
HARD_TX_UNLOCK(odev, txq);
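
Illustrative aside, not part of the patch: the pktgen churn is mechanical. skb_put() and skb_push() now return void *, so the explicit casts disappear, and the memset(skb_put(skb, len), 0, len) pattern collapses into skb_put_zero(), which is presumably just that pattern wrapped in a helper. A toy model over a flat buffer (struct buf, buf_put, buf_put_zero are invented names):

/* Illustrative only. */
#include <stdio.h>
#include <string.h>

struct buf {
	unsigned char data[256];
	size_t len;
};

/* returns void *, so the result assigns to any pointer type untouched */
static void *buf_put(struct buf *b, size_t len)
{
	void *p = b->data + b->len;

	b->len += len;
	return p;
}

static void *buf_put_zero(struct buf *b, size_t len)
{
	return memset(buf_put(b, len), 0, len);
}

int main(void)
{
	struct buf b = { .len = 0 };
	unsigned short *proto = buf_put(&b, sizeof(*proto));	/* no cast */

	*proto = 0x0800;
	buf_put_zero(&b, 16);
	printf("used %zu bytes\n", b.len);
	return 0;
}
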
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bcb0f610ee42..d1ba90980be1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -16,6 +16,7 @@
* Vitaly E. Lavrov RTA_OK arithmetics was wrong.
*/
+#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -39,6 +40,7 @@
#include <linux/if_vlan.h>
#include <linux/pci.h>
#include <linux/etherdevice.h>
+#include <linux/bpf.h>
#include <linux/uaccess.h>
@@ -647,7 +649,7 @@ int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int g
NETLINK_CB(skb).dst_group = group;
if (echo)
- atomic_inc(&skb->users);
+ refcount_inc(&skb->users);
netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
if (echo)
err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
@@ -900,7 +902,7 @@ static size_t rtnl_xdp_size(void)
{
size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
nla_total_size(1) + /* XDP_ATTACHED */
- nla_total_size(4); /* XDP_FLAGS */
+ nla_total_size(4); /* XDP_PROG_ID */
return xdp_size;
}
@@ -932,6 +934,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(4) /* IFLA_LINK_NETNSID */
+ + nla_total_size(4) /* IFLA_GROUP */
+ nla_total_size(ext_filter_mask
& RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
@@ -942,6 +945,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+ rtnl_xdp_size() /* IFLA_XDP */
+ + nla_total_size(4) /* IFLA_EVENT */
+ nla_total_size(1); /* IFLA_PROTO_DOWN */
}
@@ -1125,6 +1129,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_mac vf_mac;
struct ifla_vf_info ivi;
+ memset(&ivi, 0, sizeof(ivi));
+
/* Not all SR-IOV capable drivers support the
* spoofcheck and "RSS query enable" query. Preset to
* -1 so the user space tool can detect that the driver
@@ -1133,7 +1139,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
ivi.spoofchk = -1;
ivi.rss_query_en = -1;
ivi.trusted = -1;
- memset(ivi.mac, 0, sizeof(ivi.mac));
/* The default value for VF link state is "auto"
* IFLA_VF_LINK_STATE_AUTO which equals zero
*/
@@ -1247,37 +1252,46 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ const struct bpf_prog *generic_xdp_prog;
+
+ ASSERT_RTNL();
+
+ *prog_id = 0;
+ generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
+ if (generic_xdp_prog) {
+ *prog_id = generic_xdp_prog->aux->id;
+ return XDP_ATTACHED_SKB;
+ }
+ if (!ops->ndo_xdp)
+ return XDP_ATTACHED_NONE;
+
+ return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id);
+}
+
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
{
struct nlattr *xdp;
- u32 xdp_flags = 0;
- u8 val = 0;
+ u32 prog_id;
int err;
xdp = nla_nest_start(skb, IFLA_XDP);
if (!xdp)
return -EMSGSIZE;
- if (rcu_access_pointer(dev->xdp_prog)) {
- xdp_flags = XDP_FLAGS_SKB_MODE;
- val = 1;
- } else if (dev->netdev_ops->ndo_xdp) {
- struct netdev_xdp xdp_op = {};
-
- xdp_op.command = XDP_QUERY_PROG;
- err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
- if (err)
- goto err_cancel;
- val = xdp_op.prog_attached;
- }
- err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val);
+
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
+ rtnl_xdp_attached_mode(dev, &prog_id));
if (err)
goto err_cancel;
- if (xdp_flags) {
- err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags);
+ if (prog_id) {
+ err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
if (err)
goto err_cancel;
}
+
nla_nest_end(skb, xdp);
return 0;
@@ -1286,9 +1300,40 @@ err_cancel:
return err;
}
+static u32 rtnl_get_event(unsigned long event)
+{
+ u32 rtnl_event_type = IFLA_EVENT_NONE;
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ rtnl_event_type = IFLA_EVENT_REBOOT;
+ break;
+ case NETDEV_FEAT_CHANGE:
+ rtnl_event_type = IFLA_EVENT_FEATURES;
+ break;
+ case NETDEV_BONDING_FAILOVER:
+ rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER;
+ break;
+ case NETDEV_NOTIFY_PEERS:
+ rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS;
+ break;
+ case NETDEV_RESEND_IGMP:
+ rtnl_event_type = IFLA_EVENT_IGMP_RESEND;
+ break;
+ case NETDEV_CHANGEINFODATA:
+ rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS;
+ break;
+ default:
+ break;
+ }
+
+ return rtnl_event_type;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
- unsigned int flags, u32 ext_filter_mask)
+ unsigned int flags, u32 ext_filter_mask,
+ u32 event)
{
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
@@ -1337,6 +1382,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
goto nla_put_failure;
+ if (event != IFLA_EVENT_NONE) {
+ if (nla_put_u32(skb, IFLA_EVENT, event))
+ goto nla_put_failure;
+ }
+
if (rtnl_fill_link_ifmap(skb, dev))
goto nla_put_failure;
@@ -1471,6 +1521,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
[IFLA_PROTO_DOWN] = { .type = NLA_U8 },
[IFLA_XDP] = { .type = NLA_NESTED },
+ [IFLA_EVENT] = { .type = NLA_U32 },
+ [IFLA_GROUP] = { .type = NLA_U32 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1518,6 +1570,7 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
[IFLA_XDP_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
[IFLA_XDP_FLAGS] = { .type = NLA_U32 },
+ [IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
};
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -1630,14 +1683,14 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0,
flags,
- ext_filter_mask);
- /* If we ran out of room on the first message,
- * we're in trouble
- */
- WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+ ext_filter_mask, 0);
- if (err < 0)
- goto out;
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+
+ goto out_err;
+ }
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
@@ -1645,10 +1698,12 @@ cont:
}
}
out:
+ err = skb->len;
+out_err:
cb->args[1] = idx;
cb->args[0] = h;
- return skb->len;
+ return err;
}
int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
@@ -2050,8 +2105,8 @@ static int do_setlink(const struct sk_buff *skb,
}
if (tb[IFLA_TXQLEN]) {
- unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
- unsigned long orig_len = dev->tx_queue_len;
+ unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
+ unsigned int orig_len = dev->tx_queue_len;
if (dev->tx_queue_len ^ value) {
dev->tx_queue_len = value;
@@ -2188,7 +2243,7 @@ static int do_setlink(const struct sk_buff *skb,
if (err < 0)
goto errout;
- if (xdp[IFLA_XDP_ATTACHED]) {
+ if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) {
err = -EINVAL;
goto errout;
}
@@ -2199,6 +2254,10 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+ if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) {
+ err = -EINVAL;
+ goto errout;
+ }
}
if (xdp[IFLA_XDP_FD]) {
@@ -2523,7 +2582,7 @@ replay:
data = attr;
}
if (ops->validate) {
- err = ops->validate(tb, data);
+ err = ops->validate(tb, data, extack);
if (err < 0)
return err;
}
@@ -2542,7 +2601,8 @@ replay:
slave_data = slave_attr;
}
if (m_ops->slave_validate) {
- err = m_ops->slave_validate(tb, slave_data);
+ err = m_ops->slave_validate(tb, slave_data,
+ extack);
if (err < 0)
return err;
}
@@ -2561,7 +2621,7 @@ replay:
!ops->changelink)
return -EOPNOTSUPP;
- err = ops->changelink(dev, tb, data);
+ err = ops->changelink(dev, tb, data, extack);
if (err < 0)
return err;
status |= DO_SETLINK_NOTIFY;
@@ -2572,7 +2632,8 @@ replay:
return -EOPNOTSUPP;
err = m_ops->slave_changelink(master_dev, dev,
- tb, slave_data);
+ tb, slave_data,
+ extack);
if (err < 0)
return err;
status |= DO_SETLINK_NOTIFY;
@@ -2646,7 +2707,8 @@ replay:
dev->ifindex = ifm->ifi_index;
if (ops->newlink) {
- err = ops->newlink(link_net ? : net, dev, tb, data);
+ err = ops->newlink(link_net ? : net, dev, tb, data,
+ extack);
/* Drivers should call free_netdev() in ->destructor
* and unregister it on failure after registration
* so that device could be finally freed in rtnl_unlock.
@@ -2733,7 +2795,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
return -ENOBUFS;
err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, 0, 0, ext_filter_mask);
+ nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size */
WARN_ON(err == -EMSGSIZE);
@@ -2805,7 +2867,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
}
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
- unsigned int change, gfp_t flags)
+ unsigned int change,
+ u32 event, gfp_t flags)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
@@ -2816,7 +2879,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
if (skb == NULL)
goto errout;
- err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
+ err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -2837,18 +2900,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
}
-void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
- gfp_t flags)
+static void rtmsg_ifinfo_event(int type, struct net_device *dev,
+ unsigned int change, u32 event,
+ gfp_t flags)
{
struct sk_buff *skb;
if (dev->reg_state != NETREG_REGISTERED)
return;
- skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
+ skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags);
if (skb)
rtmsg_ifinfo_send(skb, dev, flags);
}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags)
+{
+ rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags);
+}
EXPORT_SYMBOL(rtmsg_ifinfo);
static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
@@ -3228,8 +3298,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
int err = 0;
int fidx = 0;
- if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
- IFLA_MAX, ifla_policy, NULL) == 0) {
+ err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, NULL);
+ if (err < 0) {
+ return -EINVAL;
+ } else if (err == 0) {
if (tb[IFLA_MASTER])
br_idx = nla_get_u32(tb[IFLA_MASTER]);
}
@@ -3452,8 +3525,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
err = br_dev->netdev_ops->ndo_bridge_getlink(
skb, portid, seq, dev,
filter_mask, NLM_F_MULTI);
- if (err < 0 && err != -EOPNOTSUPP)
- break;
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
}
idx++;
}
@@ -3464,16 +3541,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
seq, dev,
filter_mask,
NLM_F_MULTI);
- if (err < 0 && err != -EOPNOTSUPP)
- break;
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
}
idx++;
}
}
+ err = skb->len;
+out_err:
rcu_read_unlock();
cb->args[0] = idx;
- return skb->len;
+ return err;
}
static inline size_t bridge_nlmsg_size(void)
@@ -4140,6 +4223,18 @@ static void rtnetlink_rcv(struct sk_buff *skb)
rtnl_unlock();
}
+static int rtnetlink_bind(struct net *net, int group)
+{
+ switch (group) {
+ case RTNLGRP_IPV4_MROUTE_R:
+ case RTNLGRP_IPV6_MROUTE_R:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ break;
+ }
+ return 0;
+}
+
static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
@@ -4152,7 +4247,8 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
case NETDEV_NOTIFY_PEERS:
case NETDEV_RESEND_IGMP:
case NETDEV_CHANGEINFODATA:
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
+ rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
+ GFP_KERNEL);
break;
default:
break;
@@ -4173,6 +4269,7 @@ static int __net_init rtnetlink_net_init(struct net *net)
.input = rtnetlink_rcv,
.cb_mutex = &rtnl_mutex,
.flags = NL_CFG_F_NONROOT_RECV,
+ .bind = rtnetlink_bind,
};
sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
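
Illustrative aside, not part of the patch: the new do_setlink() test hweight32(xdp_flags & XDP_FLAGS_MODES) > 1 rejects IFLA_XDP_FLAGS values that request more than one XDP attach mode at once; hweight32() is a population count, so the rule is simply "at most one mode bit set". A small sketch; the flag values below mirror the uapi ones but are hard-coded assumptions here:

/* Illustrative only. */
#include <stdio.h>

#define XDP_FLAGS_SKB_MODE	(1U << 1)
#define XDP_FLAGS_DRV_MODE	(1U << 2)
#define XDP_FLAGS_MODES		(XDP_FLAGS_SKB_MODE | XDP_FLAGS_DRV_MODE)

static int xdp_flags_valid(unsigned int flags)
{
	return __builtin_popcount(flags & XDP_FLAGS_MODES) <= 1;
}

int main(void)
{
	printf("%d\n", xdp_flags_valid(XDP_FLAGS_SKB_MODE));	/* 1 */
	printf("%d\n", xdp_flags_valid(XDP_FLAGS_MODES));	/* 0 */
	return 0;
}
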
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index ae35cce3a40d..7232274de334 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -51,7 +51,8 @@ static u32 seq_scale(u32 seq)
#endif
#if IS_ENABLED(CONFIG_IPV6)
-u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
+u32 secure_tcpv6_ts_off(const struct net *net,
+ const __be32 *saddr, const __be32 *daddr)
{
const struct {
struct in6_addr saddr;
@@ -61,7 +62,7 @@ u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
.daddr = *(struct in6_addr *)daddr,
};
- if (sysctl_tcp_timestamps != 1)
+ if (net->ipv4.sysctl_tcp_timestamps != 1)
return 0;
ts_secret_init();
@@ -113,9 +114,9 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
-u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
+u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
{
- if (sysctl_tcp_timestamps != 1)
+ if (net->ipv4.sysctl_tcp_timestamps != 1)
return 0;
ts_secret_init();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 346d3e85dfbc..8b11341ed69a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -176,7 +176,7 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->head = NULL;
skb->truesize = sizeof(struct sk_buff);
- atomic_set(&skb->users, 1);
+ refcount_set(&skb->users, 1);
skb->mac_header = (typeof(skb->mac_header))~0U;
out:
@@ -247,7 +247,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
/* Account for allocated memory : skb + skb->head */
skb->truesize = SKB_TRUESIZE(size);
skb->pfmemalloc = pfmemalloc;
- atomic_set(&skb->users, 1);
+ refcount_set(&skb->users, 1);
skb->head = data;
skb->data = data;
skb_reset_tail_pointer(skb);
@@ -268,7 +268,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
skb->fclone = SKB_FCLONE_ORIG;
- atomic_set(&fclones->fclone_ref, 1);
+ refcount_set(&fclones->fclone_ref, 1);
fclones->skb2.fclone = SKB_FCLONE_CLONE;
}
@@ -314,7 +314,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
- atomic_set(&skb->users, 1);
+ refcount_set(&skb->users, 1);
skb->head = data;
skb->data = data;
skb_reset_tail_pointer(skb);
@@ -629,7 +629,7 @@ static void kfree_skbmem(struct sk_buff *skb)
* This test would have no chance to be true for the clone,
* while here, branch prediction will be good.
*/
- if (atomic_read(&fclones->fclone_ref) == 1)
+ if (refcount_read(&fclones->fclone_ref) == 1)
goto fastpath;
break;
@@ -637,18 +637,16 @@ static void kfree_skbmem(struct sk_buff *skb)
fclones = container_of(skb, struct sk_buff_fclones, skb2);
break;
}
- if (!atomic_dec_and_test(&fclones->fclone_ref))
+ if (!refcount_dec_and_test(&fclones->fclone_ref))
return;
fastpath:
kmem_cache_free(skbuff_fclone_cache, fclones);
}
-static void skb_release_head_state(struct sk_buff *skb)
+void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
-#ifdef CONFIG_XFRM
- secpath_put(skb->sp);
-#endif
+ secpath_reset(skb);
if (skb->destructor) {
WARN_ON(in_irq());
skb->destructor(skb);
@@ -694,12 +692,9 @@ EXPORT_SYMBOL(__kfree_skb);
*/
void kfree_skb(struct sk_buff *skb)
{
- if (unlikely(!skb))
- return;
- if (likely(atomic_read(&skb->users) == 1))
- smp_rmb();
- else if (likely(!atomic_dec_and_test(&skb->users)))
+ if (!skb_unref(skb))
return;
+
trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
@@ -746,17 +741,32 @@ EXPORT_SYMBOL(skb_tx_error);
*/
void consume_skb(struct sk_buff *skb)
{
- if (unlikely(!skb))
- return;
- if (likely(atomic_read(&skb->users) == 1))
- smp_rmb();
- else if (likely(!atomic_dec_and_test(&skb->users)))
+ if (!skb_unref(skb))
return;
+
trace_consume_skb(skb);
__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
+/**
+ * consume_stateless_skb - free an skbuff, assuming it is stateless
+ * @skb: buffer to free
+ *
+ * Works like consume_skb(), but this variant assumes that all the head
+ * states have already been dropped.
+ */
+void consume_stateless_skb(struct sk_buff *skb)
+{
+ if (!skb_unref(skb))
+ return;
+
+ trace_consume_skb(skb);
+ if (likely(skb->head))
+ skb_release_data(skb);
+ kfree_skbmem(skb);
+}
+
void __kfree_skb_flush(void)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -807,10 +817,9 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- if (likely(atomic_read(&skb->users) == 1))
- smp_rmb();
- else if (likely(!atomic_dec_and_test(&skb->users)))
+ if (!skb_unref(skb))
return;
+
/* if reaching here SKB is ready to free */
trace_consume_skb(skb);
@@ -906,7 +915,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
C(head_frag);
C(data);
C(truesize);
- atomic_set(&n->users, 1);
+ refcount_set(&n->users, 1);
atomic_inc(&(skb_shinfo(skb)->dataref));
skb->cloned = 1;
@@ -1018,9 +1027,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
return NULL;
if (skb->fclone == SKB_FCLONE_ORIG &&
- atomic_read(&fclones->fclone_ref) == 1) {
+ refcount_read(&fclones->fclone_ref) == 1) {
n = &fclones->skb2;
- atomic_set(&fclones->fclone_ref, 2);
+ refcount_set(&fclones->fclone_ref, 2);
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
@@ -1412,7 +1421,7 @@ EXPORT_SYMBOL(skb_pad);
* returned.
*/
-unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
+void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
if (tail != skb) {
skb->data_len += len;
@@ -1431,9 +1440,9 @@ EXPORT_SYMBOL_GPL(pskb_put);
* exceed the total buffer size the kernel will panic. A pointer to the
* first byte of the extra data is returned.
*/
-unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
+void *skb_put(struct sk_buff *skb, unsigned int len)
{
- unsigned char *tmp = skb_tail_pointer(skb);
+ void *tmp = skb_tail_pointer(skb);
SKB_LINEAR_ASSERT(skb);
skb->tail += len;
skb->len += len;
@@ -1452,7 +1461,7 @@ EXPORT_SYMBOL(skb_put);
* start. If this would exceed the total buffer headroom the kernel will
* panic. A pointer to the first byte of the extra data is returned.
*/
-unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
+void *skb_push(struct sk_buff *skb, unsigned int len)
{
skb->data -= len;
skb->len += len;
@@ -1472,7 +1481,7 @@ EXPORT_SYMBOL(skb_push);
* is returned. Once the data has been pulled future pushes will overwrite
* the old data.
*/
-unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
+void *skb_pull(struct sk_buff *skb, unsigned int len)
{
return skb_pull_inline(skb, len);
}
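
skb_put(), pskb_put(), skb_push(), skb_pull(), __pskb_pull_tail() and skb_pull_rcsum() all move from returning unsigned char * to void * in this series, so callers can assign the result straight to a typed header pointer. A minimal before/after sketch, with a made-up header struct:

#include <linux/skbuff.h>

struct examplehdr {			/* hypothetical header, not a real protocol */
	__be16 type;
	__be16 len;
};

static void example_build(struct sk_buff *skb)
{
	struct examplehdr *h;

	/* before: h = (struct examplehdr *)skb_put(skb, sizeof(*h)); */
	h = skb_put(skb, sizeof(*h));	/* void * return: the cast goes away */
	h->type = htons(0x88b5);	/* IEEE 802 local experimental ethertype */
	h->len  = htons(sizeof(*h));
}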
@@ -1607,7 +1616,7 @@ EXPORT_SYMBOL(___pskb_trim);
*
* It is pretty complicated. Luckily, it is called only in exceptional cases.
*/
-unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
+void *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
/* If skb has not enough free space at tail, get new one
* plus 128 bytes for future expansions. If we have enough
@@ -2243,6 +2252,32 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
+{
+ net_warn_ratelimited(
+ "%s: attempt to compute crc32c without libcrc32c.ko\n",
+ __func__);
+ return 0;
+}
+
+static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
+ int offset, int len)
+{
+ net_warn_ratelimited(
+ "%s: attempt to compute crc32c without libcrc32c.ko\n",
+ __func__);
+ return 0;
+}
+
+static const struct skb_checksum_ops default_crc32c_ops = {
+ .update = warn_crc32c_csum_update,
+ .combine = warn_crc32c_csum_combine,
+};
+
+const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
+ &default_crc32c_ops;
+EXPORT_SYMBOL(crc32c_csum_stub);
+
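
The crc32c_csum_stub indirection lets skb_checksum()-style helpers compute CRC32c only once a provider has installed real ops; until then the default ops above just warn and return 0. The sketch below shows how a provider could override the stub. The function and variable names are placeholders and the wiring line is an assumption, not the registration this series actually performs elsewhere; __crc32c_le() and __crc32c_le_combine() come from <linux/crc32.h>.

#include <linux/crc32.h>
#include <linux/skbuff.h>

/* Placeholder provider ops wrapping the generic CRC32c routines. */
static __wsum example_crc32c_update(const void *buff, int len, __wsum sum)
{
	return (__force __wsum)__crc32c_le((__force u32)sum, buff, len);
}

static __wsum example_crc32c_combine(__wsum csum, __wsum csum2,
				     int offset, int len)
{
	return (__force __wsum)__crc32c_le_combine((__force u32)csum,
						   (__force u32)csum2, len);
}

static const struct skb_checksum_ops example_crc32c_ops = {
	.update  = example_crc32c_update,
	.combine = example_crc32c_combine,
};

/* Assumed wiring, done from the provider's init path:
 *	crc32c_csum_stub = &example_crc32c_ops;
 */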
/**
* skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
* @from: source buffer
@@ -2620,7 +2655,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
- skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
+ skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
+ SKBTX_SHARED_FRAG;
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. */
@@ -2988,7 +3024,7 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
get_page(pfrag->page);
skb->truesize += copy;
- atomic_add(copy, &sk->sk_wmem_alloc);
+ refcount_add(copy, &sk->sk_wmem_alloc);
skb->len += copy;
skb->data_len += copy;
offset += copy;
@@ -3029,7 +3065,7 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags);
* that the checksum difference is zero (e.g., a valid IP header)
* or you are setting ip_summed to CHECKSUM_NONE.
*/
-unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
unsigned char *data = skb->data;
@@ -3235,8 +3271,8 @@ normal:
skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
- skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
- SKBTX_SHARED_FRAG;
+ skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
+ SKBTX_SHARED_FRAG;
while (pos < offset + len) {
if (i >= nfrags) {
@@ -3482,24 +3518,18 @@ void __init skb_init(void)
NULL);
}
-/**
- * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
- * @skb: Socket buffer containing the buffers to be mapped
- * @sg: The scatter-gather list to map into
- * @offset: The offset into the buffer's contents to start mapping
- * @len: Length of buffer space to be mapped
- *
- * Fill the specified scatter-gather list with mappings/pointers into a
- * region of the buffer space attached to a socket buffer.
- */
static int
-__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
+ unsigned int recursion_level)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int elt = 0;
+ if (unlikely(recursion_level >= 24))
+ return -EMSGSIZE;
+
if (copy > 0) {
if (copy > len)
copy = len;
@@ -3518,6 +3548,8 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+ return -EMSGSIZE;
if (copy > len)
copy = len;
@@ -3532,16 +3564,22 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
}
skb_walk_frags(skb, frag_iter) {
- int end;
+ int end, ret;
WARN_ON(start > offset + len);
end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
+ if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+ return -EMSGSIZE;
+
if (copy > len)
copy = len;
- elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
- copy);
+ ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
+ copy, recursion_level + 1);
+ if (unlikely(ret < 0))
+ return ret;
+ elt += ret;
if ((len -= copy) == 0)
return elt;
offset += copy;
@@ -3552,6 +3590,31 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
return elt;
}
+/**
+ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ * @skb: Socket buffer containing the buffers to be mapped
+ * @sg: The scatter-gather list to map into
+ * @offset: The offset into the buffer's contents to start mapping
+ * @len: Length of buffer space to be mapped
+ *
+ * Fill the specified scatter-gather list with mappings/pointers into a
+ * region of the buffer space attached to a socket buffer. Returns either
+ * the number of scatterlist items used, or -EMSGSIZE if the contents
+ * could not fit.
+ */
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+ int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
+
+ if (nsg <= 0)
+ return nsg;
+
+ sg_mark_end(&sg[nsg - 1]);
+
+ return nsg;
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
+
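
With the recursion limit and the sg_is_last() overflow checks added above, skb_to_sgvec() can now fail with -EMSGSIZE instead of silently overrunning the scatterlist, so every caller has to check for a negative return before trusting the count. A hedged caller sketch, with an arbitrary table size:

#include <linux/scatterlist.h>
#include <linux/skbuff.h>

#define EXAMPLE_MAX_SG	32		/* arbitrary fixed table size */

/* Map an skb into a fixed-size scatterlist, propagating -EMSGSIZE. */
static int example_map_skb(struct sk_buff *skb, struct scatterlist *sg)
{
	int nsg;

	sg_init_table(sg, EXAMPLE_MAX_SG);

	nsg = skb_to_sgvec(skb, sg, 0, skb->len);
	if (nsg < 0)
		return nsg;		/* -EMSGSIZE: skb did not fit */

	return nsg;			/* entries used; the last one is marked */
}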
/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps skb to the given
 * sglist without marking the sg which contains the last skb data as the end.
 * So the caller can manipulate the sg list at will when padding new data after
@@ -3574,19 +3637,11 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
int offset, int len)
{
- return __skb_to_sgvec(skb, sg, offset, len);
+ return __skb_to_sgvec(skb, sg, offset, len, 0);
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
-int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
-{
- int nsg = __skb_to_sgvec(skb, sg, offset, len);
- sg_mark_end(&sg[nsg - 1]);
-
- return nsg;
-}
-EXPORT_SYMBOL_GPL(skb_to_sgvec);
/**
* skb_cow_data - Check that a socket buffer's data buffers are writable
@@ -3754,8 +3809,11 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
- if (skb && (skb_next = skb_peek(q)))
+ if (skb && (skb_next = skb_peek(q))) {
icmp_next = is_icmp_err_skb(skb_next);
+ if (icmp_next)
+ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin;
+ }
spin_unlock_irqrestore(&q->lock, flags);
if (is_icmp_err_skb(skb) && !icmp_next)
@@ -3786,7 +3844,7 @@ struct sk_buff *skb_clone_sk(struct sk_buff *skb)
struct sock *sk = skb->sk;
struct sk_buff *clone;
- if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
+ if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
return NULL;
clone = skb_clone(skb, GFP_ATOMIC);
@@ -3857,7 +3915,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
/* Take a reference to prevent skb_orphan() from freeing the socket,
* but only if the socket refcount is not zero.
*/
- if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
*skb_hwtstamps(skb) = *hwtstamps;
__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
sock_put(sk);
@@ -3875,6 +3933,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
+ skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
+ return;
+
tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
if (!skb_may_tx_timestamp(sk, tsonly))
return;
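
The new early return suppresses the software TX timestamp when a hardware timestamp is already in progress, unless the application opted into receiving both via SOF_TIMESTAMPING_OPT_TX_SWHW (added elsewhere in this series). A hedged userspace sketch of requesting both reports; flag availability depends on your kernel/libc headers and error handling is elided:

#include <sys/socket.h>
#include <linux/net_tstamp.h>

/* Ask for software and hardware TX timestamps, and opt in to getting
 * both reports for the same packet (SOF_TIMESTAMPING_OPT_TX_SWHW).
 */
static int example_enable_tx_tstamps(int fd)
{
	unsigned int val = SOF_TIMESTAMPING_TX_HARDWARE |
			   SOF_TIMESTAMPING_TX_SOFTWARE |
			   SOF_TIMESTAMPING_RAW_HARDWARE |
			   SOF_TIMESTAMPING_SOFTWARE |
			   SOF_TIMESTAMPING_OPT_TX_SWHW;

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}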
@@ -3896,7 +3958,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
return;
if (tsonly) {
- skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags;
+ skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
+ SKBTX_ANY_TSTAMP;
skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
}
@@ -3934,7 +3997,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
/* Take a reference to prevent skb_orphan() from freeing the socket,
* but only if the socket refcount is not zero.
*/
- if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+ if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
err = sock_queue_err_skb(sk, skb);
sock_put(sk);
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 79c6aee6af9b..ac2a404c73eb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,10 +139,7 @@
#include <trace/events/sock.h>
-#ifdef CONFIG_INET
#include <net/tcp.h>
-#endif
-
#include <net/busy_poll.h>
static DEFINE_MUTEX(proto_list_mutex);
@@ -1041,6 +1038,10 @@ set_rcvbuf:
#endif
case SO_MAX_PACING_RATE:
+ if (val != ~0U)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
sk->sk_max_pacing_rate = val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
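
The cmpxchg() flips sk_pacing_status from SK_PACING_NONE to SK_PACING_NEEDED the first time an application asks for a finite pacing rate, so the pacing machinery (fq and friends) knows this socket wants pacing; a value of ~0U ("unlimited") deliberately leaves the status untouched. A minimal userspace sketch of setting the cap (rate in bytes per second, error handling elided):

#include <sys/socket.h>

/* Cap this socket's pacing rate at roughly 1 MB/s. */
static int example_set_pacing(int fd)
{
	unsigned int rate = 1000000;	/* bytes per second */

	return setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
			  &rate, sizeof(rate));
}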
@@ -1077,6 +1078,18 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
+static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+
+ for (i = 0; i < src->ngroups; i++)
+ if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ return -EFAULT;
+
+ return 0;
+}
+
int sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -1230,6 +1243,27 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
goto lenout;
}
+ case SO_PEERGROUPS:
+ {
+ int ret, n;
+
+ if (!sk->sk_peer_cred)
+ return -ENODATA;
+
+ n = sk->sk_peer_cred->group_info->ngroups;
+ if (len < n * sizeof(gid_t)) {
+ len = n * sizeof(gid_t);
+ return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ }
+ len = n * sizeof(gid_t);
+
+ ret = groups_to_user((gid_t __user *)optval,
+ sk->sk_peer_cred->group_info);
+ if (ret)
+ return ret;
+ goto lenout;
+ }
+
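
SO_PEERGROUPS reports the supplementary groups of the peer as captured at connect()/accept() time. When the supplied buffer is too small, the kernel writes the required length back through optlen and fails with -ERANGE, which suggests the two-pass pattern sketched below. Userspace side, error handling abbreviated (the empty-group-list corner case is ignored); the fallback constant is taken from asm-generic/socket.h and only needed with older headers:

#include <errno.h>
#include <stdlib.h>
#include <sys/socket.h>

#ifndef SO_PEERGROUPS
#define SO_PEERGROUPS 59		/* value from asm-generic/socket.h */
#endif

/* Query the peer's supplementary groups, retrying once on ERANGE. */
static gid_t *example_peer_groups(int fd, socklen_t *n_out)
{
	socklen_t len = 0;
	gid_t *groups;

	/* First pass with len == 0: expect ERANGE plus the needed length. */
	if (getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, NULL, &len) == 0 ||
	    errno != ERANGE)
		return NULL;

	groups = malloc(len);
	if (!groups ||
	    getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, groups, &len) < 0) {
		free(groups);
		return NULL;
	}

	*n_out = len / sizeof(gid_t);
	return groups;
}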
case SO_PEERNAME:
{
char address[128];
@@ -1494,7 +1528,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
if (likely(sk->sk_net_refcnt))
get_net(net);
sock_net_set(sk, net);
- atomic_set(&sk->sk_wmem_alloc, 1);
+ refcount_set(&sk->sk_wmem_alloc, 1);
mem_cgroup_sk_alloc(sk);
cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -1518,7 +1552,7 @@ static void __sk_destruct(struct rcu_head *head)
sk->sk_destruct(sk);
filter = rcu_dereference_check(sk->sk_filter,
- atomic_read(&sk->sk_wmem_alloc) == 0);
+ refcount_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
@@ -1568,7 +1602,7 @@ void sk_free(struct sock *sk)
* some packets are still in some tx queue.
* If not null, sock_wfree() will call __sk_free(sk) later
*/
- if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+ if (refcount_dec_and_test(&sk->sk_wmem_alloc))
__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
@@ -1625,7 +1659,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
/*
* sk_wmem_alloc set to one (see sk_free() and sock_wfree())
*/
- atomic_set(&newsk->sk_wmem_alloc, 1);
+ refcount_set(&newsk->sk_wmem_alloc, 1);
atomic_set(&newsk->sk_omem_alloc, 0);
sk_init_common(newsk);
@@ -1674,7 +1708,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
- atomic_set(&newsk->sk_refcnt, 2);
+ refcount_set(&newsk->sk_refcnt, 2);
/*
* Increment the counter in the same struct proto as the master
@@ -1753,7 +1787,7 @@ void sock_wfree(struct sk_buff *skb)
* Keep a reference on sk_wmem_alloc, this will be released
* after sk_write_space() call
*/
- atomic_sub(len - 1, &sk->sk_wmem_alloc);
+ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
sk->sk_write_space(sk);
len = 1;
}
@@ -1761,7 +1795,7 @@ void sock_wfree(struct sk_buff *skb)
* if sk_wmem_alloc reaches 0, we must finish what sk_free()
* could not do because of in-flight packets
*/
- if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
+ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
@@ -1773,7 +1807,7 @@ void __sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
- if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+ if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
__sk_free(sk);
}
@@ -1795,7 +1829,7 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 * is enough to guarantee sk_free() won't free this sock until
* all in-flight packets are completed
*/
- atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
@@ -1803,28 +1837,24 @@ EXPORT_SYMBOL(skb_set_owner_w);
* delay queue. We want to allow the owner socket to send more
* packets, as if they were already TX completed by a typical driver.
* But we also want to keep skb->sk set because some packet schedulers
- * rely on it (sch_fq for example). So we set skb->truesize to a small
- * amount (1) and decrease sk_wmem_alloc accordingly.
+ * rely on it (sch_fq for example).
*/
void skb_orphan_partial(struct sk_buff *skb)
{
- /* If this skb is a TCP pure ACK or already went here,
- * we have nothing to do. 2 is already a very small truesize.
- */
- if (skb->truesize <= 2)
+ if (skb_is_tcp_pure_ack(skb))
return;
- /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
- * so we do not completely orphan skb, but transfert all
- * accounted bytes but one, to avoid unexpected reorders.
- */
if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
|| skb->destructor == tcp_wfree
#endif
) {
- atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
- skb->truesize = 1;
+ struct sock *sk = skb->sk;
+
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
+ skb->destructor = sock_efree;
+ }
} else {
skb_orphan(skb);
}
@@ -1882,7 +1912,7 @@ EXPORT_SYMBOL(sock_i_ino);
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
struct sk_buff *skb = alloc_skb(size, priority);
if (skb) {
skb_set_owner_w(skb, sk);
@@ -1957,7 +1987,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
break;
@@ -2079,6 +2109,26 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
}
EXPORT_SYMBOL(sock_cmsg_send);
+static void sk_enter_memory_pressure(struct sock *sk)
+{
+ if (!sk->sk_prot->enter_memory_pressure)
+ return;
+
+ sk->sk_prot->enter_memory_pressure(sk);
+}
+
+static void sk_leave_memory_pressure(struct sock *sk)
+{
+ if (sk->sk_prot->leave_memory_pressure) {
+ sk->sk_prot->leave_memory_pressure(sk);
+ } else {
+ unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
+
+ if (memory_pressure && *memory_pressure)
+ *memory_pressure = 0;
+ }
+}
+
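
sk_enter_memory_pressure()/sk_leave_memory_pressure() centralize the dispatch to the per-protocol hooks: a protocol may supply an explicit leave_memory_pressure() callback, or just expose a memory_pressure flag that the generic helper clears. A hedged sketch of the struct proto fields involved; the example protocol is made up and only the pressure-related fields are shown:

#include <net/sock.h>

static unsigned long example_memory_pressure;

static void example_enter_memory_pressure(struct sock *sk)
{
	example_memory_pressure = 1;	/* cleared by sk_leave_memory_pressure() */
}

static struct proto example_prot = {
	.name			= "EXAMPLE",
	.enter_memory_pressure	= example_enter_memory_pressure,
	/* no .leave_memory_pressure: the generic helper clears the flag */
	.memory_pressure	= &example_memory_pressure,
};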
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER get_order(32768)
@@ -2260,7 +2310,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
if (sk->sk_type == SOCK_STREAM) {
if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
return 1;
- } else if (atomic_read(&sk->sk_wmem_alloc) <
+ } else if (refcount_read(&sk->sk_wmem_alloc) <
prot->sysctl_wmem[0])
return 1;
}
@@ -2527,7 +2577,7 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
@@ -2637,7 +2687,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
- atomic_set(&sk->sk_refcnt, 1);
+ refcount_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
@@ -2682,9 +2732,12 @@ EXPORT_SYMBOL(release_sock);
* @sk: socket
*
 * This version should be used for very small sections, where the process won't block
- * return false if fast path is taken
+ * return false if fast path is taken:
+ *
* sk_lock.slock locked, owned = 0, BH disabled
- * return true if slow path is taken
+ *
+ * return true if slow path is taken:
+ *
* sk_lock.slock unlocked, owned = 1, BH enabled
*/
bool lock_sock_fast(struct sock *sk)
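
lock_sock_fast() pairs with unlock_sock_fast(), which needs the returned boolean to know whether the slow (owned) path was taken; on the fast path the critical section runs with BHs disabled, so it must stay short and non-blocking. A hedged usage sketch with a trivial body:

#include <net/sock.h>

/* Typical pairing: feed the return value back to unlock_sock_fast(). */
static void example_locked_update(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	sk->sk_err = 0;			/* short, non-blocking critical section */

	unlock_sock_fast(sk, slow);
}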
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index ea23254b2457..b7cd9aafe99e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -479,8 +479,6 @@ static __net_init int sysctl_core_net_init(struct net *net)
{
struct ctl_table *tbl;
- net->core.sysctl_somaxconn = SOMAXCONN;
-
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);