From d1174416747d790d750742d0514915deeed93acf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 May 2017 11:22:52 -0700 Subject: bpf: Track alignment of register values in the verifier. Currently if we add only constant values to pointers we can fully validate the alignment, and properly check if we need to reject the program on !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS architectures. However, once an unknown value is introduced we only allow byte sized memory accesses which is too restrictive. Add logic to track the known minimum alignment of register values, and propagate this state into registers containing pointers. The most common paradigm that makes use of this new logic is computing the transport header using the IP header length field. For example: struct ethhdr *ep = skb->data; struct iphdr *iph = (struct iphdr *) (ep + 1); struct tcphdr *th; ... n = iph->ihl; th = ((void *)iph + (n * 4)); port = th->dest; The existing code will reject the load of th->dest because it cannot validate that the alignment is at least 2 once "n * 4" is added the the packet pointer. In the new code, the register holding "n * 4" will have a reg->min_align value of 4, because any value multiplied by 4 will be at least 4 byte aligned. (actually, the eBPF code emitted by the compiler in this case is most likely to use a shift left by 2, but the end result is identical) At the critical addition: th = ((void *)iph + (n * 4)); The register holding 'th' will start with reg->off value of 14. The pointer addition will transform that reg into something that looks like: reg->aux_off = 14 reg->aux_off_align = 4 Next, the verifier will look at the th->dest load, and it will see a load offset of 2, and first check: if (reg->aux_off_align % size) which will pass because aux_off_align is 4. reg_off will be computed: reg_off = reg->off; ... reg_off += reg->aux_off; plus we have off==2, and it will thus check: if ((NET_IP_ALIGN + reg_off + off) % size != 0) which evaluates to: if ((NET_IP_ALIGN + 14 + 2) % size != 0) On strict alignment architectures, NET_IP_ALIGN is 2, thus: if ((2 + 14 + 2) % size != 0) which passes. These pointer transformations and checks work regardless of whether the constant offset or the variable with known alignment is added first to the pointer register. Signed-off-by: David S. Miller Acked-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5efb4db44e1e..7c6a51924afc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -40,6 +40,9 @@ struct bpf_reg_state { */ s64 min_value; u64 max_value; + u32 min_align; + u32 aux_off; + u32 aux_off_align; }; enum bpf_stack_slot_type { -- cgit From e07b98d9bffe410019dfcf62c3428d4a96c56a2c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 May 2017 11:38:07 -0700 Subject: bpf: Add strict alignment flag for BPF_PROG_LOAD. Add a new field, "prog_flags", and an initial flag value BPF_F_STRICT_ALIGNMENT. When set, the verifier will enforce strict pointer alignment regardless of the setting of CONFIG_EFFICIENT_UNALIGNED_ACCESS. The verifier, in this mode, will also use a fixed value of "2" in place of NET_IP_ALIGN. This facilitates test cases that will exercise and validate this part of the verifier even when run on architectures where alignment doesn't matter. Signed-off-by: David S. Miller Acked-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7c6a51924afc..d5093b52b485 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -90,6 +90,7 @@ struct bpf_verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ + bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 945a1f5f63c5..94dfa9def355 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,13 @@ enum bpf_attach_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. + */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -177,6 +184,7 @@ union bpf_attr { __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 prog_flags; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit From 0489df9a430e9607de8130a6bc4bf4c02f96eaf1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 12 May 2017 01:04:45 +0200 Subject: xdp: add flag to enforce driver mode After commit b5cdae3291f7 ("net: Generic XDP") we automatically fall back to a generic XDP variant if the driver does not support native XDP. Allow for an option where the user can specify that always the native XDP variant should be selected and in case it's not supported by a driver, just bail out. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8e56ac70e0d1..549ac8a98b44 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -888,9 +888,11 @@ enum { /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) -#define XDP_FLAGS_SKB_MODE (2U << 0) +#define XDP_FLAGS_SKB_MODE (1U << 1) +#define XDP_FLAGS_DRV_MODE (1U << 2) #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ - XDP_FLAGS_SKB_MODE) + XDP_FLAGS_SKB_MODE | \ + XDP_FLAGS_DRV_MODE) enum { IFLA_XDP_UNSPEC, -- cgit From d67b9cd28c1d7f82c2e5e727731ea7c89b23a0a8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 12 May 2017 01:04:46 +0200 Subject: xdp: refine xdp api with regards to generic xdp While working on the iproute2 generic XDP frontend, I noticed that as of right now it's possible to have native *and* generic XDP programs loaded both at the same time for the case when a driver supports native XDP. The intended model for generic XDP from b5cdae3291f7 ("net: Generic XDP") is, however, that only one out of the two can be present at once which is also indicated as such in the XDP netlink dump part. The main rationale for generic XDP is to ease accessibility (in case a driver does not yet have XDP support) and to generically provide a semantical model as an example for driver developers wanting to add XDP support. The generic XDP option for an XDP aware driver can still be useful for comparing and testing both implementations. However, it is not intended to have a second XDP processing stage or layer with exactly the same functionality of the first native stage. Only reason could be to have a partial fallback for future XDP features that are not supported yet in the native implementation and we probably also shouldn't strive for such fallback and instead encourage native feature support in the first place. Given there's currently no such fallback issue or use case, lets not go there yet if we don't need to. Therefore, change semantics for loading XDP and bail out if the user tries to load a generic XDP program when a native one is present and vice versa. Another alternative to bailing out would be to handle the transition from one flavor to another gracefully, but that would require to bring the device down, exchange both types of programs, and bring it up again in order to avoid a tiny window where a packet could hit both hooks. Given this complicates the logic for just a debugging feature in the native case, I went with the simpler variant. For the dump, remove IFLA_XDP_FLAGS that was added with b5cdae3291f7 and reuse IFLA_XDP_ATTACHED for indicating the mode. Dumping all or just a subset of flags that were used for loading the XDP prog is suboptimal in the long run since not all flags are useful for dumping and if we start to reuse the same flag definitions for load and dump, then we'll waste bit space. What we really just want is to dump the mode for now. Current IFLA_XDP_ATTACHED semantics are: nothing was installed (0), a program is running at the native driver layer (1). Thus, add a mode that says that a program is running at generic XDP layer (2). Applications will handle this fine in that older binaries will just indicate that something is attached at XDP layer, effectively this is similar to IFLA_XDP_FLAGS attr that we would have had modulo the redundancy. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++-- include/uapi/linux/if_link.h | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9c23bd2efb56..3f39d27decf4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3296,11 +3296,15 @@ int dev_get_phys_port_id(struct net_device *dev, int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, u32 flags); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); + +typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, u32 flags); +bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op); + int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 549ac8a98b44..15ac20382aba 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -894,6 +894,13 @@ enum { XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE) +/* These are stored into IFLA_XDP_ATTACHED on dump. */ +enum { + XDP_ATTACHED_NONE = 0, + XDP_ATTACHED_DRV, + XDP_ATTACHED_SKB, +}; + enum { IFLA_XDP_UNSPEC, IFLA_XDP_FD, -- cgit From 508541146af18e43072e41a31aa62fac2b01aac1 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 25 Apr 2017 10:39:57 +0300 Subject: net/mlx5: Use underlay QPN from the root name space Root flow table is dynamically changed by the underlying flow steering layer, and IPoIB/ULPs have no idea what will be the root flow table in the future, hence we need a dynamic infrastructure to move Underlay QPs with the root flow table. Fixes: b3ba51498bdd ("net/mlx5: Refactor create flow table method to accept underlay QP") Signed-off-by: Erez Shitrit Signed-off-by: Maor Gottlieb Signed-off-by: Yishai Hadas Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 1b166d2e19c5..b25e7baa273e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -109,7 +109,6 @@ struct mlx5_flow_table_attr { int max_fte; u32 level; u32 flags; - u32 underlay_qpn; }; struct mlx5_flow_table * @@ -167,4 +166,7 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); +int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn); +int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn); + #endif -- cgit