From e029f541039ff0768960ede62b946bcf4a163dec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 26 Jan 2018 19:50:00 -0800 Subject: netdevsim: fix overflow on the error path Undo loop condition on the error path would cause the i counter to go below zero, if allocation failure happened with the first (i.e. 0th) element of the array. Fixes: 395cacb5f1a0 ("netdevsim: bpf: support fake map offload") Reported-by: Dan Carpenter Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: Daniel Borkmann --- drivers/net/netdevsim/bpf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index de73c1ff0939..75c25306d234 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -480,8 +480,7 @@ static int nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap; - unsigned int i; - int err; + int i, err; if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY && offmap->map.map_type != BPF_MAP_TYPE_HASH)) @@ -518,7 +517,7 @@ nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) return 0; err_free: - while (--i) { + while (--i >= 0) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } -- cgit From 62a06994ced17f295fc51ea0815580ee7ccb668d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 29 Jan 2018 21:23:28 -0800 Subject: tools/bpf: permit selftests/bpf to be built in a different directory Fix a couple of issues at tools/testing/selftests/bpf/Makefile so the following command make -C tools/testing/selftests/bpf OUTPUT=/home/yhs/tmp can put the built results into a different directory. Also add the built binary test_tcpbpf_user in the .gitignore file. 
Fixes: 6882804c916b ("selftests/bpf: add a test for overlapping packet range checks") Fixes: 9d1f15941967 ("bpf: move cgroup_helpers from samples/bpf/ to tools/testing/selftesting/bpf/") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 1e09d77f1948..cc15af2e54fe 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -8,5 +8,6 @@ fixdep test_align test_dev_cgroup test_progs +test_tcpbpf_user test_verifier_log feature diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index bf05bc5e36e5..566d6adc172a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -27,7 +27,7 @@ TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ include ../lib.mk -BPFOBJ := $(OUTPUT)/libbpf.a $(OUTPUT)/cgroup_helpers.c +BPFOBJ := $(OUTPUT)/libbpf.a cgroup_helpers.c $(TEST_GEN_PROGS): $(BPFOBJ) @@ -58,7 +58,7 @@ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \ $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline -%.o: %.c +$(OUTPUT)/%.o: %.c $(CLANG) $(CLANG_FLAGS) \ -O2 -target bpf -emit-llvm -c $< -o - | \ $(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@ -- cgit From 65073a67331de3d2cce35607807ddec284e75e81 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 31 Jan 2018 12:58:56 +0100 Subject: bpf: fix null pointer deref in bpf_prog_test_run_xdp syzkaller was able to generate the following XDP program ... (18) r0 = 0x0 (61) r5 = *(u32 *)(r1 +12) (04) (u32) r0 += (u32) 0 (95) exit ... and trigger a NULL pointer dereference in ___bpf_prog_run() via bpf_prog_test_run_xdp() where this was attempted to run. 
Reason is that recent xdp_rxq_info addition to XDP programs updated all drivers, but not bpf_prog_test_run_xdp(), where xdp_buff is set up. Thus when context rewriter does the deref on the netdev it's NULL at runtime. Fix it by using xdp_rxq from loopback dev. __netif_get_rx_queue() helper can also be reused in various other locations later on. Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs") Reported-by: syzbot+1eb094057b338eb1fc00@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 6 ++++++ net/bpf/test_run.c | 4 ++++ tools/testing/selftests/bpf/test_verifier.c | 14 ++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4c77f39ebd65..5eef6c8e2741 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3228,6 +3228,12 @@ static inline int netif_set_real_num_rx_queues(struct net_device *dev, } #endif +static inline struct netdev_rx_queue * +__netif_get_rx_queue(struct net_device *dev, unsigned int rxq) +{ + return dev->_rx + rxq; +} + #ifdef CONFIG_SYSFS static inline unsigned int get_netdev_rx_queue_index( struct netdev_rx_queue *queue) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a86e6687026e..2ced48662c1f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -151,6 +151,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, { u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; + struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; void *data; @@ -165,6 +166,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; + rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); + xdp.rxq = &rxqueue->xdp_rxq; + retval = 
bpf_test_run(prog, &xdp, repeat, &duration); if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN) size = xdp.data_end - xdp.data; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 697bd83de295..c0f16e93f9bd 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -7779,6 +7779,20 @@ static struct bpf_test tests[] = { .errstr = "unknown opcode d7", .result = REJECT, }, + { + "XDP, using ifindex from netdev", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, ingress_ifindex)), + BPF_JMP_IMM(BPF_JLT, BPF_REG_2, 1, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_XDP, + .retval = 1, + }, { "meta access, test1", .insns = { -- cgit From 6215ea6b7ee7bdaf308bd72f01dbf62dec70cdad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 1 Feb 2018 23:00:11 -0800 Subject: bpf: add documentation to compare clang "-target bpf" and default target The added documentation explains how generated codes may differ between clang bpf target and default target, and when to use each target. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- Documentation/bpf/bpf_devel_QA.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Documentation/bpf/bpf_devel_QA.txt b/Documentation/bpf/bpf_devel_QA.txt index cefef855dea4..84cbb302f2b5 100644 --- a/Documentation/bpf/bpf_devel_QA.txt +++ b/Documentation/bpf/bpf_devel_QA.txt @@ -516,4 +516,35 @@ A: LLVM has a -mcpu selector for the BPF back end in order to allow the By the way, the BPF kernel selftests run with -mcpu=probe for better test coverage. +Q: In some cases clang flag "-target bpf" is used but in other cases the + default clang target, which matches the underlying architecture, is used. + What is the difference and when I should use which? 
+ +A: Although LLVM IR generation and optimization try to stay architecture + independent, "-target <arch>" still has some impact on generated code: + + - BPF program may recursively include header file(s) with file scope + inline assembly codes. The default target can handle this well, + while bpf target may fail if bpf backend assembler does not + understand these assembly codes, which is true in most cases. + + - When compiled without -g, additional elf sections, e.g., + .eh_frame and .rela.eh_frame, may be present in the object file + with default target, but not with bpf target. + + - The default target may turn a C switch statement into a switch table + lookup and jump operation. Since the switch table is placed + in the global readonly section, the bpf program will fail to load. + The bpf target does not support switch table optimization. + The clang option "-fno-jump-tables" can be used to disable + switch table generation. + + You should use default target when: + + - Your program includes a header file, e.g., ptrace.h, which eventually + pulls in some header files containing file scope host assembly codes. + - You can add "-fno-jump-tables" to work around the switch table issue. + + Otherwise, you can use bpf target. + Happy BPF hacking! -- cgit From 0911287ce32b14fbc8aab0083151d9b54254091c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Feb 2018 15:14:05 -0800 Subject: bpf: fix bpf_prog_array_copy_to_user() issues 1. move copy_to_user out of rcu section to fix the following issue: ./include/linux/rcupdate.h:302 Illegal context switch in RCU read-side critical section! 
stack backtrace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4592 rcu_preempt_sleep_check include/linux/rcupdate.h:301 [inline] ___might_sleep+0x385/0x470 kernel/sched/core.c:6079 __might_sleep+0x95/0x190 kernel/sched/core.c:6067 __might_fault+0xab/0x1d0 mm/memory.c:4532 _copy_to_user+0x2c/0xc0 lib/usercopy.c:25 copy_to_user include/linux/uaccess.h:155 [inline] bpf_prog_array_copy_to_user+0x217/0x4d0 kernel/bpf/core.c:1587 bpf_prog_array_copy_info+0x17b/0x1c0 kernel/bpf/core.c:1685 perf_event_query_prog_array+0x196/0x280 kernel/trace/bpf_trace.c:877 _perf_ioctl kernel/events/core.c:4737 [inline] perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4757 2. move *prog under rcu, since it's not ok to dereference it afterwards 3. in a rare case of prog array being swapped between bpf_prog_array_length() and bpf_prog_array_copy_to_user() calls make sure to copy zeros to user space, so the user doesn't walk over uninited prog_ids while kernel reported uattr->query.prog_cnt > 0 Reported-by: syzbot+7dbcd2d3b85f9b608b23@syzkaller.appspotmail.com Fixes: 468e2f64d220 ("bpf: introduce BPF_PROG_QUERY command") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5f35f93dcab2..29ca9208dcfa 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1576,25 +1576,41 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; - u32 i = 0, id; - + unsigned long err = 0; + u32 i = 0, *ids; + bool nospc; + + /* users of this function are doing: + * cnt = bpf_prog_array_length(); + * if (cnt > 0) + * bpf_prog_array_copy_to_user(..., cnt); + * so below kcalloc doesn't need extra cnt > 0 check, but + * bpf_prog_array_length() releases rcu lock and + * 
prog array could have been swapped with empty or larger array, + * so always copy 'cnt' prog_ids to the user. + * In a rare race the user will see zero prog_ids + */ + ids = kcalloc(cnt, sizeof(u32), GFP_USER); + if (!ids) + return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) { if (*prog == &dummy_bpf_prog.prog) continue; - id = (*prog)->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) { - rcu_read_unlock(); - return -EFAULT; - } + ids[i] = (*prog)->aux->id; if (++i == cnt) { prog++; break; } } + nospc = !!(*prog); rcu_read_unlock(); - if (*prog) + err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); + kfree(ids); + if (err) + return -EFAULT; + if (nospc) return -ENOSPC; return 0; } -- cgit From dc2b9f19e3bdaa87a7c3d123b8bba8a42d96d942 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Tue, 30 Jan 2018 21:55:00 +0100 Subject: tools: add netlink.h and if_link.h in tools uapi The headers are necessary for libbpf compilation on system with older version of the headers. 
Signed-off-by: Eric Leblond Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/if_link.h | 943 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/netlink.h | 251 ++++++++++ tools/lib/bpf/Makefile | 6 + 3 files changed, 1200 insertions(+) create mode 100644 tools/include/uapi/linux/if_link.h create mode 100644 tools/include/uapi/linux/netlink.h diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h new file mode 100644 index 000000000000..8616131e2c61 --- /dev/null +++ b/tools/include/uapi/linux/if_link.h @@ -0,0 +1,943 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IF_LINK_H +#define _UAPI_LINUX_IF_LINK_H + +#include <linux/types.h> +#include <linux/netlink.h> + +/* This struct should be in sync with struct rtnl_link_stats64 */ +struct rtnl_link_stats { + __u32 rx_packets; /* total packets received */ + __u32 tx_packets; /* total packets transmitted */ + __u32 rx_bytes; /* total bytes received */ + __u32 tx_bytes; /* total bytes transmitted */ + __u32 rx_errors; /* bad packets received */ + __u32 tx_errors; /* packet transmit problems */ + __u32 rx_dropped; /* no space in linux buffers */ + __u32 tx_dropped; /* no space available in linux */ + __u32 multicast; /* multicast packets received */ + __u32 collisions; + + /* detailed rx_errors: */ + __u32 rx_length_errors; + __u32 rx_over_errors; /* receiver ring buff overflow */ + __u32 rx_crc_errors; /* recved pkt with crc error */ + __u32 rx_frame_errors; /* recv'd frame alignment error */ + __u32 rx_fifo_errors; /* recv'r fifo overrun */ + __u32 rx_missed_errors; /* receiver missed packet */ + + /* detailed tx_errors */ + __u32 tx_aborted_errors; + __u32 tx_carrier_errors; + __u32 tx_fifo_errors; + __u32 tx_heartbeat_errors; + __u32 tx_window_errors; + + /* for cslip etc */ + __u32 rx_compressed; + __u32 tx_compressed; + + __u32 rx_nohandler; /* dropped, no handler found */ +}; + +/* The main device statistics structure */ +struct rtnl_link_stats64 { + 
__u64 rx_packets; /* total packets received */ + __u64 tx_packets; /* total packets transmitted */ + __u64 rx_bytes; /* total bytes received */ + __u64 tx_bytes; /* total bytes transmitted */ + __u64 rx_errors; /* bad packets received */ + __u64 tx_errors; /* packet transmit problems */ + __u64 rx_dropped; /* no space in linux buffers */ + __u64 tx_dropped; /* no space available in linux */ + __u64 multicast; /* multicast packets received */ + __u64 collisions; + + /* detailed rx_errors: */ + __u64 rx_length_errors; + __u64 rx_over_errors; /* receiver ring buff overflow */ + __u64 rx_crc_errors; /* recved pkt with crc error */ + __u64 rx_frame_errors; /* recv'd frame alignment error */ + __u64 rx_fifo_errors; /* recv'r fifo overrun */ + __u64 rx_missed_errors; /* receiver missed packet */ + + /* detailed tx_errors */ + __u64 tx_aborted_errors; + __u64 tx_carrier_errors; + __u64 tx_fifo_errors; + __u64 tx_heartbeat_errors; + __u64 tx_window_errors; + + /* for cslip etc */ + __u64 rx_compressed; + __u64 tx_compressed; + + __u64 rx_nohandler; /* dropped, no handler found */ +}; + +/* The struct should be in sync with struct ifmap */ +struct rtnl_link_ifmap { + __u64 mem_start; + __u64 mem_end; + __u64 base_addr; + __u16 irq; + __u8 dma; + __u8 port; +}; + +/* + * IFLA_AF_SPEC + * Contains nested attributes for address family specific attributes. + * Each address family may create a attribute with the address family + * number as type and create its own attribute structure in it. 
+ * + * Example: + * [IFLA_AF_SPEC] = { + * [AF_INET] = { + * [IFLA_INET_CONF] = ..., + * }, + * [AF_INET6] = { + * [IFLA_INET6_FLAGS] = ..., + * [IFLA_INET6_CONF] = ..., + * } + * } + */ + +enum { + IFLA_UNSPEC, + IFLA_ADDRESS, + IFLA_BROADCAST, + IFLA_IFNAME, + IFLA_MTU, + IFLA_LINK, + IFLA_QDISC, + IFLA_STATS, + IFLA_COST, +#define IFLA_COST IFLA_COST + IFLA_PRIORITY, +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER, +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO, /* Protocol specific information for a link */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN, +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP, +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT, +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE, + IFLA_LINKMODE, + IFLA_LINKINFO, +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID, + IFLA_IFALIAS, + IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ + IFLA_VFINFO_LIST, + IFLA_STATS64, + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + __IFLA_MAX +}; + + +#define IFLA_MAX (__IFLA_MAX - 1) + +/* backwards compatibility for userspace */ +#ifndef __KERNEL__ +#define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) +#define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg)) +#endif + +enum { + IFLA_INET_UNSPEC, + IFLA_INET_CONF, + 
__IFLA_INET_MAX, +}; + +#define IFLA_INET_MAX (__IFLA_INET_MAX - 1) + +/* ifi_flags. + + IFF_* flags. + + The only change is: + IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are + more not changeable by user. They describe link media + characteristics and set by device driver. + + Comments: + - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid + - If neither of these three flags are set; + the interface is NBMA. + + - IFF_MULTICAST does not mean anything special: + multicasts can be used on all not-NBMA links. + IFF_MULTICAST means that this media uses special encapsulation + for multicast frames. Apparently, all IFF_POINTOPOINT and + IFF_BROADCAST devices are able to use multicasts too. + */ + +/* IFLA_LINK. + For usual devices it is equal ifi_index. + If it is a "virtual interface" (f.e. tunnel), ifi_link + can point to real physical interface (f.e. for bandwidth calculations), + or maybe 0, what means, that real media is unknown (usual + for IPIP tunnels, when route to endpoint is allowed to change) + */ + +/* Subtype attributes for IFLA_PROTINFO */ +enum { + IFLA_INET6_UNSPEC, + IFLA_INET6_FLAGS, /* link flags */ + IFLA_INET6_CONF, /* sysctl parameters */ + IFLA_INET6_STATS, /* statistics */ + IFLA_INET6_MCAST, /* MC things. What of them? 
*/ + IFLA_INET6_CACHEINFO, /* time values and max reasm size */ + IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ + IFLA_INET6_TOKEN, /* device token */ + IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + __IFLA_INET6_MAX +}; + +#define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) + +enum in6_addr_gen_mode { + IN6_ADDR_GEN_MODE_EUI64, + IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, +}; + +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, + IFLA_BR_VLAN_FILTERING, + IFLA_BR_VLAN_PROTOCOL, + IFLA_BR_GROUP_FWD_MASK, + IFLA_BR_ROOT_ID, + IFLA_BR_BRIDGE_ID, + IFLA_BR_ROOT_PORT, + IFLA_BR_ROOT_PATH_COST, + IFLA_BR_TOPOLOGY_CHANGE, + IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + IFLA_BR_HELLO_TIMER, + IFLA_BR_TCN_TIMER, + IFLA_BR_TOPOLOGY_CHANGE_TIMER, + IFLA_BR_GC_TIMER, + IFLA_BR_GROUP_ADDR, + IFLA_BR_FDB_FLUSH, + IFLA_BR_MCAST_ROUTER, + IFLA_BR_MCAST_SNOOPING, + IFLA_BR_MCAST_QUERY_USE_IFADDR, + IFLA_BR_MCAST_QUERIER, + IFLA_BR_MCAST_HASH_ELASTICITY, + IFLA_BR_MCAST_HASH_MAX, + IFLA_BR_MCAST_LAST_MEMBER_CNT, + IFLA_BR_MCAST_STARTUP_QUERY_CNT, + IFLA_BR_MCAST_LAST_MEMBER_INTVL, + IFLA_BR_MCAST_MEMBERSHIP_INTVL, + IFLA_BR_MCAST_QUERIER_INTVL, + IFLA_BR_MCAST_QUERY_INTVL, + IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, + IFLA_BR_MCAST_STARTUP_QUERY_INTVL, + IFLA_BR_NF_CALL_IPTABLES, + IFLA_BR_NF_CALL_IP6TABLES, + IFLA_BR_NF_CALL_ARPTABLES, + IFLA_BR_VLAN_DEFAULT_PVID, + IFLA_BR_PAD, + IFLA_BR_VLAN_STATS_ENABLED, + IFLA_BR_MCAST_STATS_ENABLED, + IFLA_BR_MCAST_IGMP_VERSION, + IFLA_BR_MCAST_MLD_VERSION, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + +struct ifla_bridge_id { + __u8 prio[2]; + __u8 addr[6]; /* ETH_ALEN */ +}; + +enum { + BRIDGE_MODE_UNSPEC, + BRIDGE_MODE_HAIRPIN, +}; + +enum { + IFLA_BRPORT_UNSPEC, + IFLA_BRPORT_STATE, /* Spanning tree state */ + IFLA_BRPORT_PRIORITY, /* " priority 
*/ + IFLA_BRPORT_COST, /* " cost */ + IFLA_BRPORT_MODE, /* mode (hairpin) */ + IFLA_BRPORT_GUARD, /* bpdu guard */ + IFLA_BRPORT_PROTECT, /* root port protection */ + IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ + IFLA_BRPORT_LEARNING, /* mac learning */ + IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ + IFLA_BRPORT_PROXYARP, /* proxy ARP */ + IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ + IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ + IFLA_BRPORT_ROOT_ID, /* designated root */ + IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ + IFLA_BRPORT_DESIGNATED_PORT, + IFLA_BRPORT_DESIGNATED_COST, + IFLA_BRPORT_ID, + IFLA_BRPORT_NO, + IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + IFLA_BRPORT_CONFIG_PENDING, + IFLA_BRPORT_MESSAGE_AGE_TIMER, + IFLA_BRPORT_FORWARD_DELAY_TIMER, + IFLA_BRPORT_HOLD_TIMER, + IFLA_BRPORT_FLUSH, + IFLA_BRPORT_MULTICAST_ROUTER, + IFLA_BRPORT_PAD, + IFLA_BRPORT_MCAST_FLOOD, + IFLA_BRPORT_MCAST_TO_UCAST, + IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, + __IFLA_BRPORT_MAX +}; +#define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) + +struct ifla_cacheinfo { + __u32 max_reasm_len; + __u32 tstamp; /* ipv6InterfaceTable updated timestamp */ + __u32 reachable_time; + __u32 retrans_time; +}; + +enum { + IFLA_INFO_UNSPEC, + IFLA_INFO_KIND, + IFLA_INFO_DATA, + IFLA_INFO_XSTATS, + IFLA_INFO_SLAVE_KIND, + IFLA_INFO_SLAVE_DATA, + __IFLA_INFO_MAX, +}; + +#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) + +/* VLAN section */ + +enum { + IFLA_VLAN_UNSPEC, + IFLA_VLAN_ID, + IFLA_VLAN_FLAGS, + IFLA_VLAN_EGRESS_QOS, + IFLA_VLAN_INGRESS_QOS, + IFLA_VLAN_PROTOCOL, + __IFLA_VLAN_MAX, +}; + +#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) + +struct ifla_vlan_flags { + __u32 flags; + __u32 mask; +}; + +enum { + IFLA_VLAN_QOS_UNSPEC, + IFLA_VLAN_QOS_MAPPING, + __IFLA_VLAN_QOS_MAX +}; + +#define IFLA_VLAN_QOS_MAX (__IFLA_VLAN_QOS_MAX - 1) + +struct ifla_vlan_qos_mapping { + __u32 from; + __u32 to; 
+}; + +/* MACVLAN section */ +enum { + IFLA_MACVLAN_UNSPEC, + IFLA_MACVLAN_MODE, + IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, + __IFLA_MACVLAN_MAX, +}; + +#define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) + +enum macvlan_mode { + MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ + MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ + MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ + MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, +}; + +#define MACVLAN_FLAG_NOPROMISC 1 + +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) + +enum { + IFLA_VRF_PORT_UNSPEC, + IFLA_VRF_PORT_TABLE, + __IFLA_VRF_PORT_MAX +}; + +#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) + +/* MACSEC section */ +enum { + IFLA_MACSEC_UNSPEC, + IFLA_MACSEC_SCI, + IFLA_MACSEC_PORT, + IFLA_MACSEC_ICV_LEN, + IFLA_MACSEC_CIPHER_SUITE, + IFLA_MACSEC_WINDOW, + IFLA_MACSEC_ENCODING_SA, + IFLA_MACSEC_ENCRYPT, + IFLA_MACSEC_PROTECT, + IFLA_MACSEC_INC_SCI, + IFLA_MACSEC_ES, + IFLA_MACSEC_SCB, + IFLA_MACSEC_REPLAY_PROTECT, + IFLA_MACSEC_VALIDATION, + IFLA_MACSEC_PAD, + __IFLA_MACSEC_MAX, +}; + +#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) + +enum macsec_validation_type { + MACSEC_VALIDATE_DISABLED = 0, + MACSEC_VALIDATE_CHECK = 1, + MACSEC_VALIDATE_STRICT = 2, + __MACSEC_VALIDATE_END, + MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, +}; + +/* IPVLAN section */ +enum { + IFLA_IPVLAN_UNSPEC, + IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, + __IFLA_IPVLAN_MAX +}; + +#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) + +enum ipvlan_mode { + IPVLAN_MODE_L2 = 0, + IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, + 
IPVLAN_MODE_MAX +}; + +#define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 + +/* VXLAN section */ +enum { + IFLA_VXLAN_UNSPEC, + IFLA_VXLAN_ID, + IFLA_VXLAN_GROUP, /* group or remote address */ + IFLA_VXLAN_LINK, + IFLA_VXLAN_LOCAL, + IFLA_VXLAN_TTL, + IFLA_VXLAN_TOS, + IFLA_VXLAN_LEARNING, + IFLA_VXLAN_AGEING, + IFLA_VXLAN_LIMIT, + IFLA_VXLAN_PORT_RANGE, /* source port */ + IFLA_VXLAN_PROXY, + IFLA_VXLAN_RSC, + IFLA_VXLAN_L2MISS, + IFLA_VXLAN_L3MISS, + IFLA_VXLAN_PORT, /* destination port */ + IFLA_VXLAN_GROUP6, + IFLA_VXLAN_LOCAL6, + IFLA_VXLAN_UDP_CSUM, + IFLA_VXLAN_UDP_ZERO_CSUM6_TX, + IFLA_VXLAN_UDP_ZERO_CSUM6_RX, + IFLA_VXLAN_REMCSUM_TX, + IFLA_VXLAN_REMCSUM_RX, + IFLA_VXLAN_GBP, + IFLA_VXLAN_REMCSUM_NOPARTIAL, + IFLA_VXLAN_COLLECT_METADATA, + IFLA_VXLAN_LABEL, + IFLA_VXLAN_GPE, + __IFLA_VXLAN_MAX +}; +#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) + +struct ifla_vxlan_port_range { + __be16 low; + __be16 high; +}; + +/* GENEVE section */ +enum { + IFLA_GENEVE_UNSPEC, + IFLA_GENEVE_ID, + IFLA_GENEVE_REMOTE, + IFLA_GENEVE_TTL, + IFLA_GENEVE_TOS, + IFLA_GENEVE_PORT, /* destination port */ + IFLA_GENEVE_COLLECT_METADATA, + IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + IFLA_GENEVE_LABEL, + __IFLA_GENEVE_MAX +}; +#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) + +/* PPP section */ +enum { + IFLA_PPP_UNSPEC, + IFLA_PPP_DEV_FD, + __IFLA_PPP_MAX +}; +#define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) + +/* GTP section */ + +enum ifla_gtp_role { + GTP_ROLE_GGSN = 0, + GTP_ROLE_SGSN, +}; + +enum { + IFLA_GTP_UNSPEC, + IFLA_GTP_FD0, + IFLA_GTP_FD1, + IFLA_GTP_PDP_HASHSIZE, + IFLA_GTP_ROLE, + __IFLA_GTP_MAX, +}; +#define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) + +/* Bonding section */ + +enum { + IFLA_BOND_UNSPEC, + IFLA_BOND_MODE, + IFLA_BOND_ACTIVE_SLAVE, + IFLA_BOND_MIIMON, + IFLA_BOND_UPDELAY, + IFLA_BOND_DOWNDELAY, + IFLA_BOND_USE_CARRIER, + IFLA_BOND_ARP_INTERVAL, + IFLA_BOND_ARP_IP_TARGET, + 
IFLA_BOND_ARP_VALIDATE, + IFLA_BOND_ARP_ALL_TARGETS, + IFLA_BOND_PRIMARY, + IFLA_BOND_PRIMARY_RESELECT, + IFLA_BOND_FAIL_OVER_MAC, + IFLA_BOND_XMIT_HASH_POLICY, + IFLA_BOND_RESEND_IGMP, + IFLA_BOND_NUM_PEER_NOTIF, + IFLA_BOND_ALL_SLAVES_ACTIVE, + IFLA_BOND_MIN_LINKS, + IFLA_BOND_LP_INTERVAL, + IFLA_BOND_PACKETS_PER_SLAVE, + IFLA_BOND_AD_LACP_RATE, + IFLA_BOND_AD_SELECT, + IFLA_BOND_AD_INFO, + IFLA_BOND_AD_ACTOR_SYS_PRIO, + IFLA_BOND_AD_USER_PORT_KEY, + IFLA_BOND_AD_ACTOR_SYSTEM, + IFLA_BOND_TLB_DYNAMIC_LB, + __IFLA_BOND_MAX, +}; + +#define IFLA_BOND_MAX (__IFLA_BOND_MAX - 1) + +enum { + IFLA_BOND_AD_INFO_UNSPEC, + IFLA_BOND_AD_INFO_AGGREGATOR, + IFLA_BOND_AD_INFO_NUM_PORTS, + IFLA_BOND_AD_INFO_ACTOR_KEY, + IFLA_BOND_AD_INFO_PARTNER_KEY, + IFLA_BOND_AD_INFO_PARTNER_MAC, + __IFLA_BOND_AD_INFO_MAX, +}; + +#define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) + +enum { + IFLA_BOND_SLAVE_UNSPEC, + IFLA_BOND_SLAVE_STATE, + IFLA_BOND_SLAVE_MII_STATUS, + IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, + IFLA_BOND_SLAVE_PERM_HWADDR, + IFLA_BOND_SLAVE_QUEUE_ID, + IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, + IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, + IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + __IFLA_BOND_SLAVE_MAX, +}; + +#define IFLA_BOND_SLAVE_MAX (__IFLA_BOND_SLAVE_MAX - 1) + +/* SR-IOV virtual function management section */ + +enum { + IFLA_VF_INFO_UNSPEC, + IFLA_VF_INFO, + __IFLA_VF_INFO_MAX, +}; + +#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) + +enum { + IFLA_VF_UNSPEC, + IFLA_VF_MAC, /* Hardware queue specific attributes */ + IFLA_VF_VLAN, /* VLAN ID and QoS */ + IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ + IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ + IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ + IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ + IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query + * on/off switch + */ + IFLA_VF_STATS, /* network device statistics */ + IFLA_VF_TRUST, /* Trust VF */ + 
IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ + IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ + __IFLA_VF_MAX, +}; + +#define IFLA_VF_MAX (__IFLA_VF_MAX - 1) + +struct ifla_vf_mac { + __u32 vf; + __u8 mac[32]; /* MAX_ADDR_LEN */ +}; + +struct ifla_vf_vlan { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; +}; + +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + +struct ifla_vf_tx_rate { + __u32 vf; + __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ +}; + +struct ifla_vf_rate { + __u32 vf; + __u32 min_tx_rate; /* Min Bandwidth in Mbps */ + __u32 max_tx_rate; /* Max Bandwidth in Mbps */ +}; + +struct ifla_vf_spoofchk { + __u32 vf; + __u32 setting; +}; + +struct ifla_vf_guid { + __u32 vf; + __u64 guid; +}; + +enum { + IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + IFLA_VF_LINK_STATE_DISABLE, /* link always down */ + __IFLA_VF_LINK_STATE_MAX, +}; + +struct ifla_vf_link_state { + __u32 vf; + __u32 link_state; +}; + +struct ifla_vf_rss_query_en { + __u32 vf; + __u32 setting; +}; + +enum { + IFLA_VF_STATS_RX_PACKETS, + IFLA_VF_STATS_TX_PACKETS, + IFLA_VF_STATS_RX_BYTES, + IFLA_VF_STATS_TX_BYTES, + IFLA_VF_STATS_BROADCAST, + IFLA_VF_STATS_MULTICAST, + IFLA_VF_STATS_PAD, + IFLA_VF_STATS_RX_DROPPED, + IFLA_VF_STATS_TX_DROPPED, + __IFLA_VF_STATS_MAX, +}; + +#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) + +struct ifla_vf_trust { + __u32 vf; + __u32 setting; +}; + +/* VF ports management section + * + * Nested layout of set/get msg is: + * + * 
[IFLA_NUM_VF] + * [IFLA_VF_PORTS] + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * ... + * [IFLA_PORT_SELF] + * [IFLA_PORT_*], ... + */ + +enum { + IFLA_VF_PORT_UNSPEC, + IFLA_VF_PORT, /* nest */ + __IFLA_VF_PORT_MAX, +}; + +#define IFLA_VF_PORT_MAX (__IFLA_VF_PORT_MAX - 1) + +enum { + IFLA_PORT_UNSPEC, + IFLA_PORT_VF, /* __u32 */ + IFLA_PORT_PROFILE, /* string */ + IFLA_PORT_VSI_TYPE, /* 802.1Qbg (pre-)standard VDP */ + IFLA_PORT_INSTANCE_UUID, /* binary UUID */ + IFLA_PORT_HOST_UUID, /* binary UUID */ + IFLA_PORT_REQUEST, /* __u8 */ + IFLA_PORT_RESPONSE, /* __u16, output only */ + __IFLA_PORT_MAX, +}; + +#define IFLA_PORT_MAX (__IFLA_PORT_MAX - 1) + +#define PORT_PROFILE_MAX 40 +#define PORT_UUID_MAX 16 +#define PORT_SELF_VF -1 + +enum { + PORT_REQUEST_PREASSOCIATE = 0, + PORT_REQUEST_PREASSOCIATE_RR, + PORT_REQUEST_ASSOCIATE, + PORT_REQUEST_DISASSOCIATE, +}; + +enum { + PORT_VDP_RESPONSE_SUCCESS = 0, + PORT_VDP_RESPONSE_INVALID_FORMAT, + PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_VDP_RESPONSE_UNUSED_VTID, + PORT_VDP_RESPONSE_VTID_VIOLATION, + PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION, + PORT_VDP_RESPONSE_OUT_OF_SYNC, + /* 0x08-0xFF reserved for future VDP use */ + PORT_PROFILE_RESPONSE_SUCCESS = 0x100, + PORT_PROFILE_RESPONSE_INPROGRESS, + PORT_PROFILE_RESPONSE_INVALID, + PORT_PROFILE_RESPONSE_BADSTATE, + PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_PROFILE_RESPONSE_ERROR, +}; + +struct ifla_port_vsi { + __u8 vsi_mgr_id; + __u8 vsi_type_id[3]; + __u8 vsi_type_version; + __u8 pad[3]; +}; + + +/* IPoIB section */ + +enum { + IFLA_IPOIB_UNSPEC, + IFLA_IPOIB_PKEY, + IFLA_IPOIB_MODE, + IFLA_IPOIB_UMCAST, + __IFLA_IPOIB_MAX +}; + +enum { + IPOIB_MODE_DATAGRAM = 0, /* using unreliable datagram QPs */ + IPOIB_MODE_CONNECTED = 1, /* using connected QPs */ +}; + +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) + + +/* HSR section */ + +enum { + IFLA_HSR_UNSPEC, + IFLA_HSR_SLAVE1, + IFLA_HSR_SLAVE2, + 
IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */ + IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ + IFLA_HSR_SEQ_NR, + IFLA_HSR_VERSION, /* HSR version */ + __IFLA_HSR_MAX, +}; + +#define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1) + +/* STATS section */ + +struct if_stats_msg { + __u8 family; + __u8 pad1; + __u16 pad2; + __u32 ifindex; + __u32 filter_mask; +}; + +/* A stats attribute can be netdev specific or a global stat. + * For netdev stats, lets use the prefix IFLA_STATS_LINK_* + */ +enum { + IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ + IFLA_STATS_LINK_64, + IFLA_STATS_LINK_XSTATS, + IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, + IFLA_STATS_AF_SPEC, + __IFLA_STATS_MAX, +}; + +#define IFLA_STATS_MAX (__IFLA_STATS_MAX - 1) + +#define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) + +/* These are embedded into IFLA_STATS_LINK_XSTATS: + * [IFLA_STATS_LINK_XSTATS] + * -> [LINK_XSTATS_TYPE_xxx] + * -> [rtnl link type specific attributes] + */ +enum { + LINK_XSTATS_TYPE_UNSPEC, + LINK_XSTATS_TYPE_BRIDGE, + __LINK_XSTATS_TYPE_MAX +}; +#define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) + +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + +/* XDP section */ + +#define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) +#define XDP_FLAGS_SKB_MODE (1U << 1) +#define XDP_FLAGS_DRV_MODE (1U << 2) +#define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ + XDP_FLAGS_DRV_MODE | \ + XDP_FLAGS_HW_MODE) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_MODES) + +/* These are stored into IFLA_XDP_ATTACHED on dump. 
*/ +enum { + XDP_ATTACHED_NONE = 0, + XDP_ATTACHED_DRV, + XDP_ATTACHED_SKB, + XDP_ATTACHED_HW, +}; + +enum { + IFLA_XDP_UNSPEC, + IFLA_XDP_FD, + IFLA_XDP_ATTACHED, + IFLA_XDP_FLAGS, + IFLA_XDP_PROG_ID, + __IFLA_XDP_MAX, +}; + +#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) + +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + +#endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/tools/include/uapi/linux/netlink.h b/tools/include/uapi/linux/netlink.h new file mode 100644 index 000000000000..776bc92e9118 --- /dev/null +++ b/tools/include/uapi/linux/netlink.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_NETLINK_H +#define _UAPI__LINUX_NETLINK_H + +#include +#include /* for __kernel_sa_family_t */ +#include + +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* Unused number */ +#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ +#define NETLINK_FIREWALL 3 /* Unused number, formerly ip_queue */ +#define NETLINK_SOCK_DIAG 4 /* socket monitoring */ +#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ +#define NETLINK_XFRM 6 /* ipsec */ +#define NETLINK_SELINUX 7 /* SELinux event notifications */ +#define NETLINK_ISCSI 8 /* Open-iSCSI */ +#define NETLINK_AUDIT 9 /* auditing */ +#define NETLINK_FIB_LOOKUP 10 +#define NETLINK_CONNECTOR 11 +#define NETLINK_NETFILTER 12 /* netfilter subsystem */ +#define NETLINK_IP6_FW 13 +#define NETLINK_DNRTMSG 14 /* DECnet routing messages */ +#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ +#define NETLINK_GENERIC 16 +/* leave room for NETLINK_DM (DM Events) */ +#define NETLINK_SCSITRANSPORT 18 /* SCSI 
Transports */ +#define NETLINK_ECRYPTFS 19 +#define NETLINK_RDMA 20 +#define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ + +#define NETLINK_INET_DIAG NETLINK_SOCK_DIAG + +#define MAX_LINKS 32 + +struct sockaddr_nl { + __kernel_sa_family_t nl_family; /* AF_NETLINK */ + unsigned short nl_pad; /* zero */ + __u32 nl_pid; /* port ID */ + __u32 nl_groups; /* multicast groups mask */ +}; + +struct nlmsghdr { + __u32 nlmsg_len; /* Length of message including header */ + __u16 nlmsg_type; /* Message content */ + __u16 nlmsg_flags; /* Additional flags */ + __u32 nlmsg_seq; /* Sequence number */ + __u32 nlmsg_pid; /* Sending process port ID */ +}; + +/* Flags values */ + +#define NLM_F_REQUEST 0x01 /* It is request message. */ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* Modifiers to GET request */ +#define NLM_F_ROOT 0x100 /* specify tree root */ +#define NLM_F_MATCH 0x200 /* return all matching */ +#define NLM_F_ATOMIC 0x400 /* atomic GET */ +#define NLM_F_DUMP (NLM_F_ROOT|NLM_F_MATCH) + +/* Modifiers to NEW request */ +#define NLM_F_REPLACE 0x100 /* Override existing */ +#define NLM_F_EXCL 0x200 /* Do not touch, if it exists */ +#define NLM_F_CREATE 0x400 /* Create, if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + +/* + 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL + 4.4BSD CHANGE NLM_F_REPLACE + + True CHANGE NLM_F_CREATE|NLM_F_REPLACE + Append NLM_F_CREATE + Check 
NLM_F_EXCL + */ + +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) +#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len)) +#define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0))) +#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \ + (struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len))) +#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len <= (len)) +#define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len))) + +#define NLMSG_NOOP 0x1 /* Nothing. */ +#define NLMSG_ERROR 0x2 /* Error */ +#define NLMSG_DONE 0x3 /* End of a dump */ +#define NLMSG_OVERRUN 0x4 /* Data lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +struct nlmsgerr { + int error; + struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = 
__NLMSGERR_ATTR_MAX - 1 +}; + +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 +#ifndef __KERNEL__ +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 +#endif +#define NETLINK_LISTEN_ALL_NSID 8 +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 + +struct nl_pktinfo { + __u32 group; +}; + +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +#ifndef __KERNEL__ +enum nl_mmap_status { + NL_MMAP_STATUS_UNUSED, + NL_MMAP_STATUS_RESERVED, + NL_MMAP_STATUS_VALID, + NL_MMAP_STATUS_COPY, + NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +#endif + +#define NET_MAJOR 36 /* Major 36 is reserved for networking */ + +enum { + NETLINK_UNCONNECTED = 0, + NETLINK_CONNECTED, +}; + +/* + * <------- NLA_HDRLEN ------> <-- NLA_ALIGN(payload)--> + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * | Header | Pad | Payload | Pad | + * | (struct nlattr) | ing | | ing | + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * <-------------- nlattr->nla_len --------------> + */ + +struct nlattr { + __u16 nla_len; + __u16 nla_type; +}; + +/* + * nla_type (16 bits) + * +---+---+-------------------------------+ + * | N | O | Attribute Type | + * +---+---+-------------------------------+ + * N := Carries nested attributes + * O := Payload stored in network byte order + * + * Note: The N and O flag are mutually exclusive. 
+ */ +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) + +/* Generic 32 bitflags attribute content sent to the kernel. + * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1. + * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; + +#endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 83714ca1f22b..e6d5f8d1477f 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -160,6 +160,12 @@ $(BPF_IN): force elfdep bpfdep @(test -f ../../include/uapi/linux/bpf_common.h -a -f ../../../include/uapi/linux/bpf_common.h && ( \ (diff -B ../../include/uapi/linux/bpf_common.h ../../../include/uapi/linux/bpf_common.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf_common.h' differs from latest version at 'include/uapi/linux/bpf_common.h'" >&2 )) || true + @(test -f ../../include/uapi/linux/netlink.h -a -f ../../../include/uapi/linux/netlink.h && ( \ + (diff -B ../../include/uapi/linux/netlink.h ../../../include/uapi/linux/netlink.h >/dev/null) || \ + echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/netlink.h' differs from latest version at 'include/uapi/linux/netlink.h'" >&2 )) || true + @(test -f ../../include/uapi/linux/if_link.h -a -f ../../../include/uapi/linux/if_link.h && ( \ + (diff -B ../../include/uapi/linux/if_link.h ../../../include/uapi/linux/if_link.h >/dev/null) || \ + echo "Warning: Kernel ABI header at 
'tools/include/uapi/linux/if_link.h' differs from latest version at 'include/uapi/linux/if_link.h'" >&2 )) || true $(Q)$(MAKE) $(build)=libbpf $(OUTPUT)libbpf.so: $(BPF_IN) -- cgit From 949abbe88436c000cc63fce2bdfeb48b7d06a7df Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Tue, 30 Jan 2018 21:55:01 +0100 Subject: libbpf: add function to setup XDP Most of the code is taken from set_link_xdp_fd() in bpf_load.c and slightly modified to be library compliant. Signed-off-by: Eric Leblond Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.c | 2 + tools/lib/bpf/libbpf.h | 4 ++ 3 files changed, 128 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 5128677e4117..bf2772566240 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -25,6 +25,12 @@ #include #include #include "bpf.h" +#include "libbpf.h" +#include "nlattr.h" +#include +#include +#include +#include /* * When building perf, unistd.h is overridden. __NR_bpf is @@ -46,7 +52,9 @@ # endif #endif +#ifndef min #define min(x, y) ((x) < (y) ? 
(x) : (y)) +#endif static inline __u64 ptr_to_u64(const void *ptr) { @@ -413,3 +421,117 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) return err; } + +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) +{ + struct sockaddr_nl sa; + int sock, seq = 0, len, ret = -1; + char buf[4096]; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + struct nlmsghdr *nh; + struct nlmsgerr *err; + socklen_t addrlen; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + return -errno; + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = -LIBBPF_ERRNO__INTERNAL; + goto cleanup; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + + /* started nested attribute for XDP */ + nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | IFLA_XDP; + nla->nla_len = NLA_HDRLEN; + + /* add XDP fd */ + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FD; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); + nla->nla_len += nla_xdp->nla_len; + + /* if user passed in any flags, add those too */ + if (flags) { + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FLAGS; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); + memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); 
+ nla->nla_len += nla_xdp->nla_len; + } + + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto cleanup; + } + + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + ret = -errno; + goto cleanup; + } + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != sa.nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto cleanup; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto cleanup; + } + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + goto cleanup; + case NLMSG_DONE: + break; + default: + break; + } + } + + ret = 0; + +cleanup: + close(sock); + return ret; +} diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 30c776375118..c60122d3ea85 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -106,6 +106,8 @@ static const char *libbpf_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(PROG2BIG)] = "Program too big", [ERRCODE_OFFSET(KVER)] = "Incorrect kernel version", [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", + [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", + [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", }; int libbpf_strerror(int err, char *buf, size_t size) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6e20003109e0..e42f96900318 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -42,6 +42,8 @@ enum libbpf_errno { LIBBPF_ERRNO__PROG2BIG, /* Program too big */ LIBBPF_ERRNO__KVER, /* Incorrect kernel version */ LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */ + LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */ + LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */ __LIBBPF_ERRNO__END, }; @@ -246,4 +248,6 @@ long libbpf_get_error(const void *ptr); int bpf_prog_load(const char *file, enum 
bpf_prog_type type, struct bpf_object **pobj, int *prog_fd); + +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); #endif -- cgit From bbf48c18ee0cd18b53712aa09aefa29b64b3976e Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Tue, 30 Jan 2018 21:55:02 +0100 Subject: libbpf: add error reporting in XDP Parse netlink ext attribute to get the error message returned by the card. Code is partially taken from libnl. We add netlink.h to the uapi include of tools. And we need to avoid including the userspace netlink header to have a successful build of the samples, so nlattr.h has a define to avoid the inclusion. Using a direct define could have been an issue as NLMSGERR_ATTR_MAX can change in the future. We also define SOL_NETLINK if not defined to avoid having to copy socket.h for a fixed value. Signed-off-by: Eric Leblond Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- samples/bpf/Makefile | 2 +- tools/lib/bpf/Build | 2 +- tools/lib/bpf/bpf.c | 11 +++ tools/lib/bpf/nlattr.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/nlattr.h | 72 +++++++++++++++++++ 5 files changed, 272 insertions(+), 2 deletions(-) create mode 100644 tools/lib/bpf/nlattr.c create mode 100644 tools/lib/bpf/nlattr.h diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 64335bb94f9f..ec3fc8d88e87 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -45,7 +45,7 @@ hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp # Libbpf dependencies -LIBBPF := ../../tools/lib/bpf/bpf.o +LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o test_lru_dist-objs := test_lru_dist.o $(LIBBPF) diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index d8749756352d..64c679d67109 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o +libbpf-y := libbpf.o bpf.o nlattr.o diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index
bf2772566240..9c88f6e4156d 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -32,6 +32,10 @@ #include #include +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + /* * When building perf, unistd.h is overridden. __NR_bpf is * required to be defined explicitly. @@ -436,6 +440,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) struct nlmsghdr *nh; struct nlmsgerr *err; socklen_t addrlen; + int one = 1; memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; @@ -445,6 +450,11 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) return -errno; } + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + fprintf(stderr, "Netlink error reporting not supported\n"); + } + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { ret = -errno; goto cleanup; @@ -521,6 +531,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) if (!err->error) continue; ret = err->error; + nla_dump_errormsg(nh); goto cleanup; case NLMSG_DONE: break; diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c new file mode 100644 index 000000000000..4719434278b2 --- /dev/null +++ b/tools/lib/bpf/nlattr.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: LGPL-2.1 + +/* + * NETLINK Netlink attributes + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation version 2.1 + * of the License. 
+ * + * Copyright (c) 2003-2013 Thomas Graf + */ + +#include +#include "nlattr.h" +#include +#include +#include + +static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = { + [NLA_U8] = sizeof(uint8_t), + [NLA_U16] = sizeof(uint16_t), + [NLA_U32] = sizeof(uint32_t), + [NLA_U64] = sizeof(uint64_t), + [NLA_STRING] = 1, + [NLA_FLAG] = 0, +}; + +static int nla_len(const struct nlattr *nla) +{ + return nla->nla_len - NLA_HDRLEN; +} + +static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) +{ + int totlen = NLA_ALIGN(nla->nla_len); + + *remaining -= totlen; + return (struct nlattr *) ((char *) nla + totlen); +} + +static int nla_ok(const struct nlattr *nla, int remaining) +{ + return remaining >= sizeof(*nla) && + nla->nla_len >= sizeof(*nla) && + nla->nla_len <= remaining; +} + +static void *nla_data(const struct nlattr *nla) +{ + return (char *) nla + NLA_HDRLEN; +} + +static int nla_type(const struct nlattr *nla) +{ + return nla->nla_type & NLA_TYPE_MASK; +} + +static int validate_nla(struct nlattr *nla, int maxtype, + struct nla_policy *policy) +{ + struct nla_policy *pt; + unsigned int minlen = 0; + int type = nla_type(nla); + + if (type < 0 || type > maxtype) + return 0; + + pt = &policy[type]; + + if (pt->type > NLA_TYPE_MAX) + return 0; + + if (pt->minlen) + minlen = pt->minlen; + else if (pt->type != NLA_UNSPEC) + minlen = nla_attr_minlen[pt->type]; + + if (nla_len(nla) < minlen) + return -1; + + if (pt->maxlen && nla_len(nla) > pt->maxlen) + return -1; + + if (pt->type == NLA_STRING) { + char *data = nla_data(nla); + if (data[nla_len(nla) - 1] != '\0') + return -1; + } + + return 0; +} + +static inline int nlmsg_len(const struct nlmsghdr *nlh) +{ + return nlh->nlmsg_len - NLMSG_HDRLEN; +} + +/** + * Create attribute index based on a stream of attributes. + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg head Head of attribute stream. 
+ * @arg len Length of attribute stream. + * @arg policy Attribute validation policy. + * + * Iterates over the stream of attributes and stores a pointer to each + * attribute in the index array using the attribute type as index to + * the array. Attribute with a type greater than the maximum type + * specified will be silently ignored in order to maintain backwards + * compatibility. If \a policy is not NULL, the attribute will be + * validated using the specified policy. + * + * @see nla_validate + * @return 0 on success or a negative error code. + */ +static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + + nla_for_each_attr(nla, head, len, rem) { + int type = nla_type(nla); + + if (type > maxtype) + continue; + + if (policy) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + if (tb[type]) + fprintf(stderr, "Attribute of type %#x found multiple times in message, " + "previous attribute is being ignored.\n", type); + + tb[type] = nla; + } + + err = 0; +errout: + return err; +} + +/* dump netlink extended ack error message */ +int nla_dump_errormsg(struct nlmsghdr *nlh) +{ + struct nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = { + [NLMSGERR_ATTR_MSG] = { .type = NLA_STRING }, + [NLMSGERR_ATTR_OFFS] = { .type = NLA_U32 }, + }; + struct nlattr *tb[NLMSGERR_ATTR_MAX + 1], *attr; + struct nlmsgerr *err; + char *errmsg = NULL; + int hlen, alen; + + /* no TLVs, nothing to do here */ + if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS)) + return 0; + + err = (struct nlmsgerr *)NLMSG_DATA(nlh); + hlen = sizeof(*err); + + /* if NLM_F_CAPPED is set then the inner err msg was capped */ + if (!(nlh->nlmsg_flags & NLM_F_CAPPED)) + hlen += nlmsg_len(&err->msg); + + attr = (struct nlattr *) ((void *) err + hlen); + alen = nlh->nlmsg_len - hlen; + + if (nla_parse(tb, NLMSGERR_ATTR_MAX, attr, 
alen, extack_policy) != 0) { + fprintf(stderr, + "Failed to parse extended error attributes\n"); + return 0; + } + + if (tb[NLMSGERR_ATTR_MSG]) + errmsg = (char *) nla_data(tb[NLMSGERR_ATTR_MSG]); + + fprintf(stderr, "Kernel error message: %s\n", errmsg); + + return 0; +} diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h new file mode 100644 index 000000000000..931a71f68f93 --- /dev/null +++ b/tools/lib/bpf/nlattr.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ + +/* + * NETLINK Netlink attributes + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation version 2.1 + * of the License. + * + * Copyright (c) 2003-2013 Thomas Graf + */ + +#ifndef __NLATTR_H +#define __NLATTR_H + +#include +#include +/* avoid multiple definition of netlink features */ +#define __LINUX_NETLINK_H + +/** + * Standard attribute types to specify validation policy + */ +enum { + NLA_UNSPEC, /**< Unspecified type, binary data chunk */ + NLA_U8, /**< 8 bit integer */ + NLA_U16, /**< 16 bit integer */ + NLA_U32, /**< 32 bit integer */ + NLA_U64, /**< 64 bit integer */ + NLA_STRING, /**< NUL terminated character string */ + NLA_FLAG, /**< Flag */ + NLA_MSECS, /**< Micro seconds (64bit) */ + NLA_NESTED, /**< Nested attributes */ + __NLA_TYPE_MAX, +}; + +#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) + +/** + * @ingroup attr + * Attribute validation policy. + * + * See section @core_doc{core_attr_parse,Attribute Parsing} for more details. 
+ */ +struct nla_policy { + /** Type of attribute or NLA_UNSPEC */ + uint16_t type; + + /** Minimal length of payload required */ + uint16_t minlen; + + /** Maximal length of payload allowed */ + uint16_t maxlen; +}; + +/** + * @ingroup attr + * Iterate over a stream of attributes + * @arg pos loop counter, set to current attribute + * @arg head head of attribute stream + * @arg len length of attribute stream + * @arg rem initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_attr(pos, head, len, rem) \ + for (pos = head, rem = len; \ + nla_ok(pos, rem); \ + pos = nla_next(pos, &(rem))) + +int nla_dump_errormsg(struct nlmsghdr *nlh); + +#endif /* __NLATTR_H */ -- cgit From 6061a3d6720600c976b877c3ac1402b3ef0a8a55 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Tue, 30 Jan 2018 21:55:03 +0100 Subject: libbpf: add missing SPDX-License-Identifier Signed-off-by: Eric Leblond Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 2 ++ tools/lib/bpf/bpf.h | 2 ++ tools/lib/bpf/libbpf.c | 2 ++ tools/lib/bpf/libbpf.h | 2 ++ 4 files changed, 8 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 9c88f6e4156d..592a58a2b681 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: LGPL-2.1 + /* * common eBPF ELF operations. * diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 9f44c196931e..8d18fb73d7fb 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ + /* * common eBPF ELF operations. * diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c60122d3ea85..71ddc481f349 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: LGPL-2.1 + /* * Common eBPF ELF object loading operations. 
* diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e42f96900318..f85906533cdd 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1,3 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ + /* * Common eBPF ELF object loading operations. * -- cgit From b259c2ffd9b4812aa42a1d502eabd8c62a32d063 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Tue, 30 Jan 2018 21:55:04 +0100 Subject: samples/bpf: use bpf_set_link_xdp_fd Use bpf_set_link_xdp_fd instead of set_link_xdp_fd to remove some code duplication and benefit from netlink ext ack error messages. Signed-off-by: Eric Leblond Signed-off-by: Alexei Starovoitov --- samples/bpf/bpf_load.c | 102 ------------------------------------ samples/bpf/bpf_load.h | 2 +- samples/bpf/xdp1_user.c | 4 +- samples/bpf/xdp_redirect_cpu_user.c | 6 +-- samples/bpf/xdp_redirect_map_user.c | 8 +-- samples/bpf/xdp_redirect_user.c | 8 +-- samples/bpf/xdp_router_ipv4_user.c | 10 ++-- samples/bpf/xdp_rxq_info_user.c | 4 +- samples/bpf/xdp_tx_iptunnel_user.c | 6 +-- 9 files changed, 24 insertions(+), 126 deletions(-) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 242631aa4ea2..69806d74fa53 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -695,105 +695,3 @@ struct ksym *ksym_search(long key) return &syms[0]; } -int set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - struct sockaddr_nl sa; - int sock, seq = 0, len, ret = -1; - char buf[4096]; - struct nlattr *nla, *nla_xdp; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; - struct nlmsghdr *nh; - struct nlmsgerr *err; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; - } - - if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - memset(&req, 0, sizeof(req)); -
req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - req.nh.nlmsg_type = RTM_SETLINK; - req.nh.nlmsg_pid = 0; - req.nh.nlmsg_seq = ++seq; - req.ifinfo.ifi_family = AF_UNSPEC; - req.ifinfo.ifi_index = ifindex; - - /* started nested attribute for XDP */ - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; - nla->nla_len = NLA_HDRLEN; - - /* add XDP fd */ - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); - memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); - nla->nla_len += nla_xdp->nla_len; - - /* if user passed in any flags, add those too */ - if (flags) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); - memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); - nla->nla_len += nla_xdp->nla_len; - } - - req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); - - if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { - printf("send to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - len = recv(sock, buf, sizeof(buf), 0); - if (len < 0) { - printf("recv from netlink: %s\n", strerror(errno)); - goto cleanup; - } - - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); - nh = NLMSG_NEXT(nh, len)) { - if (nh->nlmsg_pid != getpid()) { - printf("Wrong pid %d, expected %d\n", - nh->nlmsg_pid, getpid()); - goto cleanup; - } - if (nh->nlmsg_seq != seq) { - printf("Wrong seq %d, expected %d\n", - nh->nlmsg_seq, seq); - goto cleanup; - } - switch (nh->nlmsg_type) { - case NLMSG_ERROR: - err = (struct nlmsgerr *)NLMSG_DATA(nh); - if (!err->error) - continue; - printf("nlmsg error %s\n", strerror(-err->error)); - goto cleanup; - case NLMSG_DONE: - break; - } - } - - ret = 0; - -cleanup: - close(sock); - return ret; -} diff --git a/samples/bpf/bpf_load.h 
b/samples/bpf/bpf_load.h index 7d57a4248893..453c200b389b 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -61,5 +61,5 @@ struct ksym { int load_kallsyms(void); struct ksym *ksym_search(long key); -int set_link_xdp_fd(int ifindex, int fd, __u32 flags); +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); #endif diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index fdaefe91801d..b901ee2b3336 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -25,7 +25,7 @@ static __u32 xdp_flags; static void int_exit(int sig) { - set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); exit(0); } @@ -116,7 +116,7 @@ int main(int argc, char **argv) signal(SIGINT, int_exit); signal(SIGTERM, int_exit); - if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { printf("link set xdp fd failed\n"); return 1; } diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 35fec9fecb57..23744a8aaf21 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -26,7 +26,7 @@ static const char *__doc__ = /* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead * use bpf/libbpf.h), but cannot as (currently) needed for XDP - * attaching to a device via set_link_xdp_fd() + * attaching to a device via bpf_set_link_xdp_fd() */ #include "libbpf.h" #include "bpf_load.h" @@ -67,7 +67,7 @@ static void int_exit(int sig) "Interrupted: Removing XDP program on ifindex:%d device:%s\n", ifindex, ifname); if (ifindex > -1) - set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); exit(EXIT_OK); } @@ -682,7 +682,7 @@ int main(int argc, char **argv) /* Remove XDP program when program is interrupted */ signal(SIGINT, int_exit); - if (set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex, prog_fd[prog_num], 
xdp_flags) < 0) { fprintf(stderr, "link set xdp fd failed\n"); return EXIT_FAIL_XDP; } diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index 978a532f0748..7eae07d7293e 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -34,9 +34,9 @@ static __u32 xdp_flags; static void int_exit(int sig) { - set_link_xdp_fd(ifindex_in, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); if (ifindex_out_xdp_dummy_attached) - set_link_xdp_fd(ifindex_out, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); exit(0); } @@ -120,13 +120,13 @@ int main(int argc, char **argv) return 1; } - if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); return 1; } /* Loading dummy XDP prog on out-device */ - if (set_link_xdp_fd(ifindex_out, prog_fd[1], + if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1], (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { printf("WARN: link set xdp fd failed on %d\n", ifindex_out); ifindex_out_xdp_dummy_attached = false; diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 4475d837bf2c..d54e91eb6cbf 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -33,9 +33,9 @@ static __u32 xdp_flags; static void int_exit(int sig) { - set_link_xdp_fd(ifindex_in, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); if (ifindex_out_xdp_dummy_attached) - set_link_xdp_fd(ifindex_out, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); exit(0); } @@ -114,13 +114,13 @@ int main(int argc, char **argv) return 1; } - if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); return 1; } /* Loading dummy XDP prog on out-device */ 
- if (set_link_xdp_fd(ifindex_out, prog_fd[1], + if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1], (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { printf("WARN: link set xdp fd failed on %d\n", ifindex_out); ifindex_out_xdp_dummy_attached = false; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 916462112d55..6296741c1fbd 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -37,7 +37,7 @@ static void int_exit(int sig) int i = 0; for (i = 0; i < total_ifindex; i++) - set_link_xdp_fd(ifindex_list[i], -1, flags); + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); exit(0); } @@ -49,7 +49,7 @@ static void close_and_exit(int sig) close(sock_arp); for (i = 0; i < total_ifindex; i++) - set_link_xdp_fd(ifindex_list[i], -1, flags); + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); exit(0); } @@ -183,7 +183,7 @@ static void read_route(struct nlmsghdr *nh, int nll) int i = 0; for (i = 0; i < total_ifindex; i++) - set_link_xdp_fd(ifindex_list[i], -1, flags); + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); exit(0); } assert(bpf_map_update_elem(map_fd[4], &route.iface, &route.iface, 0) == 0); @@ -633,12 +633,12 @@ int main(int ac, char **argv) } } for (i = 0; i < total_ifindex; i++) { - if (set_link_xdp_fd(ifindex_list[i], prog_fd[0], flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd[0], flags) < 0) { printf("link set xdp fd failed\n"); int recovery_index = i; for (i = 0; i < recovery_index; i++) - set_link_xdp_fd(ifindex_list[i], -1, flags); + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); return 1; } diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index 32430e8b3a6a..478d95412de4 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -56,7 +56,7 @@ static void int_exit(int sig) "Interrupted: Removing XDP program on ifindex:%d device:%s\n", ifindex, ifname); if (ifindex > -1) - set_link_xdp_fd(ifindex, -1, 
xdp_flags); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); exit(EXIT_OK); } @@ -521,7 +521,7 @@ int main(int argc, char **argv) /* Remove XDP program when program is interrupted */ signal(SIGINT, int_exit); - if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { fprintf(stderr, "link set xdp fd failed\n"); return EXIT_FAIL_XDP; } diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index 715cd12eaca5..f0a787268a87 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -30,7 +30,7 @@ static __u32 xdp_flags = 0; static void int_exit(int sig) { if (ifindex > -1) - set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); exit(0); } @@ -254,14 +254,14 @@ int main(int argc, char **argv) } } - if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { printf("link set xdp fd failed\n"); return 1; } poll_stats(kill_after_s); - set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); return 0; } -- cgit From a61a86f8db92923a2a4c857c49a795bcae754497 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sun, 4 Feb 2018 18:07:10 -0800 Subject: doc: Change the min default value of tcp_wmem/tcp_rmem. The SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096. And the tcp_wmem/tcp_rmem min default values are 4096. Fixes: bd68a2a854ad ("net: set SK_MEM_QUANTUM to 4096") Cc: Eric Dumazet Signed-off-by: Tonghao Zhang Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 3f2c40d8e6aa..a553d4e4a0fb 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -508,7 +508,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max min: Minimal size of receive buffer used by TCP sockets. It is guaranteed to each TCP socket, even under moderate memory pressure. - Default: 1 page + Default: 4K default: initial size of receive buffer used by TCP sockets. This value overrides net.core.rmem_default used by other protocols. @@ -667,7 +667,7 @@ tcp_window_scaling - BOOLEAN tcp_wmem - vector of 3 INTEGERs: min, default, max min: Amount of memory reserved for send buffers for TCP sockets. Each TCP socket has rights to use it due to fact of its birth. - Default: 1 page + Default: 4K default: initial size of send buffer used by TCP sockets. This value overrides net.core.wmem_default used by other protocols. -- cgit From 23ddd2612d0e9b8c61d3c70960f1b1e8b64a1c4b Mon Sep 17 00:00:00 2001 From: Jie Deng Date: Mon, 5 Feb 2018 11:31:27 +0800 Subject: dwc-xlgmac: remove Jie Deng as co-maintainer Jose Abreu is working on this driver and I will leave Synopsys soon. Thus it does not seem appropriate for me to be a co-maintainer anymore. Signed-off-by: Jie Deng Signed-off-by: David S. 
Miller --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b59a8cdfbe66..8903842a213f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13289,7 +13289,6 @@ F: include/linux/platform_data/dma-dw.h F: drivers/dma/dw/ SYNOPSYS DESIGNWARE ENTERPRISE ETHERNET DRIVER -M: Jie Deng M: Jose Abreu L: netdev@vger.kernel.org S: Supported -- cgit From 957d761cf91cdbb175ad7d8f5472336a4d54dbf2 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Mon, 5 Feb 2018 15:10:35 +0300 Subject: sctp: fix dst refcnt leak in sctp_v6_get_dst() When going through the bind address list in sctp_v6_get_dst() and the previously found address is better ('matchlen > bmatchlen'), the code continues to the next iteration without releasing currently held destination. Fix it by releasing 'bdst' before continue to the next iteration, and instead of introducing one more '!IS_ERR(bdst)' check for dst_release(), move the already existed one right after ip6_dst_lookup_flow(), i.e. we shouldn't proceed further if we get an error for the route lookup. Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6") Signed-off-by: Alexey Kodanev Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/ipv6.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 5d4c15bf66d2..e35d4f73d2df 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); bdst = ip6_dst_lookup_flow(sk, fl6, final_p); - if (!IS_ERR(bdst) && - ipv6_chk_addr(dev_net(bdst->dev), + if (IS_ERR(bdst)) + continue; + + if (ipv6_chk_addr(dev_net(bdst->dev), &laddr->a.v6.sin6_addr, bdst->dev, 1)) { if (!IS_ERR_OR_NULL(dst)) dst_release(dst); @@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, } bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); - if (matchlen > bmatchlen) + if (matchlen > bmatchlen) { + dst_release(bdst); continue; + } if (!IS_ERR_OR_NULL(dst)) dst_release(dst); -- cgit From 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8 Mon Sep 17 00:00:00 2001 From: Tommi Rantala Date: Mon, 5 Feb 2018 21:48:14 +0200 Subject: sctp: fix dst refcnt leak in sctp_v4_get_dst Fix dst reference count leak in sctp_v4_get_dst() introduced in commit 410f03831 ("sctp: add routing output fallback"): When walking the address_list, successive ip_route_output_key() calls may return the same rt->dst with the reference incremented on each call. The code would not decrement the dst refcount when the dst pointer was identical from the previous iteration, causing the dst refcnt leak. 
Testcase: ip netns add TEST ip netns exec TEST ip link set lo up ip link add dummy0 type dummy ip link add dummy1 type dummy ip link add dummy2 type dummy ip link set dev dummy0 netns TEST ip link set dev dummy1 netns TEST ip link set dev dummy2 netns TEST ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0 ip netns exec TEST ip link set dummy0 up ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1 ip netns exec TEST ip link set dummy1 up ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2 ip netns exec TEST ip link set dummy2 up ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3 ip netns del TEST In 4.4 and 4.9 kernels this results in: [ 354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1 [ 364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1 [ 374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1 [ 384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1 [ 395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1 [ 405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1 ... Fixes: 410f03831 ("sctp: add routing output fallback") Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses") Signed-off-by: Tommi Rantala Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/protocol.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6a38c2503649..91813e686c67 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (IS_ERR(rt)) continue; - if (!dst) - dst = &rt->dst; - /* Ensure the src address belongs to the output * interface.
*/ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); if (!odev || odev->ifindex != fl4->flowi4_oif) { - if (&rt->dst != dst) + if (!dst) + dst = &rt->dst; + else dst_release(&rt->dst); continue; } - if (dst != &rt->dst) - dst_release(dst); + dst_release(dst); dst = &rt->dst; break; } -- cgit From 21a2545bbea02ce39e91d47c9e3ef0ccd0fb0b14 Mon Sep 17 00:00:00 2001 From: Desnes Augusto Nunes do Rosario Date: Mon, 5 Feb 2018 14:33:55 -0200 Subject: ibmvnic: fix empty firmware version and errors cleanup This patch makes sure that the firmware version is never NULL. Moreover, it also performs some cleanup on the error messages. Fixes: a107311d7fdf ("ibmvnic: fix firmware version when no firmware level has been provided by the VIOS server") Signed-off-by: Desnes A. Nunes do Rosario Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5caaa9033841..afaf29b201dc 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -3286,7 +3286,7 @@ static void handle_vpd_rsp(union ibmvnic_crq *crq, struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; - unsigned char *substr = NULL, *ptr = NULL; + unsigned char *substr = NULL; u8 fw_level_len = 0; memset(adapter->fw_version, 0, 32); @@ -3306,10 +3306,6 @@ static void handle_vpd_rsp(union ibmvnic_crq *crq, substr = strnstr(adapter->vpd->buff, "RM", adapter->vpd->len); if (!substr) { dev_info(dev, "Warning - No FW level has been provided in the VPD buffer by the VIOS Server\n"); - ptr = strncpy((char *)adapter->fw_version, "N/A", - 3 * sizeof(char)); - if (!ptr) - dev_err(dev, "Failed to inform that firmware version is unavailable to the adapter\n"); goto complete; } @@ -3324,16 +3320,14 @@ static void handle_vpd_rsp(union ibmvnic_crq *crq, /* copy firmware version string from vpd into 
adapter */ if ((substr + 3 + fw_level_len) < (adapter->vpd->buff + adapter->vpd->len)) { - ptr = strncpy((char *)adapter->fw_version, - substr + 3, fw_level_len); - - if (!ptr) - dev_err(dev, "Failed to isolate FW level string\n"); + strncpy((char *)adapter->fw_version, substr + 3, fw_level_len); } else { dev_info(dev, "FW substr extrapolated VPD buff\n"); } complete: + if (adapter->fw_version[0] == '\0') + strncpy((char *)adapter->fw_version, "N/A", 3 * sizeof(char)); complete(&adapter->fw_done); } -- cgit From a56c69803f5a2c1cab0228cf1aebf76821ace965 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 5 Feb 2018 19:17:23 +0100 Subject: net: phy: Handle not having GPIO enabled in the kernel If CONFIG_GPIOLIB is disabled, fwnode_get_named_gpiod() becomes a stub function, which return -ENOSYS. Handle this in the same way as -ENOENT, i.e. assume there is no GPIO used to reset the PHYs. Reported-by: Christian Zigotzky Tested-by: Christian Zigotzky Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Fixes: bafbdd527d56 ("phylib: Add device reset GPIO support") Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio_bus.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 88272b3ac2e2..24b5511222c8 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -56,7 +56,8 @@ static int mdiobus_register_gpiod(struct mdio_device *mdiodev) gpiod = fwnode_get_named_gpiod(&mdiodev->dev.of_node->fwnode, "reset-gpios", 0, GPIOD_OUT_LOW, "PHY reset"); - if (PTR_ERR(gpiod) == -ENOENT) + if (PTR_ERR(gpiod) == -ENOENT || + PTR_ERR(gpiod) == -ENOSYS) gpiod = NULL; else if (IS_ERR(gpiod)) return PTR_ERR(gpiod); -- cgit From a3276892db7a588bedc33168e502572008f714a9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 5 Feb 2018 21:10:01 +0100 Subject: net: amd-xgbe: fix comparison to bitshift when dealing with a mask Due to a typo, the mask was destroyed by a comparison instead of a bit shift. Signed-off-by: Wolfram Sang Acked-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 7a3ebfd236f5..100adee778df 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -595,7 +595,7 @@ isr_done: reissue_mask = 1 << 0; if (!pdata->per_channel_irq) - reissue_mask |= 0xffff < 4; + reissue_mask |= 0xffff << 4; XP_IOWRITE(pdata, XP_INT_REISSUE_EN, reissue_mask); } -- cgit From d7cdee5ea8d28ae1b6922deb0c1badaa3aa0ef8c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 5 Feb 2018 22:23:01 +0100 Subject: cls_u32: fix use after free in u32_destroy_key() Li Shuang reported an Oops with cls_u32 due to a use-after-free in u32_destroy_key().
The use-after-free can be triggered with: dev=lo tc qdisc add dev $dev root handle 1: htb default 10 tc filter add dev $dev parent 1: prio 5 handle 1: protocol ip u32 divisor 256 tc filter add dev $dev protocol ip parent 1: prio 5 u32 ht 800:: match ip dst\ 10.0.0.0/8 hashkey mask 0x0000ff00 at 16 link 1: tc qdisc del dev $dev root Which causes the following kasan splat: ================================================================== BUG: KASAN: use-after-free in u32_destroy_key.constprop.21+0x117/0x140 [cls_u32] Read of size 4 at addr ffff881b83dae618 by task kworker/u48:5/571 CPU: 17 PID: 571 Comm: kworker/u48:5 Not tainted 4.15.0+ #87 Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.1.7 06/16/2016 Workqueue: tc_filter_workqueue u32_delete_key_freepf_work [cls_u32] Call Trace: dump_stack+0xd6/0x182 ? dma_virt_map_sg+0x22e/0x22e print_address_description+0x73/0x290 kasan_report+0x277/0x360 ? u32_destroy_key.constprop.21+0x117/0x140 [cls_u32] u32_destroy_key.constprop.21+0x117/0x140 [cls_u32] u32_delete_key_freepf_work+0x1c/0x30 [cls_u32] process_one_work+0xae0/0x1c80 ? sched_clock+0x5/0x10 ? pwq_dec_nr_in_flight+0x3c0/0x3c0 ? _raw_spin_unlock_irq+0x29/0x40 ? trace_hardirqs_on_caller+0x381/0x570 ? _raw_spin_unlock_irq+0x29/0x40 ? finish_task_switch+0x1e5/0x760 ? finish_task_switch+0x208/0x760 ? preempt_notifier_dec+0x20/0x20 ? __schedule+0x839/0x1ee0 ? check_noncircular+0x20/0x20 ? firmware_map_remove+0x73/0x73 ? find_held_lock+0x39/0x1c0 ? worker_thread+0x434/0x1820 ? lock_contended+0xee0/0xee0 ? lock_release+0x1100/0x1100 ? init_rescuer.part.16+0x150/0x150 ? retint_kernel+0x10/0x10 worker_thread+0x216/0x1820 ? process_one_work+0x1c80/0x1c80 ? lock_acquire+0x1a5/0x540 ? lock_downgrade+0x6b0/0x6b0 ? sched_clock+0x5/0x10 ? lock_release+0x1100/0x1100 ? compat_start_thread+0x80/0x80 ? do_raw_spin_trylock+0x190/0x190 ? _raw_spin_unlock_irq+0x29/0x40 ? trace_hardirqs_on_caller+0x381/0x570 ? _raw_spin_unlock_irq+0x29/0x40 ? finish_task_switch+0x1e5/0x760 ? 
finish_task_switch+0x208/0x760 ? preempt_notifier_dec+0x20/0x20 ? __schedule+0x839/0x1ee0 ? kmem_cache_alloc_trace+0x143/0x320 ? firmware_map_remove+0x73/0x73 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0x18/0x170 ? find_held_lock+0x39/0x1c0 ? schedule+0xf3/0x3b0 ? lock_downgrade+0x6b0/0x6b0 ? __schedule+0x1ee0/0x1ee0 ? do_wait_intr_irq+0x340/0x340 ? do_raw_spin_trylock+0x190/0x190 ? _raw_spin_unlock_irqrestore+0x32/0x60 ? process_one_work+0x1c80/0x1c80 ? process_one_work+0x1c80/0x1c80 kthread+0x312/0x3d0 ? kthread_create_worker_on_cpu+0xc0/0xc0 ret_from_fork+0x3a/0x50 Allocated by task 1688: kasan_kmalloc+0xa0/0xd0 __kmalloc+0x162/0x380 u32_change+0x1220/0x3c9e [cls_u32] tc_ctl_tfilter+0x1ba6/0x2f80 rtnetlink_rcv_msg+0x4f0/0x9d0 netlink_rcv_skb+0x124/0x320 netlink_unicast+0x430/0x600 netlink_sendmsg+0x8fa/0xd60 sock_sendmsg+0xb1/0xe0 ___sys_sendmsg+0x678/0x980 __sys_sendmsg+0xc4/0x210 do_syscall_64+0x232/0x7f0 return_from_SYSCALL_64+0x0/0x75 Freed by task 112: kasan_slab_free+0x71/0xc0 kfree+0x114/0x320 rcu_process_callbacks+0xc3f/0x1600 __do_softirq+0x2bf/0xc06 The buggy address belongs to the object at ffff881b83dae600 which belongs to the cache kmalloc-4096 of size 4096 The buggy address is located 24 bytes inside of 4096-byte region [ffff881b83dae600, ffff881b83daf600) The buggy address belongs to the page: page:ffffea006e0f6a00 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 flags: 0x17ffffc0008100(slab|head) raw: 0017ffffc0008100 0000000000000000 0000000000000000 0000000100070007 raw: dead000000000100 dead000000000200 ffff880187c0e600 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff881b83dae500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff881b83dae580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff881b83dae600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff881b83dae680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff881b83dae700: fb fb fb fb fb fb fb fb fb 
fb fb fb fb fb fb fb ================================================================== The problem is that the htnode is freed before the linked knodes and the latter will try to access the first at u32_destroy_key() time. This change addresses the issue using the htnode refcnt to guarantee the correct free order. While at it also add a RCU annotation, to keep sparse happy. v1 -> v2: use rtnl_dereference() instead of RCU read locks v2 -> v3: - don't check refcnt in u32_destroy_hnode() - cleaned-up u32_destroy() implementation - cleaned-up code comment v3 -> v4: - dropped unneeded comment Reported-by: Li Shuang Fixes: c0d378ef1266 ("net_sched: use tcf_queue_work() in u32 filter") Signed-off-by: Paolo Abeni Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index e3c5e390ec23..6311a548046b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -398,10 +398,12 @@ static int u32_init(struct tcf_proto *tp) static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, bool free_pf) { + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); + tcf_exts_destroy(&n->exts); tcf_exts_put_net(&n->exts); - if (n->ht_down) - n->ht_down->refcnt--; + if (ht && --ht->refcnt == 0) + kfree(ht); #ifdef CONFIG_CLS_U32_PERF if (free_pf) free_percpu(n->pf); @@ -659,16 +661,15 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) hlist_del(&tp_c->hnode); - for (ht = rtnl_dereference(tp_c->hlist); - ht; - ht = rtnl_dereference(ht->next)) { - ht->refcnt--; - u32_clear_hnode(tp, ht, extack); - } - while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { + u32_clear_hnode(tp, ht, extack); RCU_INIT_POINTER(tp_c->hlist, ht->next); - kfree_rcu(ht, rcu); + + /* u32_destroy_key() will later free ht for us, if it's + * still referenced by some knode + */ + if (--ht->refcnt == 0) + kfree_rcu(ht, rcu); }
idr_destroy(&tp_c->handle_idr); -- cgit From 3df1928302950dfa728ab2eade28eea0da291567 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 5 Feb 2018 13:35:34 -0800 Subject: net: erspan: fix metadata extraction Commit d350a823020e ("net: erspan: create erspan metadata uapi header") moves the erspan 'version' in front of the 'struct erspan_md2' for later extensibility reason. This breaks the existing erspan metadata extraction code because the erspan_md2 then has a 4-byte offset between the erspan_metadata and erspan_base_hdr. This patch fixes it. Fixes: 1a66a836da63 ("gre: add collect_md mode to ERSPAN tunnel") Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode") Fixes: 1d7e2ed22f8d ("net: erspan: refactor existing erspan code") Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 26 +++++++++++++------------- net/ipv4/ip_gre.c | 5 ++++- net/ipv6/ip6_gre.c | 6 ++++-- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/net/erspan.h b/include/net/erspan.h index 5daa4866412b..d044aa60cc76 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -159,13 +159,13 @@ static inline void erspan_build_header(struct sk_buff *skb, struct ethhdr *eth = (struct ethhdr *)skb->data; enum erspan_encap_type enc_type; struct erspan_base_hdr *ershdr; - struct erspan_metadata *ersmd; struct qtag_prefix { __be16 eth_type; __be16 tci; } *qp; u16 vlan_tci = 0; u8 tos; + __be32 *idx; tos = is_ipv4 ?
ip_hdr(skb)->tos : (ipv6_hdr(skb)->priority << 4) + @@ -195,8 +195,8 @@ static inline void erspan_build_header(struct sk_buff *skb, set_session_id(ershdr, id); /* Build metadata */ - ersmd = (struct erspan_metadata *)(ershdr + 1); - ersmd->u.index = htonl(index & INDEX_MASK); + idx = (__be32 *)(ershdr + 1); + *idx = htonl(index & INDEX_MASK); } /* ERSPAN GRA: timestamp granularity @@ -225,7 +225,7 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, { struct ethhdr *eth = (struct ethhdr *)skb->data; struct erspan_base_hdr *ershdr; - struct erspan_metadata *md; + struct erspan_md2 *md2; struct qtag_prefix { __be16 eth_type; __be16 tci; @@ -261,15 +261,15 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, set_session_id(ershdr, id); /* Build metadata */ - md = (struct erspan_metadata *)(ershdr + 1); - md->u.md2.timestamp = erspan_get_timestamp(); - md->u.md2.sgt = htons(sgt); - md->u.md2.p = 1; - md->u.md2.ft = 0; - md->u.md2.dir = direction; - md->u.md2.gra = gra; - md->u.md2.o = 0; - set_hwid(&md->u.md2, hwid); + md2 = (struct erspan_md2 *)(ershdr + 1); + md2->timestamp = erspan_get_timestamp(); + md2->sgt = htons(sgt); + md2->p = 1; + md2->ft = 0; + md2->dir = direction; + md2->gra = gra; + md2->o = 0; + set_hwid(md2, hwid); } #endif diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6ec670fbbbdd..9b50eddd1882 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -261,6 +261,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; + struct erspan_md2 *md2; int ver; int len; @@ -313,8 +314,10 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, return PACKET_REJECT; md = ip_tunnel_info_opts(&tun_dst->u.tun_info); - memcpy(md, pkt_md, sizeof(*md)); md->version = ver; + md2 = &md->u.md2; + memcpy(md2, pkt_md, ver == 1 ? 
ERSPAN_V1_MDSIZE : + ERSPAN_V2_MDSIZE); info = &tun_dst->u.tun_info; info->key.tun_flags |= TUNNEL_ERSPAN_OPT; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 05f070e123e4..50913dbd0612 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -505,6 +505,7 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, struct erspan_base_hdr *ershdr; struct erspan_metadata *pkt_md; const struct ipv6hdr *ipv6h; + struct erspan_md2 *md2; struct ip6_tnl *tunnel; u8 ver; @@ -551,9 +552,10 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, info = &tun_dst->u.tun_info; md = ip_tunnel_info_opts(info); - - memcpy(md, pkt_md, sizeof(*md)); md->version = ver; + md2 = &md->u.md2; + memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : + ERSPAN_V2_MDSIZE); info->key.tun_flags |= TUNNEL_ERSPAN_OPT; info->options_len = sizeof(*md); -- cgit From 39f57f6799cdd437277122d4cd1c470c08f527c0 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 5 Feb 2018 13:35:35 -0800 Subject: net: erspan: fix erspan config overwrite When an erspan tunnel device receives an erspan packet with different tunnel metadata (ex: version, index, hwid, direction), existing code overwrites the tunnel device's erspan configuration with the received packet's metadata. The patch fixes it. Fixes: 1a66a836da63 ("gre: add collect_md mode to ERSPAN tunnel") Fixes: f551c91de262 ("net: erspan: introduce erspan v2 for ip_gre") Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode") Fixes: 94d7d8f29287 ("ip6_gre: add erspan v2 support") Signed-off-by: William Tu Signed-off-by: David S.
Miller --- net/ipv4/ip_gre.c | 9 --------- net/ipv6/ip6_gre.c | 9 --------- 2 files changed, 18 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9b50eddd1882..45d97e9b2759 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -322,15 +322,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, info = &tun_dst->u.tun_info; info->key.tun_flags |= TUNNEL_ERSPAN_OPT; info->options_len = sizeof(*md); - } else { - tunnel->erspan_ver = ver; - if (ver == 1) { - tunnel->index = ntohl(pkt_md->u.index); - } else { - tunnel->dir = pkt_md->u.md2.dir; - tunnel->hwid = get_hwid(&pkt_md->u.md2); - } - } skb_reset_mac_header(skb); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 50913dbd0612..3c353125546d 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -562,15 +562,6 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } else { - tunnel->parms.erspan_ver = ver; - - if (ver == 1) { - tunnel->parms.index = ntohl(pkt_md->u.index); - } else { - tunnel->parms.dir = pkt_md->u.md2.dir; - tunnel->parms.hwid = get_hwid(&pkt_md->u.md2); - } - ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } -- cgit From 9c33ca4317c81d9a5d030bbc60aeb2d16edf172b Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 5 Feb 2018 13:35:36 -0800 Subject: sample/bpf: fix erspan metadata The commit c69de58ba84f ("net: erspan: use bitfield instead of mask and offset") changes the erspan header to use bitfield, and commit d350a823020e ("net: erspan: create erspan metadata uapi header") creates a uapi header file. The above two commits break the current erspan test. This patch fixes it by adapting the above two changes. Fixes: ac80c2a165af ("samples/bpf: add erspan v2 sample code") Fixes: ef88f89c830f ("samples/bpf: extend test_tunnel_bpf.sh with ERSPAN") Signed-off-by: William Tu Signed-off-by: David S.
Miller --- samples/bpf/tcbpf2_kern.c | 41 ++++++++++++++++------------------------- samples/bpf/test_tunnel_bpf.sh | 4 ++-- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index f6bbf8f50da3..efdc16d195ff 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "bpf_helpers.h" #include "bpf_endian.h" @@ -35,24 +36,10 @@ struct geneve_opt { u8 opt_data[8]; /* hard-coded to 8 byte */ }; -struct erspan_md2 { - __be32 timestamp; - __be16 sgt; - __be16 flags; -}; - struct vxlan_metadata { u32 gbp; }; -struct erspan_metadata { - union { - __be32 index; - struct erspan_md2 md2; - } u; - int version; -}; - SEC("gre_set_tunnel") int _gre_set_tunnel(struct __sk_buff *skb) { @@ -156,13 +143,15 @@ int _erspan_set_tunnel(struct __sk_buff *skb) __builtin_memset(&md, 0, sizeof(md)); #ifdef ERSPAN_V1 md.version = 1; - md.u.index = htonl(123); + md.u.index = bpf_htonl(123); #else u8 direction = 1; - u16 hwid = 7; + u8 hwid = 7; md.version = 2; - md.u.md2.flags = htons((direction << 3) | (hwid << 4)); + md.u.md2.dir = direction; + md.u.md2.hwid = hwid & 0xf; + md.u.md2.hwid_upper = (hwid >> 4) & 0x3; #endif ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); @@ -207,9 +196,9 @@ int _erspan_get_tunnel(struct __sk_buff *skb) char fmt2[] = "\tdirection %d hwid %x timestamp %u\n"; bpf_trace_printk(fmt2, sizeof(fmt2), - (ntohs(md.u.md2.flags) >> 3) & 0x1, - (ntohs(md.u.md2.flags) >> 4) & 0x3f, - bpf_ntohl(md.u.md2.timestamp)); + md.u.md2.dir, + (md.u.md2.hwid_upper << 4) + md.u.md2.hwid, + bpf_ntohl(md.u.md2.timestamp)); #endif return TC_ACT_OK; @@ -242,10 +231,12 @@ int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb) md.version = 1; #else u8 direction = 0; - u16 hwid = 17; + u8 hwid = 17; md.version = 2; - md.u.md2.flags = htons((direction << 3) | (hwid << 4)); + md.u.md2.dir = direction; + md.u.md2.hwid = hwid & 0xf; + 
md.u.md2.hwid_upper = (hwid >> 4) & 0x3; #endif ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); @@ -290,9 +281,9 @@ int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb) char fmt2[] = "\tdirection %d hwid %x timestamp %u\n"; bpf_trace_printk(fmt2, sizeof(fmt2), - (ntohs(md.u.md2.flags) >> 3) & 0x1, - (ntohs(md.u.md2.flags) >> 4) & 0x3f, - bpf_ntohl(md.u.md2.timestamp)); + md.u.md2.dir, + (md.u.md2.hwid_upper << 4) + md.u.md2.hwid, + bpf_ntohl(md.u.md2.timestamp)); #endif return TC_ACT_OK; diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index ae7f7c38309b..43ce049996ee 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -68,7 +68,7 @@ function add_erspan_tunnel { ip netns exec at_ns0 \ ip link add dev $DEV_NS type $TYPE seq key 2 \ local 172.16.1.100 remote 172.16.1.200 \ - erspan_ver 2 erspan_dir 1 erspan_hwid 3 + erspan_ver 2 erspan_dir egress erspan_hwid 3 fi ip netns exec at_ns0 ip link set dev $DEV_NS up ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 @@ -97,7 +97,7 @@ function add_ip6erspan_tunnel { ip netns exec at_ns0 \ ip link add dev $DEV_NS type $TYPE seq key 2 \ local ::11 remote ::22 \ - erspan_ver 2 erspan_dir 1 erspan_hwid 7 + erspan_ver 2 erspan_dir egress erspan_hwid 7 fi ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 ip netns exec at_ns0 ip link set dev $DEV_NS up -- cgit From 703f578a353d0735961b7fd7e2589532dfa9bf11 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 5 Feb 2018 15:29:27 -0800 Subject: nfp: fix kdoc warnings on nested structures Commit 84ce5b987783 ("scripts: kernel-doc: improve nested logic to handle multiple identifiers") improved the handling of nested structure definitions in scripts/kernel-doc, and changed the expected format of documentation. This causes new warnings to appear on W=1 builds. Only comment changes. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/bpf/main.h | 24 ++++++------ .../ethernet/netronome/nfp/flower/tunnel_conf.c | 10 ++--- drivers/net/ethernet/netronome/nfp/nfp_net.h | 6 ++- .../net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h | 43 +++++++++++----------- .../ethernet/netronome/nfp/nfpcore/nfp_resource.c | 21 ++++++----- 5 files changed, 55 insertions(+), 49 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 424fe8338105..054df3dc0698 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -112,22 +112,22 @@ enum pkt_vec { * @map_elems_in_use: number of elements allocated to offloaded maps * * @adjust_head: adjust head capability - * @flags: extra flags for adjust head - * @off_min: minimal packet offset within buffer required - * @off_max: maximum packet offset within buffer required - * @guaranteed_sub: amount of negative adjustment guaranteed possible - * @guaranteed_add: amount of positive adjustment guaranteed possible + * @adjust_head.flags: extra flags for adjust head + * @adjust_head.off_min: minimal packet offset within buffer required + * @adjust_head.off_max: maximum packet offset within buffer required + * @adjust_head.guaranteed_sub: negative adjustment guaranteed possible + * @adjust_head.guaranteed_add: positive adjustment guaranteed possible * * @maps: map capability - * @types: supported map types - * @max_maps: max number of maps supported - * @max_elems: max number of entries in each map - * @max_key_sz: max size of map key - * @max_val_sz: max size of map value - * @max_elem_sz: max size of map entry (key + value) + * @maps.types: supported map types + * @maps.max_maps: max number of maps supported + * @maps.max_elems: max number of entries in each map + * @maps.max_key_sz: max size of map key + * @maps.max_val_sz: max size of map value + * @maps.max_elem_sz: max size of map entry (key + value) * * 
@helpers: helper addressess for various calls - * @map_lookup: map lookup helper address + * @helpers.map_lookup: map lookup helper address */ struct nfp_app_bpf { struct nfp_app *app; diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index b03f22f29612..ec524d97869d 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -50,9 +50,9 @@ * @seq: sequence number of the message * @count: number of tunnels report in message * @flags: options part of the request - * @ipv4: dest IPv4 address of active route - * @egress_port: port the encapsulated packet egressed - * @extra: reserved for future use + * @tun_info.ipv4: dest IPv4 address of active route + * @tun_info.egress_port: port the encapsulated packet egressed + * @tun_info.extra: reserved for future use * @tun_info: tunnels that have sent traffic in reported period */ struct nfp_tun_active_tuns { @@ -132,8 +132,8 @@ struct nfp_ipv4_addr_entry { * struct nfp_tun_mac_addr - configure MAC address of tunnel EP on NFP * @reserved: reserved for future use * @count: number of MAC addresses in the message - * @index: index of MAC address in the lookup table - * @addr: interface MAC address + * @addresses.index: index of MAC address in the lookup table + * @addresses.addr: interface MAC address * @addresses: series of MACs to offload */ struct nfp_tun_mac_addr { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index d88eda9707e6..787df47ec430 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -193,7 +193,8 @@ struct nfp_net_tx_desc { /** * struct nfp_net_tx_buf - software TX buffer descriptor - * @skb: sk_buff associated with this buffer + * @skb: normal ring, sk_buff associated with this buffer + * @frag: XDP ring, page frag associated with this buffer * 
@dma_addr: DMA mapping address of the buffer * @fidx: Fragment index (-1 for the head and [0..nr_frags-1] for frags) * @pkt_cnt: Number of packets to be produced out of the skb associated @@ -377,6 +378,9 @@ struct nfp_net_rx_ring { * struct nfp_net_r_vector - Per ring interrupt vector configuration * @nfp_net: Backpointer to nfp_net structure * @napi: NAPI structure for this ring vec + * @tasklet: ctrl vNIC, tasklet for servicing the r_vec + * @queue: ctrl vNIC, send queue + * @lock: ctrl vNIC, r_vec lock protects @queue * @tx_ring: Pointer to TX ring * @rx_ring: Pointer to RX ring * @xdp_ring: Pointer to an extra TX ring for XDP diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h index e983c9d7f86c..f23d9e06f097 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h @@ -98,28 +98,29 @@ enum nfp_eth_fec { * @max_index: max of @index fields of all @ports * @ports: table of ports * - * @eth_index: port index according to legacy ethX numbering - * @index: chip-wide first channel index - * @nbi: NBI index - * @base: first channel index (within NBI) - * @lanes: number of channels - * @speed: interface speed (in Mbps) - * @interface: interface (module) plugged in - * @media: media type of the @interface - * @fec: forward error correction mode - * @aneg: auto negotiation mode - * @mac_addr: interface MAC address - * @label_port: port id - * @label_subport: id of interface within port (for split ports) - * @enabled: is enabled? - * @tx_enabled: is TX enabled? - * @rx_enabled: is RX enabled? - * @override_changed: is media reconfig pending? 
+ * @ports.eth_index: port index according to legacy ethX numbering + * @ports.index: chip-wide first channel index + * @ports.nbi: NBI index + * @ports.base: first channel index (within NBI) + * @ports.lanes: number of channels + * @ports.speed: interface speed (in Mbps) + * @ports.interface: interface (module) plugged in + * @ports.media: media type of the @interface + * @ports.fec: forward error correction mode + * @ports.aneg: auto negotiation mode + * @ports.mac_addr: interface MAC address + * @ports.label_port: port id + * @ports.label_subport: id of interface within port (for split ports) + * @ports.enabled: is enabled? + * @ports.tx_enabled: is TX enabled? + * @ports.rx_enabled: is RX enabled? + * @ports.override_changed: is media reconfig pending? * - * @port_type: one of %PORT_* defines for ethtool - * @port_lanes: total number of lanes on the port (sum of lanes of all subports) - * @is_split: is interface part of a split port - * @fec_modes_supported: bitmap of FEC modes supported + * @ports.port_type: one of %PORT_* defines for ethtool + * @ports.port_lanes: total number of lanes on the port (sum of lanes of all + * subports) + * @ports.is_split: is interface part of a split port + * @ports.fec_modes_supported: bitmap of FEC modes supported */ struct nfp_eth_table { unsigned int count; diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c index b1dd13ff282b..7e14725055c7 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c @@ -56,16 +56,17 @@ /** * struct nfp_resource_entry - Resource table entry - * @owner: NFP CPP Lock, interface owner - * @key: NFP CPP Lock, posix_crc32(name, 8) - * @region: Memory region descriptor - * @name: ASCII, zero padded name - * @reserved - * @cpp_action: CPP Action - * @cpp_token: CPP Token - * @cpp_target: CPP Target ID - * @page_offset: 256-byte page offset into 
target's CPP address - * @page_size: size, in 256-byte pages + * @mutex: NFP CPP Lock + * @mutex.owner: NFP CPP Lock, interface owner + * @mutex.key: NFP CPP Lock, posix_crc32(name, 8) + * @region: Memory region descriptor + * @region.name: ASCII, zero padded name + * @region.reserved: padding + * @region.cpp_action: CPP Action + * @region.cpp_token: CPP Token + * @region.cpp_target: CPP Target ID + * @region.page_offset: 256-byte page offset into target's CPP address + * @region.page_size: size, in 256-byte pages */ struct nfp_resource_entry { struct nfp_resource_entry_mutex { -- cgit From 2c0aa08631b86a4678dbc93b9caa5248014b4458 Mon Sep 17 00:00:00 2001 From: Guanglei Li Date: Tue, 6 Feb 2018 10:43:21 +0800 Subject: RDS: IB: Fix null pointer issue Scenario: 1. Port down and do fail over 2. Ap do rds_bind syscall PID: 47039 TASK: ffff89887e2fe640 CPU: 47 COMMAND: "kworker/u:6" #0 [ffff898e35f159f0] machine_kexec at ffffffff8103abf9 #1 [ffff898e35f15a60] crash_kexec at ffffffff810b96e3 #2 [ffff898e35f15b30] oops_end at ffffffff8150f518 #3 [ffff898e35f15b60] no_context at ffffffff8104854c #4 [ffff898e35f15ba0] __bad_area_nosemaphore at ffffffff81048675 #5 [ffff898e35f15bf0] bad_area_nosemaphore at ffffffff810487d3 #6 [ffff898e35f15c00] do_page_fault at ffffffff815120b8 #7 [ffff898e35f15d10] page_fault at ffffffff8150ea95 [exception RIP: unknown or invalid address] RIP: 0000000000000000 RSP: ffff898e35f15dc8 RFLAGS: 00010282 RAX: 00000000fffffffe RBX: ffff889b77f6fc00 RCX:ffffffff81c99d88 RDX: 0000000000000000 RSI: ffff896019ee08e8 RDI:ffff889b77f6fc00 RBP: ffff898e35f15df0 R8: ffff896019ee08c8 R9:0000000000000000 R10: 0000000000000400 R11: 0000000000000000 R12:ffff896019ee08c0 R13: ffff889b77f6fe68 R14: ffffffff81c99d80 R15: ffffffffa022a1e0 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #8 [ffff898e35f15dc8] cma_ndev_work_handler at ffffffffa022a228 [rdma_cm] #9 [ffff898e35f15df8] process_one_work at ffffffff8108a7c6 #10 [ffff898e35f15e58] worker_thread at 
ffffffff8108bda0 #11 [ffff898e35f15ee8] kthread at ffffffff81090fe6 PID: 45659 TASK: ffff880d313d2500 CPU: 31 COMMAND: "oracle_45659_ap" #0 [ffff881024ccfc98] __schedule at ffffffff8150bac4 #1 [ffff881024ccfd40] schedule at ffffffff8150c2cf #2 [ffff881024ccfd50] __mutex_lock_slowpath at ffffffff8150cee7 #3 [ffff881024ccfdc0] mutex_lock at ffffffff8150cdeb #4 [ffff881024ccfde0] rdma_destroy_id at ffffffffa022a027 [rdma_cm] #5 [ffff881024ccfe10] rds_ib_laddr_check at ffffffffa0357857 [rds_rdma] #6 [ffff881024ccfe50] rds_trans_get_preferred at ffffffffa0324c2a [rds] #7 [ffff881024ccfe80] rds_bind at ffffffffa031d690 [rds] #8 [ffff881024ccfeb0] sys_bind at ffffffff8142a670 PID: 45659 PID: 47039 rds_ib_laddr_check /* create id_priv with a null event_handler */ rdma_create_id rdma_bind_addr cma_acquire_dev /* add id_priv to cma_dev->id_list */ cma_attach_to_dev cma_ndev_work_handler /* event_handler is null */ id_priv->id.event_handler Signed-off-by: Guanglei Li Signed-off-by: Honglei Wang Reviewed-by: Junxiao Bi Reviewed-by: Yanjun Zhu Reviewed-by: Leon Romanovsky Acked-by: Santosh Shilimkar Acked-by: Doug Ledford Signed-off-by: David S. Miller --- net/rds/ib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index b2a5067b4afe..ff0c98096af1 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -345,7 +345,8 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, + NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); -- cgit From 3df40aad1a864af124bd50a1371ef16089ac9af2 Mon Sep 17 00:00:00 2001 From: Suresh Reddy Date: Tue, 6 Feb 2018 08:52:41 -0500 Subject: be2net: Fix HW stall issue in Lancer Lancer HW cannot handle a TSO packet with a single segment.
Disable TSO/GSO for such packets. Signed-off-by: Suresh Reddy Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index d81e2d37bc3d..286d591c574e 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5104,9 +5104,12 @@ static netdev_features_t be_features_check(struct sk_buff *skb, features &= ~NETIF_F_TSO6; /* Lancer cannot handle the packet with MSS less than 256. + * Also it can't handle a TSO packet with a single segment * Disable the GSO support in such cases */ - if (lancer_chip(adapter) && skb_shinfo(skb)->gso_size < 256) + if (lancer_chip(adapter) && + (skb_shinfo(skb)->gso_size < 256 || + skb_shinfo(skb)->gso_segs == 1)) features &= ~NETIF_F_GSO_MASK; } -- cgit From ffc39620102dfe62711fadb9a297b66aee816013 Mon Sep 17 00:00:00 2001 From: Suresh Reddy Date: Tue, 6 Feb 2018 08:52:42 -0500 Subject: be2net: Handle transmit completion errors in Lancer If the driver receives a TX CQE with status as 0x1 or 0x9 or 0xb, the completion indexes should not be used. The driver must stop consuming CQEs from this TXQ/CQ. From this point onwards the TXQ is in a bad state. The driver should destroy and recreate the TXQ. 0x1: LANCER_TX_COMP_LSO_ERR 0x9: LANCER_TX_COMP_SGE_ERR 0xb: LANCER_TX_COMP_PARITY_ERR Reset the adapter if the driver sees one of these errors in a TX completion. Also add an SGE error counter to the ethtool stats. Signed-off-by: Suresh Reddy Signed-off-by: David S.
Miller --- drivers/net/ethernet/emulex/benet/be.h | 7 +- drivers/net/ethernet/emulex/benet/be_ethtool.c | 1 + drivers/net/ethernet/emulex/benet/be_hw.h | 1 + drivers/net/ethernet/emulex/benet/be_main.c | 108 ++++++++++++++----------- 4 files changed, 69 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 8984c4938881..382891f81e09 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -248,6 +248,7 @@ struct be_tx_stats { u32 tx_spoof_check_err; u32 tx_qinq_err; u32 tx_internal_parity_err; + u32 tx_sge_err; struct u64_stats_sync sync; struct u64_stats_sync sync_compl; }; @@ -944,8 +945,10 @@ static inline bool is_ipv6_ext_hdr(struct sk_buff *skb) #define BE_ERROR_EEH 1 #define BE_ERROR_UE BIT(1) #define BE_ERROR_FW BIT(2) -#define BE_ERROR_HW (BE_ERROR_EEH | BE_ERROR_UE) -#define BE_ERROR_ANY (BE_ERROR_EEH | BE_ERROR_UE | BE_ERROR_FW) +#define BE_ERROR_TX BIT(3) +#define BE_ERROR_HW (BE_ERROR_EEH | BE_ERROR_UE | BE_ERROR_TX) +#define BE_ERROR_ANY (BE_ERROR_EEH | BE_ERROR_UE | BE_ERROR_FW | \ + BE_ERROR_TX) #define BE_CLEAR_ALL 0xFF static inline u8 be_check_error(struct be_adapter *adapter, u32 err_type) diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index 7d1819c9e8cc..7f7e206f95f8 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -189,6 +189,7 @@ static const struct be_ethtool_stat et_tx_stats[] = { * packet data. This counter is applicable only for Lancer adapters. 
*/ {DRVSTAT_TX_INFO(tx_internal_parity_err)}, + {DRVSTAT_TX_INFO(tx_sge_err)}, {DRVSTAT_TX_INFO(tx_bytes)}, {DRVSTAT_TX_INFO(tx_pkts)}, {DRVSTAT_TX_INFO(tx_vxlan_offload_pkts)}, diff --git a/drivers/net/ethernet/emulex/benet/be_hw.h b/drivers/net/ethernet/emulex/benet/be_hw.h index c967f45705d9..db5f92fb87e0 100644 --- a/drivers/net/ethernet/emulex/benet/be_hw.h +++ b/drivers/net/ethernet/emulex/benet/be_hw.h @@ -261,6 +261,7 @@ struct be_eth_hdr_wrb { #define LANCER_TX_COMP_HSW_DROP_MAC_ERR 0x3 #define LANCER_TX_COMP_HSW_DROP_VLAN_ERR 0x5 #define LANCER_TX_COMP_QINQ_ERR 0x7 +#define LANCER_TX_COMP_SGE_ERR 0x9 #define LANCER_TX_COMP_PARITY_ERR 0xb #define LANCER_TX_COMP_DMA_ERR 0xd diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 286d591c574e..5774fb6f8aa0 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2583,7 +2583,48 @@ static void be_post_rx_frags(struct be_rx_obj *rxo, gfp_t gfp, u32 frags_needed) } } -static struct be_tx_compl_info *be_tx_compl_get(struct be_tx_obj *txo) +static inline void be_update_tx_err(struct be_tx_obj *txo, u8 status) +{ + switch (status) { + case BE_TX_COMP_HDR_PARSE_ERR: + tx_stats(txo)->tx_hdr_parse_err++; + break; + case BE_TX_COMP_NDMA_ERR: + tx_stats(txo)->tx_dma_err++; + break; + case BE_TX_COMP_ACL_ERR: + tx_stats(txo)->tx_spoof_check_err++; + break; + } +} + +static inline void lancer_update_tx_err(struct be_tx_obj *txo, u8 status) +{ + switch (status) { + case LANCER_TX_COMP_LSO_ERR: + tx_stats(txo)->tx_tso_err++; + break; + case LANCER_TX_COMP_HSW_DROP_MAC_ERR: + case LANCER_TX_COMP_HSW_DROP_VLAN_ERR: + tx_stats(txo)->tx_spoof_check_err++; + break; + case LANCER_TX_COMP_QINQ_ERR: + tx_stats(txo)->tx_qinq_err++; + break; + case LANCER_TX_COMP_PARITY_ERR: + tx_stats(txo)->tx_internal_parity_err++; + break; + case LANCER_TX_COMP_DMA_ERR: + tx_stats(txo)->tx_dma_err++; + break; + case LANCER_TX_COMP_SGE_ERR: + 
tx_stats(txo)->tx_sge_err++; + break; + } +} + +static struct be_tx_compl_info *be_tx_compl_get(struct be_adapter *adapter, + struct be_tx_obj *txo) { struct be_queue_info *tx_cq = &txo->cq; struct be_tx_compl_info *txcp = &txo->txcp; @@ -2599,6 +2640,24 @@ static struct be_tx_compl_info *be_tx_compl_get(struct be_tx_obj *txo) txcp->status = GET_TX_COMPL_BITS(status, compl); txcp->end_index = GET_TX_COMPL_BITS(wrb_index, compl); + if (txcp->status) { + if (lancer_chip(adapter)) { + lancer_update_tx_err(txo, txcp->status); + /* Reset the adapter incase of TSO, + * SGE or Parity error + */ + if (txcp->status == LANCER_TX_COMP_LSO_ERR || + txcp->status == LANCER_TX_COMP_PARITY_ERR || + txcp->status == LANCER_TX_COMP_SGE_ERR) + be_set_error(adapter, BE_ERROR_TX); + } else { + be_update_tx_err(txo, txcp->status); + } + } + + if (be_check_error(adapter, BE_ERROR_TX)) + return NULL; + compl->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] = 0; queue_tail_inc(tx_cq); return txcp; @@ -2741,7 +2800,7 @@ static void be_tx_compl_clean(struct be_adapter *adapter) cmpl = 0; num_wrbs = 0; txq = &txo->q; - while ((txcp = be_tx_compl_get(txo))) { + while ((txcp = be_tx_compl_get(adapter, txo))) { num_wrbs += be_tx_compl_process(adapter, txo, txcp->end_index); @@ -3120,42 +3179,6 @@ loop_continue: return work_done; } -static inline void be_update_tx_err(struct be_tx_obj *txo, u8 status) -{ - switch (status) { - case BE_TX_COMP_HDR_PARSE_ERR: - tx_stats(txo)->tx_hdr_parse_err++; - break; - case BE_TX_COMP_NDMA_ERR: - tx_stats(txo)->tx_dma_err++; - break; - case BE_TX_COMP_ACL_ERR: - tx_stats(txo)->tx_spoof_check_err++; - break; - } -} - -static inline void lancer_update_tx_err(struct be_tx_obj *txo, u8 status) -{ - switch (status) { - case LANCER_TX_COMP_LSO_ERR: - tx_stats(txo)->tx_tso_err++; - break; - case LANCER_TX_COMP_HSW_DROP_MAC_ERR: - case LANCER_TX_COMP_HSW_DROP_VLAN_ERR: - tx_stats(txo)->tx_spoof_check_err++; - break; - case LANCER_TX_COMP_QINQ_ERR: - 
tx_stats(txo)->tx_qinq_err++; - break; - case LANCER_TX_COMP_PARITY_ERR: - tx_stats(txo)->tx_internal_parity_err++; - break; - case LANCER_TX_COMP_DMA_ERR: - tx_stats(txo)->tx_dma_err++; - break; - } -} static void be_process_tx(struct be_adapter *adapter, struct be_tx_obj *txo, int idx) @@ -3163,16 +3186,9 @@ static void be_process_tx(struct be_adapter *adapter, struct be_tx_obj *txo, int num_wrbs = 0, work_done = 0; struct be_tx_compl_info *txcp; - while ((txcp = be_tx_compl_get(txo))) { + while ((txcp = be_tx_compl_get(adapter, txo))) { num_wrbs += be_tx_compl_process(adapter, txo, txcp->end_index); work_done++; - - if (txcp->status) { - if (lancer_chip(adapter)) - lancer_update_tx_err(txo, txcp->status); - else - be_update_tx_err(txo, txcp->status); - } } if (work_done) { -- cgit