diff options
Diffstat (limited to 'samples')
336 files changed, 27331 insertions, 9504 deletions
diff --git a/samples/Kconfig b/samples/Kconfig index 9cb63188d3ef..5bc7c9e5a59e 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menuconfig SAMPLES bool "Sample kernel code" help @@ -5,22 +6,62 @@ menuconfig SAMPLES if SAMPLES +config SAMPLE_AUXDISPLAY + bool "auxdisplay sample" + depends on CC_CAN_LINK + config SAMPLE_TRACE_EVENTS tristate "Build trace_events examples -- loadable modules only" depends on EVENT_TRACING && m help - This build trace event example modules. + This builds the trace event example module. + +config SAMPLE_TRACE_CUSTOM_EVENTS + tristate "Build custom trace event example -- loadable modules only" + depends on EVENT_TRACING && m + help + This builds the custom trace event example module. config SAMPLE_TRACE_PRINTK - tristate "Build trace_printk module - tests various trace_printk formats" + tristate "Build trace_printk module - tests various trace_printk formats" depends on EVENT_TRACING && m help - This builds a module that calls trace_printk() and can be used to - test various trace_printk() calls from a module. + This builds a module that calls trace_printk() and can be used to + test various trace_printk() calls from a module. + +config SAMPLE_FTRACE_DIRECT + tristate "Build register_ftrace_direct() example" + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m + depends on HAVE_SAMPLE_FTRACE_DIRECT + help + This builds an ftrace direct function example + that hooks to wake_up_process and prints the parameters. + +config SAMPLE_FTRACE_DIRECT_MULTI + tristate "Build register_ftrace_direct() on multiple ips example" + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m + depends on HAVE_SAMPLE_FTRACE_DIRECT_MULTI + help + This builds an ftrace direct function example + that hooks to wake_up_process and schedule, and prints + the function addresses. + +config SAMPLE_FTRACE_OPS + tristate "Build custom ftrace ops example" + depends on FUNCTION_TRACER + help + This builds an ftrace ops example that hooks two functions and + measures the time taken to invoke one function a number of times. + +config SAMPLE_TRACE_ARRAY + tristate "Build sample module for kernel access to Ftrace instances" + depends on EVENT_TRACING && m + help + This builds a module that demonstrates the use of various APIs to + access Ftrace instances from within the kernel. config SAMPLE_KOBJECT - tristate "Build kobject examples -- loadable modules only" - depends on m + tristate "Build kobject examples" help This config option will allow you to build a number of different kobject sample modules showing how to use kobjects, @@ -45,6 +86,13 @@ config SAMPLE_HW_BREAKPOINT help This builds kernel hardware breakpoint example modules. +config SAMPLE_FPROBE + tristate "Build fprobe examples -- loadable modules only" + depends on FPROBE && m + help + This builds a fprobe example module. This module has an option 'symbol'. + You can specify a probed symbol or symbols separated with ','. + config SAMPLE_KFIFO tristate "Build kfifo examples -- loadable modules only" depends on m @@ -62,6 +110,16 @@ config SAMPLE_KDB Build an example of how to dynamically add the hello command to the kdb shell. +config SAMPLE_QMI_CLIENT + tristate "Build qmi client sample -- loadable modules only" + depends on m + depends on ARCH_QCOM + depends on NET + select QCOM_QMI_HELPERS + help + Build an QMI client sample driver, which demonstrates how to + communicate with a remote QRTR service, using QMI encoded messages. + config SAMPLE_RPMSG_CLIENT tristate "Build rpmsg client sample -- loadable modules only" depends on RPMSG && m @@ -71,11 +129,10 @@ config SAMPLE_RPMSG_CLIENT the rpmsg bus. config SAMPLE_LIVEPATCH - tristate "Build live patching sample -- loadable modules only" + tristate "Build live patching samples -- loadable modules only" depends on LIVEPATCH && m help - Builds a sample live patch that replaces the procfs handler - for /proc/cmdline to print "this has been live patched". + Build sample live patch demonstrations. config SAMPLE_CONFIGFS tristate "Build configfs patching sample -- loadable modules only" @@ -85,37 +142,192 @@ config SAMPLE_CONFIGFS config SAMPLE_CONNECTOR tristate "Build connector sample -- loadable modules only" - depends on CONNECTOR && m + depends on CONNECTOR && HEADERS_INSTALL && m help When enabled, this builds both a sample kernel module for the connector interface and a user space tool to communicate with it. - See also Documentation/connector/connector.txt + See also Documentation/driver-api/connector.rst + +config SAMPLE_FANOTIFY_ERROR + bool "Build fanotify error monitoring sample" + depends on FANOTIFY && CC_CAN_LINK && HEADERS_INSTALL + help + When enabled, this builds an example code that uses the + FAN_FS_ERROR fanotify mechanism to monitor filesystem + errors. + See also Documentation/admin-guide/filesystem-monitoring.rst. + +config SAMPLE_HIDRAW + bool "hidraw sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + +config SAMPLE_LANDLOCK + bool "Landlock example" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build a simple Landlock sandbox manager able to start a process + restricted by a user-defined filesystem access control policy. + +config SAMPLE_PIDFD + bool "pidfd sample" + depends on CC_CAN_LINK && HEADERS_INSTALL config SAMPLE_SECCOMP - tristate "Build seccomp sample code -- loadable modules only" - depends on SECCOMP_FILTER && m + bool "Build seccomp sample code" + depends on SECCOMP_FILTER && CC_CAN_LINK && HEADERS_INSTALL help Build samples of seccomp filters using various methods of BPF filter construction. -config SAMPLE_BLACKFIN_GPTIMERS - tristate "Build blackfin gptimers sample code -- loadable modules only" - depends on BLACKFIN && BFIN_GPTIMERS && m +config SAMPLE_TIMER + bool "Timer sample" + depends on CC_CAN_LINK && HEADERS_INSTALL + +config SAMPLE_TSM_MR + tristate "TSM measurement sample" + select TSM_MEASUREMENTS + select VIRT_DRIVERS + help + Build a sample module that emulates MRs (Measurement Registers) and + exposes them to user mode applications through the TSM sysfs + interface (/sys/class/misc/tsm_mr_sample/emulated_mr/). + + The module name will be tsm-mr-sample when built as a module. + +config SAMPLE_UHID + bool "UHID sample" + depends on CC_CAN_LINK && HEADERS_INSTALL help - Build samples of blackfin gptimers sample module. + Build UHID sample program. config SAMPLE_VFIO_MDEV_MTTY - tristate "Build VFIO mtty example mediated device sample code -- loadable modules only" - depends on VFIO_MDEV_DEVICE && m + tristate "Build VFIO mtty example mediated device sample code" + depends on VFIO + select VFIO_MDEV help Build a virtual tty sample driver for use as a VFIO mediated device -config SAMPLE_STATX - bool "Build example extended-stat using code" - depends on BROKEN +config SAMPLE_VFIO_MDEV_MDPY + tristate "Build VFIO mdpy example mediated device sample code" + depends on VFIO + select VFIO_MDEV help - Build example userspace program to use the new extended-stat syscall. + Build a virtual display sample driver for use as a VFIO + mediated device. It is a simple framebuffer and supports + the region display interface (VFIO_GFX_PLANE_TYPE_REGION). + +config SAMPLE_VFIO_MDEV_MDPY_FB + tristate "Build VFIO mdpy example guest fbdev driver" + depends on FB + select FB_IOMEM_HELPERS + help + Guest fbdev driver for the virtual display sample driver. + +config SAMPLE_VFIO_MDEV_MBOCHS + tristate "Build VFIO mbochs example mediated device sample code" + depends on VFIO + select VFIO_MDEV + select DMA_SHARED_BUFFER + help + Build a virtual display sample driver for use as a VFIO + mediated device. It supports the region display interface + (VFIO_GFX_PLANE_TYPE_DMABUF). + Emulate enough of qemu stdvga to make bochs-drm.ko happy. + That is basically the vram memory bar and the bochs dispi + interface vbe registers in the mmio register bar. + Specifically it does *not* include any legacy vga stuff. + Device looks a lot like "qemu -device secondary-vga". + +config SAMPLE_ANDROID_BINDERFS + bool "Build Android binderfs example" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Builds a sample program to illustrate the use of the Android binderfs + filesystem. + +config SAMPLE_VFS + bool "Build example programs that use new VFS system calls" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build example userspace programs that use new VFS system calls such + as mount API and statx(). Note that this is restricted to the x86 + arch whilst it accesses system calls that aren't yet in all arches. + +config SAMPLE_INTEL_MEI + bool "Build example program working with intel mei driver" + depends on INTEL_MEI + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build a sample program to work with mei device. + +config SAMPLE_TPS6594_PFSM + bool "Build example program working with TPS6594 PFSM driver" + depends on HEADERS_INSTALL + depends on CC_CAN_LINK + help + Build a sample program to work with PFSM devices. + +config SAMPLE_WATCHDOG + bool "watchdog sample" + depends on CC_CAN_LINK + +config SAMPLE_WATCH_QUEUE + bool "Build example watch_queue notification API consumer" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build example userspace program to use the new mount_notify(), + sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function. + +config SAMPLE_CORESIGHT_SYSCFG + tristate "Build example loadable module for CoreSight config" + depends on CORESIGHT && m + help + Build an example loadable module that adds new CoreSight features + and configuration using the CoreSight system configuration API. + This demonstrates how a user may create their own CoreSight + configurations and easily load them into the system at runtime. + +config SAMPLE_KMEMLEAK + tristate "Simple test for the kernel memory leak detector" + depends on DEBUG_KMEMLEAK && m + help + Build a sample program which have explicitly leaks memory to test + kmemleak. + +config SAMPLE_CGROUP + bool "Build cgroup sample code" + depends on CGROUPS && CC_CAN_LINK && HEADERS_INSTALL + help + Build samples that demonstrate the usage of the cgroup API. + +config SAMPLE_CHECK_EXEC + bool "Exec secure bits examples" + depends on CC_CAN_LINK && HEADERS_INSTALL + help + Build a tool to easily configure SECBIT_EXEC_RESTRICT_FILE and + SECBIT_EXEC_DENY_INTERACTIVE, and a simple script interpreter to + demonstrate how they should be used with execveat(2) + + AT_EXECVE_CHECK. + +config SAMPLE_HUNG_TASK + tristate "Hung task detector test code" + depends on DETECT_HUNG_TASK && DEBUG_FS + help + Build a module that provides debugfs files (e.g., mutex, semaphore, + rw_semaphore_read, rw_semaphore_write) under <debugfs>/hung_task. + Reading these files with multiple processes triggers hung task + detection by holding locks for a long time (256 seconds). + +source "samples/rust/Kconfig" + +source "samples/damon/Kconfig" endif # SAMPLES + +config HAVE_SAMPLE_FTRACE_DIRECT + bool + +config HAVE_SAMPLE_FTRACE_DIRECT_MULTI + bool diff --git a/samples/Makefile b/samples/Makefile index db54e766ddb1..07641e177bd8 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,6 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0 # Makefile for Linux samples code -obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \ - hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \ - configfs/ connector/ v4l/ trace_printk/ blackfin/ \ - vfio-mdev/ statx/ +subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay +subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs +subdir-$(CONFIG_SAMPLE_CHECK_EXEC) += check-exec +subdir-$(CONFIG_SAMPLE_CGROUP) += cgroup +obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ +obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ +obj-$(CONFIG_SAMPLE_FANOTIFY_ERROR) += fanotify/ +subdir-$(CONFIG_SAMPLE_HIDRAW) += hidraw +obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += hw_breakpoint/ +obj-$(CONFIG_SAMPLE_KDB) += kdb/ +obj-$(CONFIG_SAMPLE_KFIFO) += kfifo/ +obj-$(CONFIG_SAMPLE_KOBJECT) += kobject/ +obj-$(CONFIG_SAMPLE_KPROBES) += kprobes/ +subdir-$(CONFIG_SAMPLE_LANDLOCK) += landlock +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch/ +subdir-$(CONFIG_SAMPLE_PIDFD) += pidfd +obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi/ +obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/ +subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp +subdir-$(CONFIG_SAMPLE_TIMER) += timers +obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ +obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_events/ +obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace/ +obj-$(CONFIG_SAMPLE_FTRACE_OPS) += ftrace/ +obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ +subdir-$(CONFIG_SAMPLE_UHID) += uhid +obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ +obj-y += vfio-mdev/ +subdir-$(CONFIG_SAMPLE_VFS) += vfs +obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/ +obj-$(CONFIG_SAMPLE_TPS6594_PFSM) += pfsm/ +subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog +subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue +obj-$(CONFIG_SAMPLE_KMEMLEAK) += kmemleak/ +obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight/ +obj-$(CONFIG_SAMPLE_FPROBE) += fprobe/ +obj-$(CONFIG_SAMPLES_RUST) += rust/ +obj-$(CONFIG_SAMPLE_DAMON_WSSE) += damon/ +obj-$(CONFIG_SAMPLE_DAMON_PRCL) += damon/ +obj-$(CONFIG_SAMPLE_DAMON_MTIER) += damon/ +obj-$(CONFIG_SAMPLE_HUNG_TASK) += hung_task/ +obj-$(CONFIG_SAMPLE_TSM_MR) += tsm-mr/ diff --git a/samples/acrn/Makefile b/samples/acrn/Makefile new file mode 100644 index 000000000000..c8e3ed9785e9 --- /dev/null +++ b/samples/acrn/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +.PHONY: vm-sample + +vm-sample: vm-sample.o payload.o + $(CC) $^ -o $@ + +payload.o: payload.ld guest16.o + $(LD) -T $< -o $@ + +clean: + rm *.o vm-sample diff --git a/samples/acrn/guest.ld b/samples/acrn/guest.ld new file mode 100644 index 000000000000..5127c682bd22 --- /dev/null +++ b/samples/acrn/guest.ld @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +OUTPUT_FORMAT(binary) +SECTIONS +{ + .start : { *(.start) } + .text : { *(.text*) } + .rodata : { *(.rodata) } + .data : { *(.data) } +} diff --git a/samples/acrn/payload.ld b/samples/acrn/payload.ld new file mode 100644 index 000000000000..e8d9a498ad62 --- /dev/null +++ b/samples/acrn/payload.ld @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +SECTIONS +{ + .payload16 0 : { + guest16 = .; + guest16.o(.text) + guest16_end = .; + } +} diff --git a/samples/acrn/vm-sample.c b/samples/acrn/vm-sample.c new file mode 100644 index 000000000000..c61e0f91456e --- /dev/null +++ b/samples/acrn/vm-sample.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A sample program to run a User VM on the ACRN hypervisor + * + * This sample runs in a Service VM, which is a privileged VM of ACRN. + * CONFIG_ACRN_HSM needs to be enabled in the Service VM. + * + * Guest VM code in guest16.s will be executed after the VM launched. + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + */ +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <signal.h> +#include <sys/ioctl.h> +#include <linux/acrn.h> + +#define GUEST_MEMORY_SIZE (1024*1024) +void *guest_memory; + +extern const unsigned char guest16[], guest16_end[]; +static char io_request_page[4096] __attribute__((aligned(4096))); +static struct acrn_io_request *io_req_buf = (struct acrn_io_request *)io_request_page; + +__u16 vcpu_num; +__u16 vmid; + +int hsm_fd; +int is_running = 1; + +void vm_exit(int sig) +{ + sig = sig; + + is_running = 0; + ioctl(hsm_fd, ACRN_IOCTL_PAUSE_VM, vmid); + ioctl(hsm_fd, ACRN_IOCTL_DESTROY_IOREQ_CLIENT, 0); +} + +int main(int argc, char **argv) +{ + int vcpu_id, ret; + struct acrn_vm_creation create_vm = {0}; + struct acrn_vm_memmap ram_map = {0}; + struct acrn_vcpu_regs regs; + struct acrn_io_request *io_req; + struct acrn_ioreq_notify __attribute__((aligned(8))) notify; + + argc = argc; + argv = argv; + + ret = posix_memalign(&guest_memory, 4096, GUEST_MEMORY_SIZE); + if (ret < 0) { + printf("Not enough memory!\n"); + return -1; + } + hsm_fd = open("/dev/acrn_hsm", O_RDWR|O_CLOEXEC); + + create_vm.ioreq_buf = (__u64)io_req_buf; + ret = ioctl(hsm_fd, ACRN_IOCTL_CREATE_VM, &create_vm); + printf("Created VM! [%d]\n", ret); + vcpu_num = create_vm.vcpu_num; + vmid = create_vm.vmid; + + /* setup guest memory */ + ram_map.type = ACRN_MEMMAP_RAM; + ram_map.vma_base = (__u64)guest_memory; + ram_map.len = GUEST_MEMORY_SIZE; + ram_map.user_vm_pa = 0; + ram_map.attr = ACRN_MEM_ACCESS_RWX; + ret = ioctl(hsm_fd, ACRN_IOCTL_SET_MEMSEG, &ram_map); + printf("Set up VM memory! [%d]\n", ret); + + memcpy(guest_memory, guest16, guest16_end-guest16); + + /* setup vcpu registers */ + memset(®s, 0, sizeof(regs)); + regs.vcpu_id = 0; + regs.vcpu_regs.rip = 0; + + /* CR0_ET | CR0_NE */ + regs.vcpu_regs.cr0 = 0x30U; + regs.vcpu_regs.cs_ar = 0x009FU; + regs.vcpu_regs.cs_sel = 0xF000U; + regs.vcpu_regs.cs_limit = 0xFFFFU; + regs.vcpu_regs.cs_base = 0 & 0xFFFF0000UL; + regs.vcpu_regs.rip = 0 & 0xFFFFUL; + + ret = ioctl(hsm_fd, ACRN_IOCTL_SET_VCPU_REGS, ®s); + printf("Set up VM BSP registers! [%d]\n", ret); + + /* create an ioreq client for this VM */ + ret = ioctl(hsm_fd, ACRN_IOCTL_CREATE_IOREQ_CLIENT, 0); + printf("Created IO request client! [%d]\n", ret); + + /* run vm */ + ret = ioctl(hsm_fd, ACRN_IOCTL_START_VM, vmid); + printf("Start VM! [%d]\n", ret); + + signal(SIGINT, vm_exit); + while (is_running) { + ret = ioctl(hsm_fd, ACRN_IOCTL_ATTACH_IOREQ_CLIENT, 0); + + for (vcpu_id = 0; vcpu_id < vcpu_num; vcpu_id++) { + io_req = &io_req_buf[vcpu_id]; + if ((__sync_add_and_fetch(&io_req->processed, 0) == ACRN_IOREQ_STATE_PROCESSING) + && (!io_req->kernel_handled)) + if (io_req->type == ACRN_IOREQ_TYPE_PORTIO) { + int bytes, port, in; + + port = io_req->reqs.pio_request.address; + bytes = io_req->reqs.pio_request.size; + in = (io_req->reqs.pio_request.direction == ACRN_IOREQ_DIR_READ); + printf("Guest VM %s PIO[%x] with size[%x]\n", in ? "read" : "write", port, bytes); + + notify.vmid = vmid; + notify.vcpu = vcpu_id; + ioctl(hsm_fd, ACRN_IOCTL_NOTIFY_REQUEST_FINISH, ¬ify); + } + } + } + + ret = ioctl(hsm_fd, ACRN_IOCTL_DESTROY_VM, NULL); + printf("Destroy VM! [%d]\n", ret); + close(hsm_fd); + free(guest_memory); + return 0; +} diff --git a/samples/auxdisplay/.gitignore b/samples/auxdisplay/.gitignore index 7af222860a96..d023816849bd 100644 --- a/samples/auxdisplay/.gitignore +++ b/samples/auxdisplay/.gitignore @@ -1 +1,2 @@ -cfag12864b-example +# SPDX-License-Identifier: GPL-2.0-only +/cfag12864b-example diff --git a/samples/auxdisplay/Makefile b/samples/auxdisplay/Makefile index 05e471feb6e5..19d5568938c3 100644 --- a/samples/auxdisplay/Makefile +++ b/samples/auxdisplay/Makefile @@ -1,9 +1,2 @@ -CC := $(CROSS_COMPILE)gcc -CFLAGS := -I../../usr/include - -PROGS := cfag12864b-example - -all: $(PROGS) - -clean: - rm -fr $(PROGS) +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += cfag12864b-example diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c index e7823ffb1ca0..2e3bb7375c99 100644 --- a/samples/auxdisplay/cfag12864b-example.c +++ b/samples/auxdisplay/cfag12864b-example.c @@ -1,25 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Filename: cfag12864b-example.c * Version: 0.1.0 * Description: cfag12864b LCD userspace example program - * License: GPLv2 * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org> * Date: 2006-10-31 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ /* @@ -259,7 +245,7 @@ int main(int argc, char *argv[]) if (argc != 2) { printf( - "Sintax: %s fbdev\n" + "Syntax: %s fbdev\n" "Usually: /dev/fb0, /dev/fb1...\n", argv[0]); return -1; } diff --git a/samples/binderfs/.gitignore b/samples/binderfs/.gitignore new file mode 100644 index 000000000000..8fa415a3640b --- /dev/null +++ b/samples/binderfs/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +/binderfs_example diff --git a/samples/binderfs/Makefile b/samples/binderfs/Makefile new file mode 100644 index 000000000000..629e43b9b129 --- /dev/null +++ b/samples/binderfs/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += binderfs_example + +userccflags += -I usr/include diff --git a/samples/binderfs/binderfs_example.c b/samples/binderfs/binderfs_example.c new file mode 100644 index 000000000000..0fd92cdda460 --- /dev/null +++ b/samples/binderfs/binderfs_example.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <linux/android/binder.h> +#include <linux/android/binderfs.h> + +int main(int argc, char *argv[]) +{ + int fd, ret, saved_errno; + struct binderfs_device device = { 0 }; + + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + fprintf(stderr, "%s - Failed to unshare mount namespace\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0); + if (ret < 0) { + fprintf(stderr, "%s - Failed to mount / as private\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = mkdir("/dev/binderfs", 0755); + if (ret < 0 && errno != EEXIST) { + fprintf(stderr, "%s - Failed to create binderfs mountpoint\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = mount(NULL, "/dev/binderfs", "binder", 0, 0); + if (ret < 0) { + fprintf(stderr, "%s - Failed to mount binderfs\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + memcpy(device.name, "my-binder", strlen("my-binder")); + + fd = open("/dev/binderfs/binder-control", O_RDONLY | O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "%s - Failed to open binder-control device\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = ioctl(fd, BINDER_CTL_ADD, &device); + saved_errno = errno; + close(fd); + errno = saved_errno; + if (ret < 0) { + fprintf(stderr, "%s - Failed to allocate new binder device\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + printf("Allocated new binder device with major %d, minor %d, and name %s\n", + device.major, device.minor, device.name); + + ret = unlink("/dev/binderfs/my-binder"); + if (ret < 0) { + fprintf(stderr, "%s - Failed to delete binder device\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Cleanup happens when the mount namespace dies. */ + exit(EXIT_SUCCESS); +} diff --git a/samples/blackfin/Makefile b/samples/blackfin/Makefile deleted file mode 100644 index 89b86cfd83a2..000000000000 --- a/samples/blackfin/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_SAMPLE_BLACKFIN_GPTIMERS) += gptimers-example.o diff --git a/samples/blackfin/gptimers-example.c b/samples/blackfin/gptimers-example.c deleted file mode 100644 index 283eba993d9d..000000000000 --- a/samples/blackfin/gptimers-example.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Simple gptimers example - * http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:drivers:gptimers - * - * Copyright 2007-2009 Analog Devices Inc. - * - * Licensed under the GPL-2 or later. - */ - -#include <linux/interrupt.h> -#include <linux/module.h> - -#include <asm/gptimers.h> -#include <asm/portmux.h> - -/* ... random driver includes ... */ - -#define DRIVER_NAME "gptimer_example" - -#ifdef IRQ_TIMER5 -#define SAMPLE_IRQ_TIMER IRQ_TIMER5 -#else -#define SAMPLE_IRQ_TIMER IRQ_TIMER2 -#endif - -struct gptimer_data { - uint32_t period, width; -}; -static struct gptimer_data data; - -/* ... random driver state ... */ - -static irqreturn_t gptimer_example_irq(int irq, void *dev_id) -{ - struct gptimer_data *data = dev_id; - - /* make sure it was our timer which caused the interrupt */ - if (!get_gptimer_intr(TIMER5_id)) - return IRQ_NONE; - - /* read the width/period values that were captured for the waveform */ - data->width = get_gptimer_pwidth(TIMER5_id); - data->period = get_gptimer_period(TIMER5_id); - - /* acknowledge the interrupt */ - clear_gptimer_intr(TIMER5_id); - - /* tell the upper layers we took care of things */ - return IRQ_HANDLED; -} - -/* ... random driver code ... */ - -static int __init gptimer_example_init(void) -{ - int ret; - - /* grab the peripheral pins */ - ret = peripheral_request(P_TMR5, DRIVER_NAME); - if (ret) { - printk(KERN_NOTICE DRIVER_NAME ": peripheral request failed\n"); - return ret; - } - - /* grab the IRQ for the timer */ - ret = request_irq(SAMPLE_IRQ_TIMER, gptimer_example_irq, - IRQF_SHARED, DRIVER_NAME, &data); - if (ret) { - printk(KERN_NOTICE DRIVER_NAME ": IRQ request failed\n"); - peripheral_free(P_TMR5); - return ret; - } - - /* setup the timer and enable it */ - set_gptimer_config(TIMER5_id, - WDTH_CAP | PULSE_HI | PERIOD_CNT | IRQ_ENA); - enable_gptimers(TIMER5bit); - - return 0; -} -module_init(gptimer_example_init); - -static void __exit gptimer_example_exit(void) -{ - disable_gptimers(TIMER5bit); - free_irq(SAMPLE_IRQ_TIMER, &data); - peripheral_free(P_TMR5); -} -module_exit(gptimer_example_exit); - -MODULE_LICENSE("BSD"); diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore new file mode 100644 index 000000000000..0002cd359fb1 --- /dev/null +++ b/samples/bpf/.gitignore @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: GPL-2.0-only +cpustat +fds_example +hbm +ibumad +lathist +lwt_len_hist +map_perf_test +offwaketime +per_socket_stats_example +sampleip +sock_example +sockex1 +sockex2 +sockex3 +spintest +syscall_nrs.h +syscall_tp +task_fd_query +tc_l2_redirect +test_cgrp2_array_pin +test_cgrp2_attach +test_cgrp2_attach2 +test_cgrp2_sock +test_cgrp2_sock2 +test_current_task_under_cgroup +test_lru_dist +test_map_in_map +test_overhead +test_probe_write_user +trace_event +trace_output +tracex1 +tracex2 +tracex3 +tracex4 +tracex5 +tracex6 +tracex7 +xdp_adjust_tail +xdp_fwd +xdp_router_ipv4 +xdp_tx_iptunnel +testfile.img +hbm_out.log +iperf.* +*.out +*.skel.h +/vmlinux.h +/bpftool/ +/libbpf/ diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 87246be6feb8..95a4fa1f1e44 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -1,180 +1,256 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o +# SPDX-License-Identifier: GPL-2.0 + +BPF_SAMPLES_PATH ?= $(abspath $(src)) +TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools + +pound := \# # List of programs to build -hostprogs-y := test_lru_dist -hostprogs-y += sock_example -hostprogs-y += fds_example -hostprogs-y += sockex1 -hostprogs-y += sockex2 -hostprogs-y += sockex3 -hostprogs-y += tracex1 -hostprogs-y += tracex2 -hostprogs-y += tracex3 -hostprogs-y += tracex4 -hostprogs-y += tracex5 -hostprogs-y += tracex6 -hostprogs-y += test_probe_write_user -hostprogs-y += trace_output -hostprogs-y += lathist -hostprogs-y += offwaketime -hostprogs-y += spintest -hostprogs-y += map_perf_test -hostprogs-y += test_overhead -hostprogs-y += test_cgrp2_array_pin -hostprogs-y += test_cgrp2_attach -hostprogs-y += test_cgrp2_attach2 -hostprogs-y += test_cgrp2_sock -hostprogs-y += test_cgrp2_sock2 -hostprogs-y += xdp1 -hostprogs-y += xdp2 -hostprogs-y += test_current_task_under_cgroup -hostprogs-y += trace_event -hostprogs-y += sampleip -hostprogs-y += tc_l2_redirect -hostprogs-y += lwt_len_hist -hostprogs-y += xdp_tx_iptunnel -hostprogs-y += test_map_in_map -hostprogs-y += per_socket_stats_example -hostprogs-y += load_sock_ops +tprogs-y := test_lru_dist +tprogs-y += sock_example +tprogs-y += fds_example +tprogs-y += sockex1 +tprogs-y += sockex2 +tprogs-y += sockex3 +tprogs-y += tracex1 +tprogs-y += tracex3 +tprogs-y += tracex4 +tprogs-y += tracex5 +tprogs-y += tracex6 +tprogs-y += trace_output +tprogs-y += lathist +tprogs-y += offwaketime +tprogs-y += spintest +tprogs-y += map_perf_test +tprogs-y += xdp_router_ipv4 +tprogs-y += trace_event +tprogs-y += sampleip +tprogs-y += tc_l2_redirect +tprogs-y += lwt_len_hist +tprogs-y += xdp_tx_iptunnel +tprogs-y += test_map_in_map +tprogs-y += per_socket_stats_example +tprogs-y += syscall_tp +tprogs-y += cpustat +tprogs-y += xdp_adjust_tail +tprogs-y += xdp_fwd +tprogs-y += task_fd_query +tprogs-y += ibumad +tprogs-y += hbm # Libbpf dependencies -LIBBPF := ../../tools/lib/bpf/bpf.o - -test_lru_dist-objs := test_lru_dist.o $(LIBBPF) -sock_example-objs := sock_example.o $(LIBBPF) -fds_example-objs := bpf_load.o $(LIBBPF) fds_example.o -sockex1-objs := bpf_load.o $(LIBBPF) sockex1_user.o -sockex2-objs := bpf_load.o $(LIBBPF) sockex2_user.o -sockex3-objs := bpf_load.o $(LIBBPF) sockex3_user.o -tracex1-objs := bpf_load.o $(LIBBPF) tracex1_user.o -tracex2-objs := bpf_load.o $(LIBBPF) tracex2_user.o -tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o -tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o -tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o -tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o -load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o -test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o -trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o -lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o -offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o -spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o -map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o -test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o -test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o -test_cgrp2_attach-objs := $(LIBBPF) test_cgrp2_attach.o -test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o cgroup_helpers.o -test_cgrp2_sock-objs := $(LIBBPF) test_cgrp2_sock.o -test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o -xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o -# reuse xdp1 source intentionally -xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o -test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) cgroup_helpers.o \ - test_current_task_under_cgroup_user.o -trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o -sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o -tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o -lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o -xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o -test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o -per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o +LIBBPF_SRC = $(TOOLS_PATH)/lib/bpf +LIBBPF_OUTPUT = $(abspath $(BPF_SAMPLES_PATH))/libbpf +LIBBPF_DESTDIR = $(LIBBPF_OUTPUT) +LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include +LIBBPF = $(LIBBPF_OUTPUT)/libbpf.a + +CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o +TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o +XDP_SAMPLE := xdp_sample_user.o + +fds_example-objs := fds_example.o +sockex1-objs := sockex1_user.o +sockex2-objs := sockex2_user.o +sockex3-objs := sockex3_user.o +tracex1-objs := tracex1_user.o $(TRACE_HELPERS) +tracex3-objs := tracex3_user.o +tracex4-objs := tracex4_user.o +tracex5-objs := tracex5_user.o $(TRACE_HELPERS) +tracex6-objs := tracex6_user.o +trace_output-objs := trace_output_user.o +lathist-objs := lathist_user.o +offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS) +spintest-objs := spintest_user.o $(TRACE_HELPERS) +map_perf_test-objs := map_perf_test_user.o +test_overhead-objs := test_overhead_user.o +trace_event-objs := trace_event_user.o $(TRACE_HELPERS) +sampleip-objs := sampleip_user.o $(TRACE_HELPERS) +tc_l2_redirect-objs := tc_l2_redirect_user.o +lwt_len_hist-objs := lwt_len_hist_user.o +xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o +test_map_in_map-objs := test_map_in_map_user.o +per_socket_stats_example-objs := cookie_uid_helper_example.o +syscall_tp-objs := syscall_tp_user.o +cpustat-objs := cpustat_user.o +xdp_adjust_tail-objs := xdp_adjust_tail_user.o +xdp_fwd-objs := xdp_fwd_user.o +task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS) +ibumad-objs := ibumad_user.o +hbm-objs := hbm.o $(CGROUP_HELPERS) + +xdp_router_ipv4-objs := xdp_router_ipv4_user.o $(XDP_SAMPLE) # Tell kbuild to always build the programs -always := $(hostprogs-y) -always += sockex1_kern.o -always += sockex2_kern.o -always += sockex3_kern.o -always += tracex1_kern.o -always += tracex2_kern.o -always += tracex3_kern.o -always += tracex4_kern.o -always += tracex5_kern.o -always += tracex6_kern.o -always += sock_flags_kern.o -always += test_probe_write_user_kern.o -always += trace_output_kern.o -always += tcbpf1_kern.o -always += tcbpf2_kern.o -always += tc_l2_redirect_kern.o -always += lathist_kern.o -always += offwaketime_kern.o -always += spintest_kern.o -always += map_perf_test_kern.o -always += test_overhead_tp_kern.o -always += test_overhead_kprobe_kern.o -always += parse_varlen.o parse_simple.o parse_ldabs.o -always += test_cgrp2_tc_kern.o -always += xdp1_kern.o -always += xdp2_kern.o -always += test_current_task_under_cgroup_kern.o -always += trace_event_kern.o -always += sampleip_kern.o -always += lwt_len_hist_kern.o -always += xdp_tx_iptunnel_kern.o -always += test_map_in_map_kern.o -always += cookie_uid_helper_example.o -always += tcp_synrto_kern.o -always += tcp_rwnd_kern.o -always += tcp_bufs_kern.o -always += tcp_cong_kern.o -always += tcp_iw_kern.o -always += tcp_clamp_kern.o - -HOSTCFLAGS += -I$(objtree)/usr/include -HOSTCFLAGS += -I$(srctree)/tools/lib/ -HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ -HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include -HOSTCFLAGS += -I$(srctree)/tools/perf - -HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable -HOSTLOADLIBES_fds_example += -lelf -HOSTLOADLIBES_sockex1 += -lelf -HOSTLOADLIBES_sockex2 += -lelf -HOSTLOADLIBES_sockex3 += -lelf -HOSTLOADLIBES_tracex1 += -lelf -HOSTLOADLIBES_tracex2 += -lelf -HOSTLOADLIBES_tracex3 += -lelf -HOSTLOADLIBES_tracex4 += -lelf -lrt -HOSTLOADLIBES_tracex5 += -lelf -HOSTLOADLIBES_tracex6 += -lelf -HOSTLOADLIBES_test_cgrp2_sock2 += -lelf -HOSTLOADLIBES_load_sock_ops += -lelf -HOSTLOADLIBES_test_probe_write_user += -lelf -HOSTLOADLIBES_trace_output += -lelf -lrt -HOSTLOADLIBES_lathist += -lelf -HOSTLOADLIBES_offwaketime += -lelf -HOSTLOADLIBES_spintest += -lelf -HOSTLOADLIBES_map_perf_test += -lelf -lrt -HOSTLOADLIBES_test_overhead += -lelf -lrt -HOSTLOADLIBES_xdp1 += -lelf -HOSTLOADLIBES_xdp2 += -lelf -HOSTLOADLIBES_test_current_task_under_cgroup += -lelf -HOSTLOADLIBES_trace_event += -lelf -HOSTLOADLIBES_sampleip += -lelf -HOSTLOADLIBES_tc_l2_redirect += -l elf -HOSTLOADLIBES_lwt_len_hist += -l elf -HOSTLOADLIBES_xdp_tx_iptunnel += -lelf -HOSTLOADLIBES_test_map_in_map += -lelf +always-y := $(tprogs-y) +always-y += sockex1_kern.o +always-y += sockex2_kern.o +always-y += sockex3_kern.o +always-y += tracex1.bpf.o +always-y += tracex3.bpf.o +always-y += tracex4.bpf.o +always-y += tracex5.bpf.o +always-y += tracex6.bpf.o +always-y += trace_output.bpf.o +always-y += tcbpf1_kern.o +always-y += tc_l2_redirect_kern.o +always-y += lathist_kern.o +always-y += offwaketime.bpf.o +always-y += spintest.bpf.o +always-y += map_perf_test.bpf.o +always-y += parse_varlen.o parse_simple.o parse_ldabs.o +always-y += trace_event_kern.o +always-y += sampleip_kern.o +always-y += lwt_len_hist.bpf.o +always-y += xdp_tx_iptunnel_kern.o +always-y += test_map_in_map.bpf.o +always-y += tcp_synrto_kern.o +always-y += tcp_rwnd_kern.o +always-y += tcp_bufs_kern.o +always-y += tcp_cong_kern.o +always-y += tcp_iw_kern.o +always-y += tcp_clamp_kern.o +always-y += tcp_basertt_kern.o +always-y += tcp_tos_reflect_kern.o +always-y += tcp_dumpstats_kern.o +always-y += xdp2skb_meta_kern.o +always-y += syscall_tp_kern.o +always-y += cpustat_kern.o +always-y += xdp_adjust_tail_kern.o +always-y += xdp_fwd_kern.o +always-y += task_fd_query_kern.o +always-y += ibumad_kern.o +always-y += hbm_out_kern.o +always-y += hbm_edt_kern.o + +COMMON_CFLAGS = $(TPROGS_USER_CFLAGS) +TPROGS_LDFLAGS = $(TPROGS_USER_LDFLAGS) + +ifeq ($(ARCH), arm) +# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux +# headers when arm instruction set identification is requested. +ARM_ARCH_SELECTOR := $(filter -D__LINUX_ARM_ARCH__%, $(KBUILD_CFLAGS)) +BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR) +TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR) +endif + +ifeq ($(ARCH), mips) +TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__ +ifdef CONFIG_MACH_LOONGSON64 +BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-loongson64 +BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic +endif +endif + +ifeq ($(ARCH), x86) +BPF_EXTRA_CFLAGS += -fcf-protection +endif + +COMMON_CFLAGS += -Wall -O2 +COMMON_CFLAGS += -Wmissing-prototypes +COMMON_CFLAGS += -Wstrict-prototypes +COMMON_CFLAGS += $(call try-run,\ + printf "int main() { return 0; }" |\ + $(CC) -Werror -fsanitize=bounds -x c - -o "$$TMP",-fsanitize=bounds,) + +TPROGS_CFLAGS += $(COMMON_CFLAGS) +TPROGS_CFLAGS += -I$(objtree)/usr/include +TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ +TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) +TPROGS_CFLAGS += -I$(srctree)/tools/include +TPROGS_CFLAGS += -I$(srctree)/tools/perf +TPROGS_CFLAGS += -I$(srctree)/tools/lib +TPROGS_CFLAGS += -DHAVE_ATTR_TEST=0 + +ifdef SYSROOT +COMMON_CFLAGS += --sysroot=$(SYSROOT) +TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib +endif + +TPROGS_LDLIBS += $(LIBBPF) -lelf -lz +TPROGLDLIBS_xdp_router_ipv4 += -lm -pthread +TPROGLDLIBS_tracex4 += -lrt +TPROGLDLIBS_trace_output += -lrt +TPROGLDLIBS_map_perf_test += -lrt # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: -# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +# make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang LLC ?= llc CLANG ?= clang +OPT ?= opt +LLVM_DIS ?= llvm-dis +LLVM_OBJCOPY ?= llvm-objcopy +LLVM_READELF ?= llvm-readelf +BTF_PAHOLE ?= pahole + +# Detect that we're cross compiling and use the cross compiler +ifdef CROSS_COMPILE +CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif + +# Don't evaluate probes and warnings if we need to run make recursively +ifneq ($(src),) +HDR_PROBE := $(shell printf "$(pound)include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \ + $(CC) $(TPROGS_CFLAGS) $(TPROGS_LDFLAGS) -x c - \ + -o /dev/null 2>/dev/null && echo okay) + +ifeq ($(HDR_PROBE),) +$(warning WARNING: Detected possible issues with include path.) +$(warning WARNING: Please install kernel headers locally (make headers_install).) +endif + +BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) +BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) +BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') +BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \ + $(CLANG) --target=bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \ + $(LLVM_READELF) -S ./llvm_btf_verify.o | grep BTF; \ + /bin/rm -f ./llvm_btf_verify.o) + +BPF_EXTRA_CFLAGS += -fno-stack-protector +ifneq ($(BTF_LLVM_PROBE),) + BPF_EXTRA_CFLAGS += -g +else +ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),) + BPF_EXTRA_CFLAGS += -g + LLC_FLAGS += -mattr=dwarfris + DWARF2BTF = y +endif +endif +endif # Trick to allow make to be run from this directory all: - $(MAKE) -C ../../ $(CURDIR)/ + $(MAKE) -C ../../ M=$(CURDIR) BPF_SAMPLES_PATH=$(CURDIR) clean: $(MAKE) -C ../../ M=$(CURDIR) clean - @rm -f *~ + @find $(CURDIR) -type f -name '*~' -delete + @$(RM) -r $(CURDIR)/libbpf $(CURDIR)/bpftool + +$(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) +# Fix up variables inherited from Kbuild that tools/ build system won't like + $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \ + LDFLAGS="$(TPROGS_LDFLAGS)" srctree=$(BPF_SAMPLES_PATH)/../../ \ + O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ + $@ install_headers -$(obj)/syscall_nrs.s: $(src)/syscall_nrs.c - $(call if_changed_dep,cc_s_c) +BPFTOOLDIR := $(TOOLS_PATH)/bpf/bpftool +BPFTOOL_OUTPUT := $(abspath $(BPF_SAMPLES_PATH))/bpftool +DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool +BPFTOOL ?= $(DEFAULT_BPFTOOL) +$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) | $(BPFTOOL_OUTPUT) + $(MAKE) -C $(BPFTOOLDIR) srctree=$(BPF_SAMPLES_PATH)/../../ \ + OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap + +$(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ $(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE $(call filechk,offsets,__SYSCALL_NRS_H__) +targets += syscall_nrs.s clean-files += syscall_nrs.h FORCE: @@ -198,19 +274,119 @@ verify_target_bpf: verify_cmds exit 2; \ else true; fi -$(src)/*.c: verify_target_bpf +$(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) +$(src)/*.c: verify_target_bpf $(LIBBPF) + +libbpf_hdrs: $(LIBBPF) +$(obj)/$(TRACE_HELPERS) $(obj)/$(CGROUP_HELPERS) $(obj)/$(XDP_SAMPLE): | libbpf_hdrs + +.PHONY: libbpf_hdrs + +$(obj)/xdp_router_ipv4_user.o: $(obj)/xdp_router_ipv4.skel.h + +$(obj)/tracex5.bpf.o: $(obj)/syscall_nrs.h +$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h +$(obj)/hbm.o: $(src)/hbm.h +$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h + +# Override includes for xdp_sample_user.o because $(srctree)/usr/include in +# TPROGS_CFLAGS causes conflicts +XDP_SAMPLE_CFLAGS += -Wall -O2 \ + -I$(src)/../../tools/include \ + -I$(src)/../../tools/include/uapi \ + -I$(LIBBPF_INCLUDE) \ + -I$(src)/../../tools/testing/selftests/bpf + +$(obj)/$(XDP_SAMPLE): TPROGS_CFLAGS = $(XDP_SAMPLE_CFLAGS) $(TPROGS_USER_CFLAGS) +$(obj)/$(XDP_SAMPLE): $(src)/xdp_sample_user.h $(src)/xdp_sample_shared.h +# Override includes for trace_helpers.o because __must_check won't be defined +# in our include path. +$(obj)/$(TRACE_HELPERS): TPROGS_CFLAGS := $(TPROGS_CFLAGS) -D__must_check= + +-include $(BPF_SAMPLES_PATH)/Makefile.target + +VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ + $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ + $(abspath $(objtree)/vmlinux) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +$(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) +ifeq ($(VMLINUX_H),) +ifeq ($(VMLINUX_BTF),) + $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)",\ + build the kernel or set VMLINUX_BTF like "VMLINUX_BTF=/sys/kernel/btf/vmlinux" or VMLINUX_H variable) +endif + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif + +clean-files += vmlinux.h + +# Get Clang's default includes on this system, as opposed to those seen by +# '--target=bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +define get_sys_includes +$(shell $(1) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/') +endef + +CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) + +$(obj)/xdp_router_ipv4.bpf.o: $(obj)/xdp_sample.bpf.o + +$(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/xdp_sample_shared.h + @echo " CLANG-BPF " $@ + $(Q)$(CLANG) -g -O2 --target=bpf -D__TARGET_ARCH_$(SRCARCH) \ + -Wno-compare-distinct-pointer-types -I$(srctree)/include \ + -I$(srctree)/samples/bpf -I$(srctree)/tools/include \ + -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ + -c $(filter %.bpf.c,$^) -o $@ + +LINKED_SKELS := xdp_router_ipv4.skel.h +clean-files += $(LINKED_SKELS) + +xdp_router_ipv4.skel.h-deps := xdp_router_ipv4.bpf.o xdp_sample.bpf.o + +LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) + +BPF_SRCS_LINKED := $(notdir $(wildcard $(src)/*.bpf.c)) +BPF_OBJS_LINKED := $(patsubst %.bpf.c,$(obj)/%.bpf.o, $(BPF_SRCS_LINKED)) +BPF_SKELS_LINKED := $(addprefix $(obj)/,$(LINKED_SKELS)) -$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(BPF_SKELS_LINKED): $(BPF_OBJS_LINKED) $(BPFTOOL) + @echo " BPF GEN-OBJ " $(@:.skel.h=) + $(Q)$(BPFTOOL) gen object $(@:.skel.h=.lbpf.o) $(addprefix $(obj)/,$($(@F)-deps)) + @echo " BPF GEN-SKEL" $(@:.skel.h=) + $(Q)$(BPFTOOL) gen skeleton $(@:.skel.h=.lbpf.o) name $(notdir $(@:.skel.h=)) > $@ # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is # useless for BPF samples. +# below we use long chain of commands, clang | opt | llvm-dis | llc, +# to generate final object file. 'clang' compiles the source into IR +# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin +# processing (llvm12) and IR optimizations. 'llvm-dis' converts +# 'opt' output to IR, and finally 'llc' generates bpf byte code. $(obj)/%.o: $(src)/%.c - $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \ - -I$(srctree)/tools/testing/selftests/bpf/ \ - -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ - -Wno-compare-distinct-pointer-types \ + @echo " CLANG-bpf " $@ + $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ + -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \ + -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ + -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ + -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ - -Wno-unknown-warning-option \ - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ + -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ + -fno-asynchronous-unwind-tables \ + -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ + -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ + $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ +ifeq ($(DWARF2BTF),y) + $(BTF_PAHOLE) -J $@ +endif diff --git a/samples/bpf/Makefile.target b/samples/bpf/Makefile.target new file mode 100644 index 000000000000..7621f55e2947 --- /dev/null +++ b/samples/bpf/Makefile.target @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0 +# ========================================================================== +# Building binaries on the host system +# Binaries are not used during the compilation of the kernel, and intended +# to be build for target board, target board can be host of course. Added to +# build binaries to run not on host system. +# +# Sample syntax +# tprogs-y := xsk_example +# Will compile xsk_example.c and create an executable named xsk_example +# +# tprogs-y := xdpsock +# xdpsock-objs := xdpsock_1.o xdpsock_2.o +# Will compile xdpsock_1.c and xdpsock_2.c, and then link the executable +# xdpsock, based on xdpsock_1.o and xdpsock_2.o +# +# Derived from scripts/Makefile.host +# +__tprogs := $(sort $(tprogs-y)) + +# C code +# Executables compiled from a single .c file +tprog-csingle := $(foreach m,$(__tprogs), \ + $(if $($(m)-objs),,$(m))) + +# C executables linked based on several .o files +tprog-cmulti := $(foreach m,$(__tprogs),\ + $(if $($(m)-objs),$(m))) + +# Object (.o) files compiled from .c files +tprog-cobjs := $(sort $(foreach m,$(__tprogs),$($(m)-objs))) + +tprog-csingle := $(addprefix $(obj)/,$(tprog-csingle)) +tprog-cmulti := $(addprefix $(obj)/,$(tprog-cmulti)) +tprog-cobjs := $(addprefix $(obj)/,$(tprog-cobjs)) + +##### +# Handle options to gcc. Support building with separate output directory + +_tprogc_flags = $(TPROGS_CFLAGS) \ + $(TPROGCFLAGS_$(basetarget).o) + +# $(objtree)/$(obj) for including generated headers from checkin source files +ifeq ($(KBUILD_EXTMOD),) +ifdef building_out_of_srctree +_tprogc_flags += -I $(objtree)/$(obj) +endif +endif + +tprogc_flags = -Wp,-MD,$(depfile) $(_tprogc_flags) + +# Create executable from a single .c file +# tprog-csingle -> Executable +quiet_cmd_tprog-csingle = CC $@ + cmd_tprog-csingle = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ $< \ + $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F)) +$(tprog-csingle): $(obj)/%: $(src)/%.c FORCE + $(call if_changed_dep,tprog-csingle) + +# Link an executable based on list of .o files, all plain c +# tprog-cmulti -> executable +quiet_cmd_tprog-cmulti = LD $@ + cmd_tprog-cmulti = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ \ + $(addprefix $(obj)/,$($(@F)-objs)) \ + $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F)) +$(tprog-cmulti): $(tprog-cobjs) FORCE + $(call if_changed,tprog-cmulti) +$(call multi_depend, $(tprog-cmulti), , -objs) + +# Create .o file from a single .c file +# tprog-cobjs -> .o +quiet_cmd_tprog-cobjs = CC $@ + cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $< +$(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE + $(call if_changed_dep,tprog-cobjs) diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index 79f9a58f1872..cabe2d216997 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -4,15 +4,39 @@ eBPF sample programs This directory contains a test stubs, verifier test-suite and examples for using eBPF. The examples use libbpf from tools/lib/bpf. +Note that the XDP-specific samples have been removed from this directory and +moved to the xdp-tools repository: https://github.com/xdp-project/xdp-tools +See the commit messages removing each tool from this directory for how to +convert specific command invocations between the old samples and the utilities +in xdp-tools. + Build dependencies ================== Compiling requires having installed: - * clang >= version 3.4.0 - * llvm >= version 3.7.1 + * clang + * llvm + * pahole + +Consult :ref:`Documentation/process/changes.rst <changes>` for the minimum +version numbers required and how to update them. Note that LLVM's tool +'llc' must support target 'bpf', list version and supported targets with +command: ``llc --version`` + +Clean and configuration +----------------------- + +It can be needed to clean tools, samples or kernel before trying new arch or +after some changes (on demand):: -Note that LLVM's tool 'llc' must support target 'bpf', list version -and supported targets with command: ``llc --version`` + make -C tools clean + make -C samples/bpf clean + make clean + +Configure kernel, defconfig for instance +(see "tools/testing/selftests/bpf/config" for a reference config):: + + make defconfig Kernel headers -------------- @@ -23,8 +47,8 @@ user, simply call:: make headers_install -This will creates a local "usr/include" directory in the git/build top -level directory, that the make system automatically pickup first. +This will create a local "usr/include" directory in the git/build top +level directory, that the make system will automatically pick up first. Compiling ========= @@ -32,12 +56,10 @@ Compiling For building the BPF samples, issue the below command from the kernel top level directory:: - make samples/bpf/ - -Do notice the "/" slash after the directory name. + make M=samples/bpf It is also possible to call make from this directory. This will just -hide the the invocation of make as above with the appended "/". +hide the invocation of make as above. Manually compiling LLVM with 'bpf' support ------------------------------------------ @@ -50,17 +72,50 @@ To generate a smaller llc binary one can use:: -DLLVM_TARGETS_TO_BUILD="BPF" +We recommend that developers who want the fastest incremental builds +use the Ninja build system, you can find it in your system's package +manager, usually the package is ninja or ninja-build. + Quick sniplet for manually compiling LLVM and clang -(build dependencies are cmake and gcc-c++):: +(build dependencies are ninja, cmake and gcc-c++):: - $ git clone http://llvm.org/git/llvm.git - $ cd llvm/tools - $ git clone --depth 1 http://llvm.org/git/clang.git - $ cd ..; mkdir build; cd build - $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" - $ make -j $(getconf _NPROCESSORS_ONLN) + $ git clone https://github.com/llvm/llvm-project.git + $ mkdir -p llvm-project/llvm/build + $ cd llvm-project/llvm/build + $ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_BUILD_RUNTIME=OFF + $ ninja It is also possible to point make to the newly compiled 'llc' or 'clang' command via redefining LLC or CLANG on the make command line:: - make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang + make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang + +Cross compiling samples +----------------------- +In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH +environment variables before calling make. But do this before clean, +configuration and header install steps described above. This will direct make to +build samples for the cross target:: + + export ARCH=arm64 + export CROSS_COMPILE="aarch64-linux-gnu-" + +Headers can be also installed on RFS of target board if need to keep them in +sync (not necessarily and it creates a local "usr/include" directory also):: + + make INSTALL_HDR_PATH=~/some_sysroot/usr headers_install + +Pointing LLC and CLANG is not necessarily if it's installed on HOST and have +in its targets appropriate arm64 arch (usually it has several arches). +Build samples:: + + make M=samples/bpf + +Or build samples with SYSROOT if some header or library is absent in toolchain, +say libelf, providing address to file system containing headers and libs, +can be RFS of target board:: + + make M=samples/bpf SYSROOT=~/some_sysroot diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h new file mode 100644 index 000000000000..634e81d83efd --- /dev/null +++ b/samples/bpf/asm_goto_workaround.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Facebook */ +#ifndef __ASM_GOTO_WORKAROUND_H +#define __ASM_GOTO_WORKAROUND_H + +/* + * This will bring in asm_goto_output and asm_inline macro definitions + * if enabled by compiler and config options. + */ +#include <linux/types.h> + +#ifdef asm_goto_output +#undef asm_goto_output +#define asm_goto_output(x...) asm volatile("invalid use of asm_goto_output") +#endif + +/* + * asm_inline is defined as asm __inline in "include/linux/compiler_types.h" + * if supported by the kernel's CC (i.e CONFIG_CC_HAS_ASM_INLINE) which is not + * supported by CLANG. + */ +#ifdef asm_inline +#undef asm_inline +#define asm_inline asm +#endif + +#define volatile(x...) volatile("") +#endif diff --git a/samples/bpf/libbpf.h b/samples/bpf/bpf_insn.h index 8ab36a04c174..29c3bb6ad1cd 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/bpf_insn.h @@ -1,8 +1,7 @@ -/* eBPF mini library */ -#ifndef __LIBBPF_H -#define __LIBBPF_H - -#include <bpf/bpf.h> +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* eBPF instruction mini library */ +#ifndef __BPF_INSN_H +#define __BPF_INSN_H struct bpf_insn; @@ -135,15 +134,31 @@ struct bpf_insn; .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ - -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ - ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) + */ + +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ @@ -165,6 +180,16 @@ struct bpf_insn; .off = OFF, \ .imm = 0 }) +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ @@ -175,6 +200,16 @@ struct bpf_insn; .off = OFF, \ .imm = IMM }) +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + /* Raw code statement block */ #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c deleted file mode 100644 index 899f40310bc3..000000000000 --- a/samples/bpf/bpf_load.c +++ /dev/null @@ -1,777 +0,0 @@ -#include <stdio.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <libelf.h> -#include <gelf.h> -#include <errno.h> -#include <unistd.h> -#include <string.h> -#include <stdbool.h> -#include <stdlib.h> -#include <linux/bpf.h> -#include <linux/filter.h> -#include <linux/perf_event.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> -#include <linux/types.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/syscall.h> -#include <sys/ioctl.h> -#include <sys/mman.h> -#include <poll.h> -#include <ctype.h> -#include <assert.h> -#include "libbpf.h" -#include "bpf_load.h" -#include "perf-sys.h" - -#define DEBUGFS "/sys/kernel/debug/tracing/" - -static char license[128]; -static int kern_version; -static bool processed_sec[128]; -char bpf_log_buf[BPF_LOG_BUF_SIZE]; -int map_fd[MAX_MAPS]; -int prog_fd[MAX_PROGS]; -int event_fd[MAX_PROGS]; -int prog_cnt; -int prog_array_fd = -1; - -struct bpf_map_data map_data[MAX_MAPS]; -int map_data_count = 0; - -static int populate_prog_array(const char *event, int prog_fd) -{ - int ind = atoi(event), err; - - err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); - if (err < 0) { - printf("failed to store prog_fd in prog_array\n"); - return -1; - } - return 0; -} - -static int load_and_attach(const char *event, struct bpf_insn *prog, int size) -{ - bool is_socket = strncmp(event, "socket", 6) == 0; - bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; - bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; - bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; - bool is_xdp = strncmp(event, "xdp", 3) == 0; - bool is_perf_event = strncmp(event, "perf_event", 10) == 0; - bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; - bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; - bool is_sockops = strncmp(event, "sockops", 7) == 0; - size_t insns_cnt = size / sizeof(struct bpf_insn); - enum bpf_prog_type prog_type; - char buf[256]; - int fd, efd, err, id; - struct perf_event_attr attr = {}; - - attr.type = PERF_TYPE_TRACEPOINT; - attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; - - if (is_socket) { - prog_type = BPF_PROG_TYPE_SOCKET_FILTER; - } else if (is_kprobe || is_kretprobe) { - prog_type = BPF_PROG_TYPE_KPROBE; - } else if (is_tracepoint) { - prog_type = BPF_PROG_TYPE_TRACEPOINT; - } else if (is_xdp) { - prog_type = BPF_PROG_TYPE_XDP; - } else if (is_perf_event) { - prog_type = BPF_PROG_TYPE_PERF_EVENT; - } else if (is_cgroup_skb) { - prog_type = BPF_PROG_TYPE_CGROUP_SKB; - } else if (is_cgroup_sk) { - prog_type = BPF_PROG_TYPE_CGROUP_SOCK; - } else if (is_sockops) { - prog_type = BPF_PROG_TYPE_SOCK_OPS; - } else { - printf("Unknown event '%s'\n", event); - return -1; - } - - fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, - bpf_log_buf, BPF_LOG_BUF_SIZE); - if (fd < 0) { - printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); - return -1; - } - - prog_fd[prog_cnt++] = fd; - - if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) - return 0; - - if (is_socket || is_sockops) { - if (is_socket) - event += 6; - else - event += 7; - if (*event != '/') - return 0; - event++; - if (!isdigit(*event)) { - printf("invalid prog number\n"); - return -1; - } - return populate_prog_array(event, fd); - } - - if (is_kprobe || is_kretprobe) { - if (is_kprobe) - event += 7; - else - event += 10; - - if (*event == 0) { - printf("event name cannot be empty\n"); - return -1; - } - - if (isdigit(*event)) - return populate_prog_array(event, fd); - - snprintf(buf, sizeof(buf), - "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", - is_kprobe ? 'p' : 'r', event, event); - err = system(buf); - if (err < 0) { - printf("failed to create kprobe '%s' error '%s'\n", - event, strerror(errno)); - return -1; - } - - strcpy(buf, DEBUGFS); - strcat(buf, "events/kprobes/"); - strcat(buf, event); - strcat(buf, "/id"); - } else if (is_tracepoint) { - event += 11; - - if (*event == 0) { - printf("event name cannot be empty\n"); - return -1; - } - strcpy(buf, DEBUGFS); - strcat(buf, "events/"); - strcat(buf, event); - strcat(buf, "/id"); - } - - efd = open(buf, O_RDONLY, 0); - if (efd < 0) { - printf("failed to open event %s\n", event); - return -1; - } - - err = read(efd, buf, sizeof(buf)); - if (err < 0 || err >= sizeof(buf)) { - printf("read from '%s' failed '%s'\n", event, strerror(errno)); - return -1; - } - - close(efd); - - buf[err] = 0; - id = atoi(buf); - attr.config = id; - - efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); - if (efd < 0) { - printf("event %d fd %d err %s\n", id, efd, strerror(errno)); - return -1; - } - event_fd[prog_cnt - 1] = efd; - ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); - ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); - - return 0; -} - -static int load_maps(struct bpf_map_data *maps, int nr_maps, - fixup_map_cb fixup_map) -{ - int i; - - for (i = 0; i < nr_maps; i++) { - if (fixup_map) { - fixup_map(&maps[i], i); - /* Allow userspace to assign map FD prior to creation */ - if (maps[i].fd != -1) { - map_fd[i] = maps[i].fd; - continue; - } - } - - if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || - maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { - int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; - - map_fd[i] = bpf_create_map_in_map(maps[i].def.type, - maps[i].def.key_size, - inner_map_fd, - maps[i].def.max_entries, - maps[i].def.map_flags); - } else { - map_fd[i] = bpf_create_map(maps[i].def.type, - maps[i].def.key_size, - maps[i].def.value_size, - maps[i].def.max_entries, - maps[i].def.map_flags); - } - if (map_fd[i] < 0) { - printf("failed to create a map: %d %s\n", - errno, strerror(errno)); - return 1; - } - maps[i].fd = map_fd[i]; - - if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) - prog_array_fd = map_fd[i]; - } - return 0; -} - -static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, - GElf_Shdr *shdr, Elf_Data **data) -{ - Elf_Scn *scn; - - scn = elf_getscn(elf, i); - if (!scn) - return 1; - - if (gelf_getshdr(scn, shdr) != shdr) - return 2; - - *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); - if (!*shname || !shdr->sh_size) - return 3; - - *data = elf_getdata(scn, 0); - if (!*data || elf_getdata(scn, *data) != NULL) - return 4; - - return 0; -} - -static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, - GElf_Shdr *shdr, struct bpf_insn *insn, - struct bpf_map_data *maps, int nr_maps) -{ - int i, nrels; - - nrels = shdr->sh_size / shdr->sh_entsize; - - for (i = 0; i < nrels; i++) { - GElf_Sym sym; - GElf_Rel rel; - unsigned int insn_idx; - bool match = false; - int j, map_idx; - - gelf_getrel(data, i, &rel); - - insn_idx = rel.r_offset / sizeof(struct bpf_insn); - - gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); - - if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { - printf("invalid relo for insn[%d].code 0x%x\n", - insn_idx, insn[insn_idx].code); - return 1; - } - insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; - - /* Match FD relocation against recorded map_data[] offset */ - for (map_idx = 0; map_idx < nr_maps; map_idx++) { - if (maps[map_idx].elf_offset == sym.st_value) { - match = true; - break; - } - } - if (match) { - insn[insn_idx].imm = maps[map_idx].fd; - } else { - printf("invalid relo for insn[%d] no map_data match\n", - insn_idx); - return 1; - } - } - - return 0; -} - -static int cmp_symbols(const void *l, const void *r) -{ - const GElf_Sym *lsym = (const GElf_Sym *)l; - const GElf_Sym *rsym = (const GElf_Sym *)r; - - if (lsym->st_value < rsym->st_value) - return -1; - else if (lsym->st_value > rsym->st_value) - return 1; - else - return 0; -} - -static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, - Elf *elf, Elf_Data *symbols, int strtabidx) -{ - int map_sz_elf, map_sz_copy; - bool validate_zero = false; - Elf_Data *data_maps; - int i, nr_maps; - GElf_Sym *sym; - Elf_Scn *scn; - int copy_sz; - - if (maps_shndx < 0) - return -EINVAL; - if (!symbols) - return -EINVAL; - - /* Get data for maps section via elf index */ - scn = elf_getscn(elf, maps_shndx); - if (scn) - data_maps = elf_getdata(scn, NULL); - if (!scn || !data_maps) { - printf("Failed to get Elf_Data from maps section %d\n", - maps_shndx); - return -EINVAL; - } - - /* For each map get corrosponding symbol table entry */ - sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); - for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { - assert(nr_maps < MAX_MAPS+1); - if (!gelf_getsym(symbols, i, &sym[nr_maps])) - continue; - if (sym[nr_maps].st_shndx != maps_shndx) - continue; - /* Only increment iif maps section */ - nr_maps++; - } - - /* Align to map_fd[] order, via sort on offset in sym.st_value */ - qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); - - /* Keeping compatible with ELF maps section changes - * ------------------------------------------------ - * The program size of struct bpf_map_def is known by loader - * code, but struct stored in ELF file can be different. - * - * Unfortunately sym[i].st_size is zero. To calculate the - * struct size stored in the ELF file, assume all struct have - * the same size, and simply divide with number of map - * symbols. - */ - map_sz_elf = data_maps->d_size / nr_maps; - map_sz_copy = sizeof(struct bpf_map_def); - if (map_sz_elf < map_sz_copy) { - /* - * Backward compat, loading older ELF file with - * smaller struct, keeping remaining bytes zero. - */ - map_sz_copy = map_sz_elf; - } else if (map_sz_elf > map_sz_copy) { - /* - * Forward compat, loading newer ELF file with larger - * struct with unknown features. Assume zero means - * feature not used. Thus, validate rest of struct - * data is zero. - */ - validate_zero = true; - } - - /* Memcpy relevant part of ELF maps data to loader maps */ - for (i = 0; i < nr_maps; i++) { - unsigned char *addr, *end; - struct bpf_map_def *def; - const char *map_name; - size_t offset; - - map_name = elf_strptr(elf, strtabidx, sym[i].st_name); - maps[i].name = strdup(map_name); - if (!maps[i].name) { - printf("strdup(%s): %s(%d)\n", map_name, - strerror(errno), errno); - free(sym); - return -errno; - } - - /* Symbol value is offset into ELF maps section data area */ - offset = sym[i].st_value; - def = (struct bpf_map_def *)(data_maps->d_buf + offset); - maps[i].elf_offset = offset; - memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); - memcpy(&maps[i].def, def, map_sz_copy); - - /* Verify no newer features were requested */ - if (validate_zero) { - addr = (unsigned char*) def + map_sz_copy; - end = (unsigned char*) def + map_sz_elf; - for (; addr < end; addr++) { - if (*addr != 0) { - free(sym); - return -EFBIG; - } - } - } - } - - free(sym); - return nr_maps; -} - -static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) -{ - int fd, i, ret, maps_shndx = -1, strtabidx = -1; - Elf *elf; - GElf_Ehdr ehdr; - GElf_Shdr shdr, shdr_prog; - Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; - char *shname, *shname_prog; - int nr_maps = 0; - - /* reset global variables */ - kern_version = 0; - memset(license, 0, sizeof(license)); - memset(processed_sec, 0, sizeof(processed_sec)); - - if (elf_version(EV_CURRENT) == EV_NONE) - return 1; - - fd = open(path, O_RDONLY, 0); - if (fd < 0) - return 1; - - elf = elf_begin(fd, ELF_C_READ, NULL); - - if (!elf) - return 1; - - if (gelf_getehdr(elf, &ehdr) != &ehdr) - return 1; - - /* clear all kprobes */ - i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events"); - - /* scan over all elf sections to get license and map info */ - for (i = 1; i < ehdr.e_shnum; i++) { - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (0) /* helpful for llvm debugging */ - printf("section %d:%s data %p size %zd link %d flags %d\n", - i, shname, data->d_buf, data->d_size, - shdr.sh_link, (int) shdr.sh_flags); - - if (strcmp(shname, "license") == 0) { - processed_sec[i] = true; - memcpy(license, data->d_buf, data->d_size); - } else if (strcmp(shname, "version") == 0) { - processed_sec[i] = true; - if (data->d_size != sizeof(int)) { - printf("invalid size of version section %zd\n", - data->d_size); - return 1; - } - memcpy(&kern_version, data->d_buf, sizeof(int)); - } else if (strcmp(shname, "maps") == 0) { - int j; - - maps_shndx = i; - data_maps = data; - for (j = 0; j < MAX_MAPS; j++) - map_data[j].fd = -1; - } else if (shdr.sh_type == SHT_SYMTAB) { - strtabidx = shdr.sh_link; - symbols = data; - } - } - - ret = 1; - - if (!symbols) { - printf("missing SHT_SYMTAB section\n"); - goto done; - } - - if (data_maps) { - nr_maps = load_elf_maps_section(map_data, maps_shndx, - elf, symbols, strtabidx); - if (nr_maps < 0) { - printf("Error: Failed loading ELF maps (errno:%d):%s\n", - nr_maps, strerror(-nr_maps)); - ret = 1; - goto done; - } - if (load_maps(map_data, nr_maps, fixup_map)) - goto done; - map_data_count = nr_maps; - - processed_sec[maps_shndx] = true; - } - - /* process all relo sections, and rewrite bpf insns for maps */ - for (i = 1; i < ehdr.e_shnum; i++) { - if (processed_sec[i]) - continue; - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (shdr.sh_type == SHT_REL) { - struct bpf_insn *insns; - - /* locate prog sec that need map fixup (relocations) */ - if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, - &shdr_prog, &data_prog)) - continue; - - if (shdr_prog.sh_type != SHT_PROGBITS || - !(shdr_prog.sh_flags & SHF_EXECINSTR)) - continue; - - insns = (struct bpf_insn *) data_prog->d_buf; - processed_sec[i] = true; /* relo section */ - - if (parse_relo_and_apply(data, symbols, &shdr, insns, - map_data, nr_maps)) - continue; - } - } - - /* load programs */ - for (i = 1; i < ehdr.e_shnum; i++) { - - if (processed_sec[i]) - continue; - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (memcmp(shname, "kprobe/", 7) == 0 || - memcmp(shname, "kretprobe/", 10) == 0 || - memcmp(shname, "tracepoint/", 11) == 0 || - memcmp(shname, "xdp", 3) == 0 || - memcmp(shname, "perf_event", 10) == 0 || - memcmp(shname, "socket", 6) == 0 || - memcmp(shname, "cgroup/", 7) == 0 || - memcmp(shname, "sockops", 7) == 0) { - ret = load_and_attach(shname, data->d_buf, - data->d_size); - if (ret != 0) - goto done; - } - } - - ret = 0; -done: - close(fd); - return ret; -} - -int load_bpf_file(char *path) -{ - return do_load_bpf_file(path, NULL); -} - -int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) -{ - return do_load_bpf_file(path, fixup_map); -} - -void read_trace_pipe(void) -{ - int trace_fd; - - trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); - if (trace_fd < 0) - return; - - while (1) { - static char buf[4096]; - ssize_t sz; - - sz = read(trace_fd, buf, sizeof(buf)); - if (sz > 0) { - buf[sz] = 0; - puts(buf); - } - } -} - -#define MAX_SYMS 300000 -static struct ksym syms[MAX_SYMS]; -static int sym_cnt; - -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; -} - -int load_kallsyms(void) -{ - FILE *f = fopen("/proc/kallsyms", "r"); - char func[256], buf[256]; - char symbol; - void *addr; - int i = 0; - - if (!f) - return -ENOENT; - - while (!feof(f)) { - if (!fgets(buf, sizeof(buf), f)) - break; - if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) - break; - if (!addr) - continue; - syms[i].addr = (long) addr; - syms[i].name = strdup(func); - i++; - } - sym_cnt = i; - qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp); - return 0; -} - -struct ksym *ksym_search(long key) -{ - int start = 0, end = sym_cnt; - int result; - - while (start < end) { - size_t mid = start + (end - start) / 2; - - result = key - syms[mid].addr; - if (result < 0) - end = mid; - else if (result > 0) - start = mid + 1; - else - return &syms[mid]; - } - - if (start >= 1 && syms[start - 1].addr < key && - key < syms[start].addr) - /* valid ksym */ - return &syms[start - 1]; - - /* out of range. return _stext */ - return &syms[0]; -} - -int set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - struct sockaddr_nl sa; - int sock, seq = 0, len, ret = -1; - char buf[4096]; - struct nlattr *nla, *nla_xdp; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; - struct nlmsghdr *nh; - struct nlmsgerr *err; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; - } - - if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - req.nh.nlmsg_type = RTM_SETLINK; - req.nh.nlmsg_pid = 0; - req.nh.nlmsg_seq = ++seq; - req.ifinfo.ifi_family = AF_UNSPEC; - req.ifinfo.ifi_index = ifindex; - - /* started nested attribute for XDP */ - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; - nla->nla_len = NLA_HDRLEN; - - /* add XDP fd */ - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); - memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); - nla->nla_len += nla_xdp->nla_len; - - /* if user passed in any flags, add those too */ - if (flags) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); - memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); - nla->nla_len += nla_xdp->nla_len; - } - - req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); - - if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { - printf("send to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - len = recv(sock, buf, sizeof(buf), 0); - if (len < 0) { - printf("recv from netlink: %s\n", strerror(errno)); - goto cleanup; - } - - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); - nh = NLMSG_NEXT(nh, len)) { - if (nh->nlmsg_pid != getpid()) { - printf("Wrong pid %d, expected %d\n", - nh->nlmsg_pid, getpid()); - goto cleanup; - } - if (nh->nlmsg_seq != seq) { - printf("Wrong seq %d, expected %d\n", - nh->nlmsg_seq, seq); - goto cleanup; - } - switch (nh->nlmsg_type) { - case NLMSG_ERROR: - err = (struct nlmsgerr *)NLMSG_DATA(nh); - if (!err->error) - continue; - printf("nlmsg error %s\n", strerror(-err->error)); - goto cleanup; - case NLMSG_DONE: - break; - } - } - - ret = 0; - -cleanup: - close(sock); - return ret; -} diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h deleted file mode 100644 index ca0563d04744..000000000000 --- a/samples/bpf/bpf_load.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef __BPF_LOAD_H -#define __BPF_LOAD_H - -#include "libbpf.h" - -#define MAX_MAPS 32 -#define MAX_PROGS 32 - -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; - unsigned int inner_map_idx; -}; - -struct bpf_map_data { - int fd; - char *name; - size_t elf_offset; - struct bpf_map_def def; -}; - -typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); - -extern int prog_fd[MAX_PROGS]; -extern int event_fd[MAX_PROGS]; -extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; -extern int prog_cnt; - -/* There is a one-to-one mapping between map_fd[] and map_data[]. - * The map_data[] just contains more rich info on the given map. - */ -extern int map_fd[MAX_MAPS]; -extern struct bpf_map_data map_data[MAX_MAPS]; -extern int map_data_count; - -/* parses elf file compiled by llvm .c->.o - * . parses 'maps' section and creates maps via BPF syscall - * . parses 'license' section and passes it to syscall - * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by - * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD - * . loads eBPF programs via BPF syscall - * - * One ELF file can contain multiple BPF programs which will be loaded - * and their FDs stored stored in prog_fd array - * - * returns zero on success - */ -int load_bpf_file(char *path); -int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); - -void read_trace_pipe(void); -struct ksym { - long addr; - char *name; -}; - -int load_kallsyms(void); -struct ksym *ksym_search(long key); -int set_link_xdp_fd(int ifindex, int fd, __u32 flags); -#endif diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c deleted file mode 100644 index 9d1be9426401..000000000000 --- a/samples/bpf/cgroup_helpers.c +++ /dev/null @@ -1,177 +0,0 @@ -#define _GNU_SOURCE -#include <sched.h> -#include <sys/mount.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <linux/limits.h> -#include <stdio.h> -#include <linux/sched.h> -#include <fcntl.h> -#include <unistd.h> -#include <ftw.h> - - -#include "cgroup_helpers.h" - -/* - * To avoid relying on the system setup, when setup_cgroup_env is called - * we create a new mount namespace, and cgroup namespace. The cgroup2 - * root is mounted at CGROUP_MOUNT_PATH - * - * Unfortunately, most people don't have cgroupv2 enabled at this point in time. - * It's easier to create our own mount namespace and manage it ourselves. - * - * We assume /mnt exists. - */ - -#define WALK_FD_LIMIT 16 -#define CGROUP_MOUNT_PATH "/mnt" -#define CGROUP_WORK_DIR "/cgroup-test-work-dir" -#define format_cgroup_path(buf, path) \ - snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \ - CGROUP_WORK_DIR, path) - -/** - * setup_cgroup_environment() - Setup the cgroup environment - * - * After calling this function, cleanup_cgroup_environment should be called - * once testing is complete. - * - * This function will print an error to stderr and return 1 if it is unable - * to setup the cgroup environment. If setup is successful, 0 is returned. - */ -int setup_cgroup_environment(void) -{ - char cgroup_workdir[PATH_MAX + 1]; - - format_cgroup_path(cgroup_workdir, ""); - - if (unshare(CLONE_NEWNS)) { - log_err("unshare"); - return 1; - } - - if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { - log_err("mount fakeroot"); - return 1; - } - - if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) { - log_err("mount cgroup2"); - return 1; - } - - /* Cleanup existing failed runs, now that the environment is setup */ - cleanup_cgroup_environment(); - - if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) { - log_err("mkdir cgroup work dir"); - return 1; - } - - return 0; -} - -static int nftwfunc(const char *filename, const struct stat *statptr, - int fileflags, struct FTW *pfwt) -{ - if ((fileflags & FTW_D) && rmdir(filename)) - log_err("Removing cgroup: %s", filename); - return 0; -} - - -static int join_cgroup_from_top(char *cgroup_path) -{ - char cgroup_procs_path[PATH_MAX + 1]; - pid_t pid = getpid(); - int fd, rc = 0; - - snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), - "%s/cgroup.procs", cgroup_path); - - fd = open(cgroup_procs_path, O_WRONLY); - if (fd < 0) { - log_err("Opening Cgroup Procs: %s", cgroup_procs_path); - return 1; - } - - if (dprintf(fd, "%d\n", pid) < 0) { - log_err("Joining Cgroup"); - rc = 1; - } - - close(fd); - return rc; -} - -/** - * join_cgroup() - Join a cgroup - * @path: The cgroup path, relative to the workdir, to join - * - * This function expects a cgroup to already be created, relative to the cgroup - * work dir, and it joins it. For example, passing "/my-cgroup" as the path - * would actually put the calling process into the cgroup - * "/cgroup-test-work-dir/my-cgroup" - * - * On success, it returns 0, otherwise on failure it returns 1. - */ -int join_cgroup(char *path) -{ - char cgroup_path[PATH_MAX + 1]; - - format_cgroup_path(cgroup_path, path); - return join_cgroup_from_top(cgroup_path); -} - -/** - * cleanup_cgroup_environment() - Cleanup Cgroup Testing Environment - * - * This is an idempotent function to delete all temporary cgroups that - * have been created during the test, including the cgroup testing work - * directory. - * - * At call time, it moves the calling process to the root cgroup, and then - * runs the deletion process. It is idempotent, and should not fail, unless - * a process is lingering. - * - * On failure, it will print an error to stderr, and try to continue. - */ -void cleanup_cgroup_environment(void) -{ - char cgroup_workdir[PATH_MAX + 1]; - - format_cgroup_path(cgroup_workdir, ""); - join_cgroup_from_top(CGROUP_MOUNT_PATH); - nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT); -} - -/** - * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD - * @path: The cgroup path, relative to the workdir, to join - * - * This function creates a cgroup under the top level workdir and returns the - * file descriptor. It is idempotent. - * - * On success, it returns the file descriptor. On failure it returns 0. - * If there is a failure, it prints the error to stderr. - */ -int create_and_get_cgroup(char *path) -{ - char cgroup_path[PATH_MAX + 1]; - int fd; - - format_cgroup_path(cgroup_path, path); - if (mkdir(cgroup_path, 0777) && errno != EEXIST) { - log_err("mkdiring cgroup"); - return 0; - } - - fd = open(cgroup_path, O_RDONLY); - if (fd < 0) { - log_err("Opening Cgroup"); - return 0; - } - - return fd; -} diff --git a/samples/bpf/cgroup_helpers.h b/samples/bpf/cgroup_helpers.h deleted file mode 100644 index 78c55207b6bd..000000000000 --- a/samples/bpf/cgroup_helpers.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __CGROUP_HELPERS_H -#define __CGROUP_HELPERS_H -#include <errno.h> -#include <string.h> - -#define clean_errno() (errno == 0 ? "None" : strerror(errno)) -#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ - __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) - - -int create_and_get_cgroup(char *path); -int join_cgroup(char *path); -int setup_cgroup_environment(void); -void cleanup_cgroup_environment(void); - -#endif diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index 9d751e209f31..f0df3dda4b1f 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -51,7 +51,7 @@ #include <sys/types.h> #include <unistd.h> #include <bpf/bpf.h> -#include "libbpf.h" +#include "bpf_insn.h" #define PORT 8888 @@ -67,8 +67,8 @@ static bool test_finish; static void maps_create(void) { - map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t), - sizeof(struct stats), 100, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), + sizeof(struct stats), 100, NULL); if (map_fd < 0) error(1, errno, "map create failed!\n"); } @@ -147,19 +147,23 @@ static void prog_load(void) */ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, packets)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, packets)), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, bytes)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, bytes)), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct __sk_buff, len)), BPF_EXIT_INSN(), }; - prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, - ARRAY_SIZE(prog), "GPL", 0, - log_buf, sizeof(log_buf)); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = log_buf, + .log_size = sizeof(log_buf), + ); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + prog, ARRAY_SIZE(prog), &opts); if (prog_fd < 0) error(1, errno, "failed to load prog\n%s\n", log_buf); } @@ -167,7 +171,7 @@ static void prog_load(void) static void prog_attach_iptables(char *file) { int ret; - char rules[100]; + char rules[256]; if (bpf_obj_pin(prog_fd, file)) error(1, errno, "bpf_obj_pin"); @@ -175,8 +179,13 @@ static void prog_attach_iptables(char *file) printf("file path too long: %s\n", file); exit(1); } - sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", - file); + ret = snprintf(rules, sizeof(rules), + "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", + file); + if (ret < 0 || ret >= sizeof(rules)) { + printf("error constructing iptables command\n"); + exit(1); + } ret = system(rules); if (ret < 0) { printf("iptables rule update failed: %d/n", WEXITSTATUS(ret)); @@ -246,7 +255,7 @@ static void udp_client(void) recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0, (struct sockaddr *)&si_me, &slen); if (recv_len < 0) - error(1, errno, "revieve\n"); + error(1, errno, "receive\n"); res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr), sizeof(si_me.sin_addr)); if (res != 0) @@ -313,7 +322,7 @@ int main(int argc, char *argv[]) print_table(); printf("\n"); sleep(1); - }; + } } else if (cfg_test_cookie) { udp_client(); } diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c new file mode 100644 index 000000000000..7ec7143e2757 --- /dev/null +++ b/samples/bpf/cpustat_kern.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* + * The CPU number, cstate number and pstate number are based + * on 96boards Hikey with octa CA53 CPUs. + * + * Every CPU have three idle states for cstate: + * WFI, CPU_OFF, CLUSTER_OFF + * + * Every CPU have 5 operating points: + * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz + * + * This code is based on these assumption and other platforms + * need to adjust these definitions. + */ +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 + +static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; + +/* + * my_map structure is used to record cstate and pstate index and + * timestamp (Idx, Ts), when new event incoming we need to update + * combination for new state index and timestamp (Idx`, Ts`). + * + * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time + * interval for the previous state: Duration(Idx) = Ts` - Ts. + * + * Every CPU has one below array for recording state index and + * timestamp, and record for cstate and pstate saperately: + * + * +--------------------------+ + * | cstate timestamp | + * +--------------------------+ + * | cstate index | + * +--------------------------+ + * | pstate timestamp | + * +--------------------------+ + * | pstate index | + * +--------------------------+ + */ +#define MAP_OFF_CSTATE_TIME 0 +#define MAP_OFF_CSTATE_IDX 1 +#define MAP_OFF_PSTATE_TIME 2 +#define MAP_OFF_PSTATE_IDX 3 +#define MAP_OFF_NUM 4 + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAP_OFF_NUM); +} my_map SEC(".maps"); + +/* cstate_duration records duration time for every idle state per CPU */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES); +} cstate_duration SEC(".maps"); + +/* pstate_duration records duration time for every operating point per CPU */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES); +} pstate_duration SEC(".maps"); + +/* + * The trace events for cpu_idle and cpu_frequency are taken from: + * /sys/kernel/tracing/events/power/cpu_idle/format + * /sys/kernel/tracing/events/power/cpu_frequency/format + * + * These two events have same format, so define one common structure. + */ +struct cpu_args { + u64 pad; + u32 state; + u32 cpu_id; +}; + +/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */ +static u32 find_cpu_pstate_idx(u32 frequency) +{ + u32 i; + + for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) { + if (frequency == cpu_opps[i]) + return i; + } + + return i; +} + +SEC("tracepoint/power/cpu_idle") +int bpf_prog1(struct cpu_args *ctx) +{ + u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + if (ctx->cpu_id > MAX_CPU) + return 0; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME; + cts = bpf_map_lookup_elem(&my_map, &key); + if (!cts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + prev_state = *cstate; + *cstate = ctx->state; + + if (!*cts) { + *cts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *cts; + *cts = cur_ts; + + /* + * When state doesn't equal to (u32)-1, the cpu will enter + * one idle state; for this case we need to record interval + * for the pstate. + * + * OPP2 + * +---------------------+ + * OPP1 | | + * ---------+ | + * | Idle state + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + if (ctx->state != (u32)-1) { + + /* record pstate after have first cpu_frequency event */ + if (!*pts) + return 0; + + delta = cur_ts - *pts; + + pstate_idx = find_cpu_pstate_idx(*pstate); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + /* + * When state equal to (u32)-1, the cpu just exits from one + * specific idle state; for this case we need to record + * interval for the pstate. + * + * OPP2 + * -----------+ + * | OPP1 + * | +----------- + * | Idle state | + * +---------------------+ + * + * |<- cstate duration ->| + * ^ ^ + * cts cur_ts + */ + } else { + + key = cpu * MAX_CSTATE_ENTRIES + prev_state; + val = bpf_map_lookup_elem(&cstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + } + + /* Update timestamp for pstate as new start time */ + if (*pts) + *pts = cur_ts; + + return 0; +} + +SEC("tracepoint/power/cpu_frequency") +int bpf_prog2(struct cpu_args *ctx) +{ + u64 *pts, *cstate, *pstate, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + *pstate = ctx->state; + + if (!*pts) { + *pts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *pts; + *pts = cur_ts; + + /* When CPU is in idle, bail out to skip pstate statistics */ + if (*cstate != (u32)(-1)) + return 0; + + /* + * The cpu changes to another different OPP (in below diagram + * change frequency from OPP3 to OPP1), need recording interval + * for previous frequency OPP3 and update timestamp as start + * time for new frequency OPP1. + * + * OPP3 + * +---------------------+ + * OPP2 | | + * ---------+ | + * | OPP1 + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + pstate_idx = find_cpu_pstate_idx(*pstate); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c new file mode 100644 index 000000000000..356f756cba0d --- /dev/null +++ b/samples/bpf/cpustat_user.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sched.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <locale.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/wait.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +static int cstate_map_fd, pstate_map_fd; + +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 +#define MAX_STARS 40 + +#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq" +#define CPUFREQ_LOWEST_FREQ "208000" +#define CPUFREQ_HIGHEST_FREQ "12000000" + +struct cpu_stat_data { + unsigned long cstate[MAX_CSTATE_ENTRIES]; + unsigned long pstate[MAX_PSTATE_ENTRIES]; +}; + +static struct cpu_stat_data stat_data[MAX_CPU]; + +static void cpu_stat_print(void) +{ + int i, j; + char state_str[sizeof("cstate-9")]; + struct cpu_stat_data *data; + + /* Clear screen */ + printf("\033[2J"); + + /* Header */ + printf("\nCPU states statistics:\n"); + printf("%-10s ", "state(ms)"); + + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + sprintf(state_str, "cstate-%d", i); + printf("%-11s ", state_str); + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + sprintf(state_str, "pstate-%d", i); + printf("%-11s ", state_str); + } + + printf("\n"); + + for (j = 0; j < MAX_CPU; j++) { + data = &stat_data[j]; + + printf("CPU-%-6d ", j); + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) + printf("%-11lu ", data->cstate[i] / 1000000); + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) + printf("%-11lu ", data->pstate[i] / 1000000); + + printf("\n"); + } +} + +static void cpu_stat_update(int cstate_fd, int pstate_fd) +{ + unsigned long key, value; + int c, i; + + for (c = 0; c < MAX_CPU; c++) { + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + key = c * MAX_CSTATE_ENTRIES + i; + bpf_map_lookup_elem(cstate_fd, &key, &value); + stat_data[c].cstate[i] = value; + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + key = c * MAX_PSTATE_ENTRIES + i; + bpf_map_lookup_elem(pstate_fd, &key, &value); + stat_data[c].pstate[i] = value; + } + } +} + +/* + * This function is copied from 'idlestat' tool function + * idlestat_wake_all() in idlestate.c. + * + * It sets the self running task affinity to cpus one by one so can wake up + * the specific CPU to handle scheduling; this results in all cpus can be + * waken up once and produce ftrace event 'trace_cpu_idle'. + */ +static int cpu_stat_inject_cpu_idle_event(void) +{ + int rcpu, i, ret; + cpu_set_t cpumask; + cpu_set_t original_cpumask; + + ret = sysconf(_SC_NPROCESSORS_CONF); + if (ret < 0) + return -1; + + rcpu = sched_getcpu(); + if (rcpu < 0) + return -1; + + /* Keep track of the CPUs we will run on */ + sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask); + + for (i = 0; i < ret; i++) { + + /* Pointless to wake up ourself */ + if (i == rcpu) + continue; + + /* Pointless to wake CPUs we will not run on */ + if (!CPU_ISSET(i, &original_cpumask)) + continue; + + CPU_ZERO(&cpumask); + CPU_SET(i, &cpumask); + + sched_setaffinity(0, sizeof(cpumask), &cpumask); + } + + /* Enable all the CPUs of the original mask */ + sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask); + return 0; +} + +/* + * It's possible to have no any frequency change for long time and cannot + * get ftrace event 'trace_cpu_frequency' for long period, this introduces + * big deviation for pstate statistics. + * + * To solve this issue, below code forces to set 'scaling_max_freq' to 208MHz + * for triggering ftrace event 'trace_cpu_frequency' and then recovery back to + * the maximum frequency value 1.2GHz. + */ +static int cpu_stat_inject_cpu_frequency_event(void) +{ + int len, fd; + + fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY); + if (fd < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + return fd; + } + + len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ)); + if (len < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + goto err; + } + + len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ)); + if (len < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + goto err; + } + +err: + close(fd); + return len; +} + +static void int_exit(int sig) +{ + cpu_stat_inject_cpu_idle_event(); + cpu_stat_inject_cpu_frequency_event(); + cpu_stat_update(cstate_map_fd, pstate_map_fd); + cpu_stat_print(); + exit(0); +} + +int main(int argc, char **argv) +{ + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + cstate_map_fd = bpf_object__find_map_fd_by_name(obj, "cstate_duration"); + pstate_map_fd = bpf_object__find_map_fd_by_name(obj, "pstate_duration"); + if (cstate_map_fd < 0 || pstate_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + ret = cpu_stat_inject_cpu_idle_event(); + if (ret < 0) + return 1; + + ret = cpu_stat_inject_cpu_frequency_event(); + if (ret < 0) + return 1; + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + while (1) { + cpu_stat_update(cstate_map_fd, pstate_map_fd); + cpu_stat_print(); + sleep(5); + } + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return 0; +} diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh new file mode 100755 index 000000000000..7f4f722787d5 --- /dev/null +++ b/samples/bpf/do_hbm_test.sh @@ -0,0 +1,438 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2019 Facebook +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of version 2 of the GNU General Public +# License as published by the Free Software Foundation. + +Usage() { + echo "Script for testing HBM (Host Bandwidth Manager) framework." + echo "It creates a cgroup to use for testing and load a BPF program to limit" + echo "egress or ingress bandwidth. It then uses iperf3 or netperf to create" + echo "loads. The output is the goodput in Mbps (unless -D was used)." + echo "" + echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]" + echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]" + echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]" + echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]" + echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]" + echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]" + echo " Where:" + echo " out egress (default)" + echo " -b or --bpf BPF program filename to load and attach." + echo " Default is hbm_out_kern.o for egress," + echo " -c or -cc TCP congestion control (cubic or dctcp)" + echo " --debug print BPF trace buffer" + echo " -d or --delay add a delay in ms using netem" + echo " -D In addition to the goodput in Mbps, it also outputs" + echo " other detailed information. This information is" + echo " test dependent (i.e. iperf3 or netperf)." + echo " -E enable ECN (not required for dctcp)" + echo " --edt use fq's Earliest Departure Time (requires fq)" + echo " -f or --flows number of concurrent flows (default=1)" + echo " -i or --id cgroup id (an integer, default is 1)" + echo " -N use netperf instead of iperf3" + echo " --no_cn Do not return CN notifications" + echo " -l do not limit flows using loopback" + echo " -h Help" + echo " -p or --port iperf3 port (default is 5201)" + echo " -P use an iperf3 instance for each flow" + echo " -q use the specified qdisc" + echo " -r or --rate rate in Mbps (default 1s 1Gbps)" + echo " -R Use TCP_RR for netperf. 1st flow has req" + echo " size of 10KB, rest of 1MB. Reply in all" + echo " cases is 1 byte." + echo " More detailed output for each flow can be found" + echo " in the files netperf.<cg>.<flow>, where <cg> is the" + echo " cgroup id as specified with the -i flag, and <flow>" + echo " is the flow id starting at 1 and increasing by 1 for" + echo " flow (as specified by -f)." + echo " -s or --server hostname of netperf server. Used to create netperf" + echo " test traffic between to hosts (default is within host)" + echo " netserver must be running on the host." + echo " -S or --stats whether to update hbm stats (default is yes)." + echo " -t or --time duration of iperf3 in seconds (default=5)" + echo " -w Work conserving flag. cgroup can increase its" + echo " bandwidth beyond the rate limit specified" + echo " while there is available bandwidth. Current" + echo " implementation assumes there is only one NIC" + echo " (eth0), but can be extended to support multiple" + echo " NICs." + echo " cubic or dctcp specify which TCP CC to use" + echo " " + exit +} + +#set -x + +debug_flag=0 +args="$@" +name="$0" +netem=0 +cc=x +dir="-o" +dir_name="out" +dur=5 +flows=1 +id=1 +prog="" +port=5201 +rate=1000 +multi_iperf=0 +flow_cnt=1 +use_netperf=0 +rr=0 +ecn=0 +details=0 +server="" +qdisc="" +flags="" +do_stats=0 + +BPFFS=/sys/fs/bpf +function config_bpffs () { + if mount | grep $BPFFS > /dev/null; then + echo "bpffs already mounted" + else + echo "bpffs not mounted. Mounting..." + mount -t bpf none $BPFFS + fi +} + +function start_hbm () { + rm -f hbm.out + echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out + echo " " >> hbm.out + ./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1 & + echo $! +} + +processArgs () { + for i in $args ; do + case $i in + # Support for upcoming ingress rate limiting + #in) # support for upcoming ingress rate limiting + # dir="-i" + # dir_name="in" + # ;; + out) + dir="-o" + dir_name="out" + ;; + -b=*|--bpf=*) + prog="${i#*=}" + ;; + -c=*|--cc=*) + cc="${i#*=}" + ;; + --no_cn) + flags="$flags --no_cn" + ;; + --debug) + flags="$flags -d" + debug_flag=1 + ;; + -d=*|--delay=*) + netem="${i#*=}" + ;; + -D) + details=1 + ;; + -E) + ecn=1 + ;; + --edt) + flags="$flags --edt" + qdisc="fq" + ;; + -f=*|--flows=*) + flows="${i#*=}" + ;; + -i=*|--id=*) + id="${i#*=}" + ;; + -l) + flags="$flags -l" + ;; + -N) + use_netperf=1 + ;; + -p=*|--port=*) + port="${i#*=}" + ;; + -P) + multi_iperf=1 + ;; + -q=*) + qdisc="${i#*=}" + ;; + -r=*|--rate=*) + rate="${i#*=}" + ;; + -R) + rr=1 + ;; + -s=*|--server=*) + server="${i#*=}" + ;; + -S|--stats) + flags="$flags -s" + do_stats=1 + ;; + -t=*|--time=*) + dur="${i#*=}" + ;; + -w) + flags="$flags -w" + ;; + cubic) + cc=cubic + ;; + dctcp) + cc=dctcp + ;; + *) + echo "Unknown arg:$i" + Usage + ;; + esac + done +} + +processArgs +config_bpffs + +if [ $debug_flag -eq 1 ] ; then + rm -f hbm_out.log +fi + +hbm_pid=$(start_hbm) +usleep 100000 + +host=`hostname` +cg_base_dir=/sys/fs/cgroup/unified +cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" + +echo $$ >> $cg_dir/cgroup.procs + +ulimit -l unlimited + +rm -f ss.out +rm -f hbm.[0-9]*.$dir_name +if [ $ecn -ne 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_ecn=1 +fi + +if [ $use_netperf -eq 0 ] ; then + cur_cc=`sysctl -n net.ipv4.tcp_congestion_control` + if [ "$cc" != "x" ] ; then + sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc + fi +fi + +if [ "$netem" -ne "0" ] ; then + if [ "$qdisc" != "" ] ; then + echo "WARNING: Ignoring -q options because -d option used" + fi + tc qdisc del dev lo root > /dev/null 2>&1 + tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1 +elif [ "$qdisc" != "" ] ; then + tc qdisc del dev eth0 root > /dev/null 2>&1 + tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1 +fi + +n=0 +m=$[$dur * 5] +hn="::1" +if [ $use_netperf -ne 0 ] ; then + if [ "$server" != "" ] ; then + hn=$server + fi +fi + +( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) & + +if [ $use_netperf -ne 0 ] ; then + begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \ + awk '{ print $1 }'` + if [ "$begNetserverPid" == "" ] ; then + if [ "$server" == "" ] ; then + ( ./netserver > /dev/null 2>&1) & + usleep 100000 + fi + fi + flow_cnt=1 + if [ "$server" == "" ] ; then + np_server=$host + else + np_server=$server + fi + if [ "$cc" == "x" ] ; then + np_cc="" + else + np_cc="-K $cc,$cc" + fi + replySize=1 + while [ $flow_cnt -le $flows ] ; do + if [ $rr -ne 0 ] ; then + reqSize=1M + if [ $flow_cnt -eq 1 ] ; then + reqSize=10K + fi + if [ "$dir" == "-i" ] ; then + replySize=$reqSize + reqSize=1 + fi + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + else + if [ "$dir" == "-i" ] ; then + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + else + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + fi + fi + flow_cnt=$[flow_cnt+1] + done + +# sleep for duration of test (plus some buffer) + n=$[dur+2] + sleep $n + +# force graceful termination of netperf + pids=`pgrep netperf` + for p in $pids ; do + kill -SIGALRM $p + done + + flow_cnt=1 + rate=0 + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + fi + while [ $flow_cnt -le $flows ] ; do + if [ "$dir" == "-i" ] ; then + r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` + else + r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` + fi + echo "rate for flow $flow_cnt: $r" + rate=$[rate+r] + if [ $details -ne 0 ] ; then + echo "-----" + echo "Details for cgroup $id, flow $flow_cnt" + cat netperf.$id.$flow_cnt + fi + flow_cnt=$[flow_cnt+1] + done + if [ $details -ne 0 ] ; then + echo "" + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +elif [ $multi_iperf -eq 0 ] ; then + (iperf3 -s -p $port -1 > /dev/null 2>&1) & + usleep 100000 + iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id + rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"` + rate=`echo $rates | grep -o "[0-9]*$"` + + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +else + flow_cnt=1 + while [ $flow_cnt -le $flows ] ; do + (iperf3 -s -p $port -1 > /dev/null 2>&1) & + ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) & + port=$[port+1] + flow_cnt=$[flow_cnt+1] + done + n=$[dur+1] + sleep $n + flow_cnt=1 + rate=0 + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + fi + + while [ $flow_cnt -le $flows ] ; do + r=`cat iperf3.$id.$flow_cnt` +# echo "rate for flow $flow_cnt: $r" + if [ $details -ne 0 ] ; then + echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r" + fi + rate=$[rate+r] + flow_cnt=$[flow_cnt+1] + done + if [ $details -ne 0 ] ; then + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +fi + +if [ $use_netperf -eq 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc +fi +if [ $ecn -ne 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_ecn=0 +fi +if [ "$netem" -ne "0" ] ; then + tc qdisc del dev lo root > /dev/null 2>&1 +fi +if [ "$qdisc" != "" ] ; then + tc qdisc del dev eth0 root > /dev/null 2>&1 +fi +sleep 2 + +hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'` +if [ "$hbmPid" == "$hbm_pid" ] ; then + kill $hbm_pid +fi + +sleep 1 + +# Detach any pinned BPF programs that may have lingered +rm -rf $BPFFS/hbm* + +if [ $use_netperf -ne 0 ] ; then + if [ "$server" == "" ] ; then + if [ "$begNetserverPid" == "" ] ; then + netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'` + if [ "$netserverPid" != "" ] ; then + kill $netserverPid + fi + fi + fi +fi +exit diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index e29bd52ff9e8..88a26f3ce201 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -12,9 +12,12 @@ #include <sys/types.h> #include <sys/socket.h> -#include "bpf_load.h" -#include "libbpf.h" +#include <bpf/bpf.h> + +#include <bpf/libbpf.h> +#include "bpf_insn.h" #include "sock_example.h" +#include "bpf_util.h" #define BPF_F_PIN (1 << 0) #define BPF_F_GET (1 << 1) @@ -28,6 +31,8 @@ #define BPF_M_MAP 1 #define BPF_M_PROG 2 +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + static void usage(void) { printf("Usage: fds_example [...]\n"); @@ -42,27 +47,30 @@ static void usage(void) printf(" -h Display this help.\n"); } -static int bpf_map_create(void) -{ - return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), - sizeof(uint32_t), 1024, 0); -} - static int bpf_prog_create(const char *object) { static struct bpf_insn insns[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }; - size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn); + size_t insns_cnt = ARRAY_SIZE(insns); + struct bpf_object *obj; + int err; if (object) { - assert(!load_bpf_file((char *)object)); - return prog_fd[0]; + obj = bpf_object__open_file(object, NULL); + assert(!libbpf_get_error(obj)); + err = bpf_object__load(obj); + assert(!err); + return bpf_program__fd(bpf_object__next_program(obj, NULL)); } else { - return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, - insns, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); + + return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + insns, insns_cnt, &opts); } } @@ -72,7 +80,8 @@ static int bpf_do_map(const char *file, uint32_t flags, uint32_t key, int fd, ret; if (flags & BPF_F_PIN) { - fd = bpf_map_create(); + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(uint32_t), + sizeof(uint32_t), 1024, NULL); printf("bpf: map fd:%d (%s)\n", fd, strerror(errno)); assert(fd > 0); diff --git a/samples/bpf/gnu/stubs.h b/samples/bpf/gnu/stubs.h new file mode 100644 index 000000000000..1c638d9dce1a --- /dev/null +++ b/samples/bpf/gnu/stubs.h @@ -0,0 +1 @@ +/* dummy .h to trick /usr/include/features.h to work with 'clang --target=bpf' */ diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h new file mode 100644 index 000000000000..38255812e376 --- /dev/null +++ b/samples/bpf/hash_func01.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1 + * + * Based on Paul Hsieh's (LGPG 2.1) hash function + * From: http://www.azillionmonkeys.com/qed/hash.html + */ + +#define get16bits(d) (*((const __u16 *) (d))) + +static __always_inline +__u32 SuperFastHash (const char *data, int len, __u32 initval) { + __u32 hash = initval; + __u32 tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ +#pragma clang loop unroll(full) + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (__u16); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= ((signed char)data[sizeof (__u16)]) << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += (signed char)*data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c new file mode 100644 index 000000000000..fc88d4dbdf48 --- /dev/null +++ b/samples/bpf/hbm.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Example program for Host Bandwidth Management + * + * This program loads a cgroup skb BPF program to enforce cgroup output + * (egress) or input (ingress) bandwidth limits. + * + * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] + * Where: + * -d Print BPF trace debug buffer + * -l Also limit flows doing loopback + * -n <#> To create cgroup \"/hbm#\" and attach prog + * Default is /hbm1 + * --no_cn Do not return cn notifications + * -r <rate> Rate limit in Mbps + * -s Get HBM stats (marked, dropped, etc.) + * -t <time> Exit after specified seconds (default is 0) + * -w Work conserving flag. cgroup can increase its bandwidth + * beyond the rate limit specified while there is available + * bandwidth. Current implementation assumes there is only + * NIC (eth0), but can be extended to support multiple NICs. + * Currently only supported for egress. + * -h Print this info + * prog BPF program file name. Name defaults to hbm_out_kern.o + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <sys/time.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/unistd.h> +#include <linux/compiler.h> + +#include <linux/bpf.h> +#include <bpf/bpf.h> +#include <getopt.h> + +#include "cgroup_helpers.h" +#include "hbm.h" +#include "bpf_util.h" +#include <bpf/libbpf.h> + +bool outFlag = true; +int minRate = 1000; /* cgroup rate limit in Mbps */ +int rate = 1000; /* can grow if rate conserving is enabled */ +int dur = 1; +bool stats_flag; +bool loopback_flag; +bool debugFlag; +bool work_conserving_flag; +bool no_cn_flag; +bool edt_flag; + +static void Usage(void); +static void read_trace_pipe2(void); +static void do_error(char *msg, bool errno_flag); + +#define TRACEFS "/sys/kernel/tracing/" + +static struct bpf_program *bpf_prog; +static struct bpf_object *obj; +static int queue_stats_fd; + +static void read_trace_pipe2(void) +{ + int trace_fd; + FILE *outf; + char *outFname = "hbm_out.log"; + + trace_fd = open(TRACEFS "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) { + printf("Error opening trace_pipe\n"); + return; + } + +// Future support of ingress +// if (!outFlag) +// outFname = "hbm_in.log"; + outf = fopen(outFname, "w"); + + if (outf == NULL) + printf("Error creating %s\n", outFname); + + while (1) { + static char buf[4097]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + if (outf != NULL) { + fprintf(outf, "%s\n", buf); + fflush(outf); + } + } + } +} + +static void do_error(char *msg, bool errno_flag) +{ + if (errno_flag) + printf("ERROR: %s, errno: %d\n", msg, errno); + else + printf("ERROR: %s\n", msg); + exit(1); +} + +static int prog_load(char *prog) +{ + struct bpf_program *pos; + const char *sec_name; + + obj = bpf_object__open_file(prog, NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); + return 1; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto err; + } + + bpf_object__for_each_program(pos, obj) { + sec_name = bpf_program__section_name(pos); + if (sec_name && !strcmp(sec_name, "cgroup_skb/egress")) { + bpf_prog = pos; + break; + } + } + if (!bpf_prog) { + printf("ERROR: finding a prog in obj file failed\n"); + goto err; + } + + queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats"); + if (queue_stats_fd < 0) { + printf("ERROR: finding a map in obj file failed\n"); + goto err; + } + + return 0; + +err: + bpf_object__close(obj); + return 1; +} + +static int run_bpf_prog(char *prog, int cg_id) +{ + struct hbm_queue_stats qstats = {0}; + char cg_dir[100], cg_pin_path[100]; + struct bpf_link *link = NULL; + int key = 0; + int cg1 = 0; + int rc = 0; + + sprintf(cg_dir, "/hbm%d", cg_id); + rc = prog_load(prog); + if (rc != 0) + return rc; + + if (setup_cgroup_environment()) { + printf("ERROR: setting cgroup environment\n"); + goto err; + } + cg1 = create_and_get_cgroup(cg_dir); + if (!cg1) { + printf("ERROR: create_and_get_cgroup\n"); + goto err; + } + if (join_cgroup(cg_dir)) { + printf("ERROR: join_cgroup\n"); + goto err; + } + + qstats.rate = rate; + qstats.stats = stats_flag ? 1 : 0; + qstats.loopback = loopback_flag ? 1 : 0; + qstats.no_cn = no_cn_flag ? 1 : 0; + if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { + printf("ERROR: Could not update map element\n"); + goto err; + } + + if (!outFlag) + bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); + + link = bpf_program__attach_cgroup(bpf_prog, cg1); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n"); + goto err; + } + + sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); + rc = bpf_link__pin(link, cg_pin_path); + if (rc < 0) { + printf("ERROR: bpf_link__pin failed: %d\n", rc); + goto err; + } + + if (work_conserving_flag) { + struct timeval t0, t_last, t_new; + FILE *fin; + unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; + signed long long last_cg_tx_bytes, new_cg_tx_bytes; + signed long long delta_time, delta_bytes, delta_rate; + int delta_ms; +#define DELTA_RATE_CHECK 10000 /* in us */ +#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ + + bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); + if (gettimeofday(&t0, NULL) < 0) + do_error("gettimeofday failed", true); + t_last = t0; + fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); + if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) + do_error("fscanf fails", false); + fclose(fin); + last_cg_tx_bytes = qstats.bytes_total; + while (true) { + usleep(DELTA_RATE_CHECK); + if (gettimeofday(&t_new, NULL) < 0) + do_error("gettimeofday failed", true); + delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + + (t_new.tv_usec - t0.tv_usec)/1000; + if (delta_ms > dur * 1000) + break; + delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + + (t_new.tv_usec - t_last.tv_usec); + if (delta_time == 0) + continue; + t_last = t_new; + fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", + "r"); + if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) + do_error("fscanf fails", false); + fclose(fin); + printf(" new_eth_tx_bytes:%llu\n", + new_eth_tx_bytes); + bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); + new_cg_tx_bytes = qstats.bytes_total; + delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; + last_eth_tx_bytes = new_eth_tx_bytes; + delta_rate = (delta_bytes * 8000000) / delta_time; + printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", + delta_ms, delta_rate/1000000000.0, + rate/1000.0); + if (delta_rate < RATE_THRESHOLD) { + /* can increase cgroup rate limit, but first + * check if we are using the current limit. + * Currently increasing by 6.25%, unknown + * if that is the optimal rate. + */ + int rate_diff100; + + delta_bytes = new_cg_tx_bytes - + last_cg_tx_bytes; + last_cg_tx_bytes = new_cg_tx_bytes; + delta_rate = (delta_bytes * 8000000) / + delta_time; + printf(" rate:%.3fGbps", + delta_rate/1000000000.0); + rate_diff100 = (((long long)rate)*1000000 - + delta_rate) * 100 / + (((long long) rate) * 1000000); + printf(" rdiff:%d", rate_diff100); + if (rate_diff100 <= 3) { + rate += (rate >> 4); + if (rate > RATE_THRESHOLD / 1000000) + rate = RATE_THRESHOLD / 1000000; + qstats.rate = rate; + printf(" INC\n"); + } else { + printf("\n"); + } + } else { + /* Need to decrease cgroup rate limit. + * Currently decreasing by 12.5%, unknown + * if that is optimal + */ + printf(" DEC\n"); + rate -= (rate >> 3); + if (rate < minRate) + rate = minRate; + qstats.rate = rate; + } + if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) + do_error("update map element fails", false); + } + } else { + sleep(dur); + } + // Get stats! + if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { + char fname[100]; + FILE *fout; + + if (!outFlag) + sprintf(fname, "hbm.%d.in", cg_id); + else + sprintf(fname, "hbm.%d.out", cg_id); + fout = fopen(fname, "w"); + fprintf(fout, "id:%d\n", cg_id); + fprintf(fout, "ERROR: Could not lookup queue_stats\n"); + fclose(fout); + } else if (stats_flag && qstats.lastPacketTime > + qstats.firstPacketTime) { + long long delta_us = (qstats.lastPacketTime - + qstats.firstPacketTime)/1000; + unsigned int rate_mbps = ((qstats.bytes_total - + qstats.bytes_dropped) * 8 / + delta_us); + double percent_pkts, percent_bytes; + char fname[100]; + FILE *fout; + int k; + static const char *returnValNames[] = { + "DROP_PKT", + "ALLOW_PKT", + "DROP_PKT_CWR", + "ALLOW_PKT_CWR" + }; +#define RET_VAL_COUNT 4 + +// Future support of ingress +// if (!outFlag) +// sprintf(fname, "hbm.%d.in", cg_id); +// else + sprintf(fname, "hbm.%d.out", cg_id); + fout = fopen(fname, "w"); + fprintf(fout, "id:%d\n", cg_id); + fprintf(fout, "rate_mbps:%d\n", rate_mbps); + fprintf(fout, "duration:%.1f secs\n", + (qstats.lastPacketTime - qstats.firstPacketTime) / + 1000000000.0); + fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); + fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / + 1000000)); + fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); + fprintf(fout, "bytes_dropped_MB:%d\n", + (int)(qstats.bytes_dropped / + 1000000)); + // Marked Pkts and Bytes + percent_pkts = (qstats.pkts_marked * 100.0) / + (qstats.pkts_total + 1); + percent_bytes = (qstats.bytes_marked * 100.0) / + (qstats.bytes_total + 1); + fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); + fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); + + // Dropped Pkts and Bytes + percent_pkts = (qstats.pkts_dropped * 100.0) / + (qstats.pkts_total + 1); + percent_bytes = (qstats.bytes_dropped * 100.0) / + (qstats.bytes_total + 1); + fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); + fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); + + // ECN CE markings + percent_pkts = (qstats.pkts_ecn_ce * 100.0) / + (qstats.pkts_total + 1); + fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, + (int)qstats.pkts_ecn_ce); + + // Average cwnd + fprintf(fout, "avg cwnd:%d\n", + (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); + // Average rtt + fprintf(fout, "avg rtt:%d\n", + (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); + // Average credit + if (edt_flag) + fprintf(fout, "avg credit_ms:%.03f\n", + (qstats.sum_credit / + (qstats.pkts_total + 1.0)) / 1000000.0); + else + fprintf(fout, "avg credit:%d\n", + (int)(qstats.sum_credit / + (1500 * ((int)qstats.pkts_total ) + 1))); + + // Return values stats + for (k = 0; k < RET_VAL_COUNT; k++) { + percent_pkts = (qstats.returnValCount[k] * 100.0) / + (qstats.pkts_total + 1); + fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], + percent_pkts, (int)qstats.returnValCount[k]); + } + fclose(fout); + } + + if (debugFlag) + read_trace_pipe2(); + goto cleanup; + +err: + rc = 1; + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + + if (cg1 != -1) + close(cg1); + + if (rc != 0) + cleanup_cgroup_environment(); + return rc; +} + +static void Usage(void) +{ + printf("This program loads a cgroup skb BPF program to enforce\n" + "cgroup output (egress) bandwidth limits.\n\n" + "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" + " [-s] [-t <secs>] [-w] [-h] [prog]\n" + " Where:\n" + " -o indicates egress direction (default)\n" + " -d print BPF trace debug buffer\n" + " --edt use fq's Earliest Departure Time\n" + " -l also limit flows using loopback\n" + " -n <#> to create cgroup \"/hbm#\" and attach prog\n" + " Default is /hbm1\n" + " --no_cn disable CN notifications\n" + " -r <rate> Rate in Mbps\n" + " -s Update HBM stats\n" + " -t <time> Exit after specified seconds (default is 0)\n" + " -w Work conserving flag. cgroup can increase\n" + " bandwidth beyond the rate limit specified\n" + " while there is available bandwidth. Current\n" + " implementation assumes there is only eth0\n" + " but can be extended to support multiple NICs\n" + " -h print this info\n" + " prog BPF program file name. Name defaults to\n" + " hbm_out_kern.o\n"); +} + +int main(int argc, char **argv) +{ + char *prog = "hbm_out_kern.o"; + int k; + int cg_id = 1; + char *optstring = "iodln:r:st:wh"; + struct option loptions[] = { + {"no_cn", 0, NULL, 1}, + {"edt", 0, NULL, 2}, + {NULL, 0, NULL, 0} + }; + + while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { + switch (k) { + case 1: + no_cn_flag = true; + break; + case 2: + prog = "hbm_edt_kern.o"; + edt_flag = true; + break; + case'o': + break; + case 'd': + debugFlag = true; + break; + case 'l': + loopback_flag = true; + break; + case 'n': + cg_id = atoi(optarg); + break; + case 'r': + minRate = atoi(optarg) * 1.024; + rate = minRate; + break; + case 's': + stats_flag = true; + break; + case 't': + dur = atoi(optarg); + break; + case 'w': + work_conserving_flag = true; + break; + case '?': + if (optopt == 'n' || optopt == 'r' || optopt == 't') + fprintf(stderr, + "Option -%c requires an argument.\n\n", + optopt); + case 'h': + default: + Usage(); + return 0; + } + } + + if (optind < argc) + prog = argv[optind]; + printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); + + /* Use libbpf 1.0 API mode */ + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + return run_bpf_prog(prog, cg_id); +} diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000000..f0963ed6a562 --- /dev/null +++ b/samples/bpf/hbm.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for Host Bandwidth Management (HBM) programs + */ +struct hbm_vqueue { + struct bpf_spin_lock lock; + /* 4 byte hole */ + unsigned long long lasttime; /* In ns */ + int credit; /* In bytes */ + unsigned int rate; /* In bytes per NS << 20 */ +}; + +struct hbm_queue_stats { + unsigned long rate; /* in Mbps*/ + unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ + loopback:1, /* also limit flows using loopback */ + no_cn:1; /* do not use cn flags */ + unsigned long long pkts_marked; + unsigned long long bytes_marked; + unsigned long long pkts_dropped; + unsigned long long bytes_dropped; + unsigned long long pkts_total; + unsigned long long bytes_total; + unsigned long long firstPacketTime; + unsigned long long lastPacketTime; + unsigned long long pkts_ecn_ce; + unsigned long long returnValCount[4]; + unsigned long long sum_cwnd; + unsigned long long sum_rtt; + unsigned long long sum_cwnd_cnt; + long long sum_credit; +}; diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c new file mode 100644 index 000000000000..6294f1d716c0 --- /dev/null +++ b/samples/bpf/hbm_edt_kern.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * a user program, such as hbm, can access. + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + long long delta = 0, delta_send; + unsigned long long curtime, sendtime; + struct hbm_queue_stats *qsp = NULL; + unsigned int queue_index = 0; + bool congestion_flag = false; + bool ecn_ce_flag = false; + struct hbm_pkt_info pkti = {}; + struct hbm_vqueue *qdp; + bool drop_flag = false; + bool cwr_flag = false; + int len = skb->len; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + + // Check if we should ignore loopback traffic + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + if (qdp->lasttime == 0) + hbm_init_edt_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + delta = qdp->lasttime - curtime; + // bound bursts to 100us + if (delta < -BURST_SIZE_NS) { + // negative delta is a credit that allows bursts + qdp->lasttime = curtime - BURST_SIZE_NS; + delta = -BURST_SIZE_NS; + } + sendtime = qdp->lasttime; + delta_send = BYTES_TO_NS(len, qdp->rate); + __sync_add_and_fetch(&(qdp->lasttime), delta_send); + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Set EDT of packet + skb->tstamp = sendtime; + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) + qdp->rate = qsp->rate * 128; + + // Set flags (drop, congestion, cwr) + // last packet will be sent in the future, bound latency + if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS && + len > LARGE_PKT_THRESH)) { + drop_flag = true; + if (pkti.is_tcp && pkti.ecn == 0) + cwr_flag = true; + } else if (delta > MARK_THRESH_NS) { + if (pkti.is_tcp) + congestion_flag = true; + else + drop_flag = true; + } + + if (congestion_flag) { + if (bpf_skb_ecn_set_ce(skb)) { + ecn_ce_flag = true; + } else { + if (pkti.is_tcp) { + unsigned int rand = bpf_get_prandom_u32(); + + if (delta >= MARK_THRESH_NS + + (rand % MARK_REGION_SIZE_NS)) { + // Do congestion control + cwr_flag = true; + } + } else if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + congestion_flag = false; + } + } + } + + if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) { + drop_flag = false; + cwr_flag = true; + congestion_flag = false; + } + + if (qsp != NULL && qsp->no_cn) + cwr_flag = false; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, + cwr_flag, ecn_ce_flag, &pkti, (int) delta); + + if (drop_flag) { + __sync_add_and_fetch(&(qdp->lasttime), -delta_send); + rv = DROP_PKT; + } + + if (cwr_flag) + rv |= CWR; + return rv; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000000..1752a46a2b05 --- /dev/null +++ b/samples/bpf/hbm_kern.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for sample Host Bandwidth Manager (HBM) BPF programs + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/filter.h> +#include <uapi/linux/pkt_cls.h> +#include <net/ipv6.h> +#include <net/inet_ecn.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> +#include "hbm.h" + +#define DROP_PKT 0 +#define ALLOW_PKT 1 +#define TCP_ECN_OK 1 +#define CWR 2 + +#ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging +#undef bpf_printk +#define bpf_printk(fmt, ...) +#endif + +#define INITIAL_CREDIT_PACKETS 100 +#define MAX_BYTES_PER_PACKET 1500 +#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) +#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) +#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) +#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) +#define LARGE_PKT_THRESH 120 +#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) +#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) + +// Time base accounting for fq's EDT +#define BURST_SIZE_NS 100000 // 100us +#define MARK_THRESH_NS 50000 // 50us +#define DROP_THRESH_NS 500000 // 500us +// Reserve 20us of queuing for small packets (less than 120 bytes) +#define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000) +#define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS) + +// rate in bytes per ns << 20 +#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) +#define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) +#define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate)) + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, struct hbm_vqueue); +} queue_state SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct hbm_queue_stats); +} queue_stats SEC(".maps"); + +struct hbm_pkt_info { + int cwnd; + int rtt; + int packets_out; + bool is_ip; + bool is_tcp; + short ecn; +}; + +static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti) +{ + struct bpf_sock *sk; + struct bpf_tcp_sock *tp; + + sk = skb->sk; + if (sk) { + sk = bpf_sk_fullsock(sk); + if (sk) { + if (sk->protocol == IPPROTO_TCP) { + tp = bpf_tcp_sock(sk); + if (tp) { + pkti->cwnd = tp->snd_cwnd; + pkti->rtt = tp->srtt_us >> 3; + pkti->packets_out = tp->packets_out; + return 0; + } + } + } + } + pkti->cwnd = 0; + pkti->rtt = 0; + pkti->packets_out = 0; + return 1; +} + +static void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) +{ + struct iphdr iph; + struct ipv6hdr *ip6h; + + pkti->cwnd = 0; + pkti->rtt = 0; + bpf_skb_load_bytes(skb, 0, &iph, 12); + if (iph.version == 6) { + ip6h = (struct ipv6hdr *)&iph; + pkti->is_ip = true; + pkti->is_tcp = (ip6h->nexthdr == 6); + pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; + } else if (iph.version == 4) { + pkti->is_ip = true; + pkti->is_tcp = (iph.protocol == 6); + pkti->ecn = iph.tos & INET_ECN_MASK; + } else { + pkti->is_ip = false; + pkti->is_tcp = false; + pkti->ecn = 0; + } + if (pkti->is_tcp) + get_tcp_info(skb, pkti); +} + +static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) +{ + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp, + int rate) +{ + unsigned long long curtime; + + curtime = bpf_ktime_get_ns(); + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = curtime - BURST_SIZE_NS; // support initial burst + qdp->credit = 0; // not used + qdp->rate = rate * 128; +} + +static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, + int len, + unsigned long long curtime, + bool congestion_flag, + bool drop_flag, + bool cwr_flag, + bool ecn_ce_flag, + struct hbm_pkt_info *pkti, + int credit) +{ + int rv = ALLOW_PKT; + + if (qsp != NULL) { + // Following is needed for work conserving + __sync_add_and_fetch(&(qsp->bytes_total), len); + if (qsp->stats) { + // Optionally update statistics + if (qsp->firstPacketTime == 0) + qsp->firstPacketTime = curtime; + qsp->lastPacketTime = curtime; + __sync_add_and_fetch(&(qsp->pkts_total), 1); + if (congestion_flag) { + __sync_add_and_fetch(&(qsp->pkts_marked), 1); + __sync_add_and_fetch(&(qsp->bytes_marked), len); + } + if (drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_dropped), 1); + __sync_add_and_fetch(&(qsp->bytes_dropped), + len); + } + if (ecn_ce_flag) + __sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1); + if (pkti->cwnd) { + __sync_add_and_fetch(&(qsp->sum_cwnd), + pkti->cwnd); + __sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1); + } + if (pkti->rtt) + __sync_add_and_fetch(&(qsp->sum_rtt), + pkti->rtt); + __sync_add_and_fetch(&(qsp->sum_credit), credit); + + if (drop_flag) + rv = DROP_PKT; + if (cwr_flag) + rv |= 2; + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qsp->returnValCount[0]), + 1); + else if (rv == ALLOW_PKT) + __sync_add_and_fetch(&(qsp->returnValCount[1]), + 1); + else if (rv == 2) + __sync_add_and_fetch(&(qsp->returnValCount[2]), + 1); + else if (rv == 3) + __sync_add_and_fetch(&(qsp->returnValCount[3]), + 1); + } + } +} diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000000..829934bd43cb --- /dev/null +++ b/samples/bpf/hbm_out_kern.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * the user program can access. + * + * A latter patch provides such a program (hbm.c) + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + struct hbm_pkt_info pkti; + int len = skb->len; + unsigned int queue_index = 0; + unsigned long long curtime; + int credit; + signed long long delta = 0, new_credit; + int max_credit = MAX_CREDIT; + bool congestion_flag = false; + bool drop_flag = false; + bool cwr_flag = false; + bool ecn_ce_flag = false; + struct hbm_vqueue *qdp; + struct hbm_queue_stats *qsp = NULL; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + else if (qdp->lasttime == 0) + hbm_init_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + credit = qdp->credit; + delta = curtime - qdp->lasttime; + /* delta < 0 implies that another process with a curtime greater + * than ours beat us to the critical section and already added + * the new credit, so we should not add it ourselves + */ + if (delta > 0) { + qdp->lasttime = curtime; + new_credit = credit + CREDIT_PER_NS(delta, qdp->rate); + if (new_credit > MAX_CREDIT) + credit = MAX_CREDIT; + else + credit = new_credit; + } + credit -= len; + qdp->credit = credit; + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { + qdp->rate = qsp->rate * 128; + bpf_printk("Updating rate: %d (1sec:%llu bits)\n", + (int)qdp->rate, + CREDIT_PER_NS(1000000000, qdp->rate) * 8); + } + + // Set flags (drop, congestion, cwr) + // Dropping => we are congested, so ignore congestion flag + if (credit < -DROP_THRESH || + (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop packet + drop_flag = true; + if (pkti.ecn) + congestion_flag = true; + else if (pkti.is_tcp) + cwr_flag = true; + } else if (credit < 0) { + // Congested, set congestion flag + if (pkti.ecn || pkti.is_tcp) { + if (credit < -MARK_THRESH) + congestion_flag = true; + else + congestion_flag = false; + } else { + congestion_flag = true; + } + } + + if (congestion_flag) { + if (bpf_skb_ecn_set_ce(skb)) { + ecn_ce_flag = true; + } else { + if (pkti.is_tcp) { + unsigned int rand = bpf_get_prandom_u32(); + + if (-credit >= MARK_THRESH + + (rand % MARK_REGION_SIZE)) { + // Do congestion control + cwr_flag = true; + } + } else if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + } + } + } + + if (qsp != NULL) + if (qsp->no_cn) + cwr_flag = false; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, + cwr_flag, ecn_ce_flag, &pkti, credit); + + if (drop_flag) { + __sync_add_and_fetch(&(qdp->credit), len); + rv = DROP_PKT; + } + + if (cwr_flag) + rv |= 2; + return rv; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c new file mode 100644 index 000000000000..f07474c72525 --- /dev/null +++ b/samples/bpf/ibumad_kern.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + +/* + * ibumad BPF sample kernel side + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Copyright(c) 2018 Ira Weiny, Intel Corporation + */ + +#define KBUILD_MODNAME "ibumad_count_pkts_by_class" +#include <uapi/linux/bpf.h> + +#include <bpf/bpf_helpers.h> + + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); /* class; u32 required */ + __type(value, u64); /* count of mads read */ + __uint(max_entries, 256); /* Room for all Classes */ +} read_count SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); /* class; u32 required */ + __type(value, u64); /* count of mads written */ + __uint(max_entries, 256); /* Room for all Classes */ +} write_count SEC(".maps"); + +#undef DEBUG +#ifndef DEBUG +#undef bpf_printk +#define bpf_printk(fmt, ...) +#endif + +/* Taken from the current format defined in + * include/trace/events/ib_umad.h + * and + * /sys/kernel/tracing/events/ib_umad/ib_umad_read/format + * /sys/kernel/tracing/events/ib_umad/ib_umad_write/format + */ +struct ib_umad_rw_args { + u64 pad; + u8 port_num; + u8 sl; + u8 path_bits; + u8 grh_present; + u32 id; + u32 status; + u32 timeout_ms; + u32 retires; + u32 length; + u32 qpn; + u32 qkey; + u8 gid_index; + u8 hop_limit; + u16 lid; + u16 attr_id; + u16 pkey_index; + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + u32 flow_label; + u16 mad_status; + u16 class_specific; + u32 attr_mod; + u64 tid; + u8 gid[16]; + u32 dev_index; + u8 traffic_class; +}; + +SEC("tracepoint/ib_umad/ib_umad_read_recv") +int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx) +{ + u64 zero = 0, *val; + u8 class = ctx->mgmt_class; + + bpf_printk("ib_umad read recv : class 0x%x\n", class); + + val = bpf_map_lookup_elem(&read_count, &class); + if (!val) { + bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST); + val = bpf_map_lookup_elem(&read_count, &class); + if (!val) + return 0; + } + + (*val) += 1; + + return 0; +} +SEC("tracepoint/ib_umad/ib_umad_read_send") +int on_ib_umad_read_send(struct ib_umad_rw_args *ctx) +{ + u64 zero = 0, *val; + u8 class = ctx->mgmt_class; + + bpf_printk("ib_umad read send : class 0x%x\n", class); + + val = bpf_map_lookup_elem(&read_count, &class); + if (!val) { + bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST); + val = bpf_map_lookup_elem(&read_count, &class); + if (!val) + return 0; + } + + (*val) += 1; + + return 0; +} +SEC("tracepoint/ib_umad/ib_umad_write") +int on_ib_umad_write(struct ib_umad_rw_args *ctx) +{ + u64 zero = 0, *val; + u8 class = ctx->mgmt_class; + + bpf_printk("ib_umad write : class 0x%x\n", class); + + val = bpf_map_lookup_elem(&write_count, &class); + if (!val) { + bpf_map_update_elem(&write_count, &class, &zero, BPF_NOEXIST); + val = bpf_map_lookup_elem(&write_count, &class); + if (!val) + return 0; + } + + (*val) += 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c new file mode 100644 index 000000000000..d074c978aac7 --- /dev/null +++ b/samples/bpf/ibumad_user.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + +/* + * ibumad BPF sample user side + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Copyright(c) 2018 Ira Weiny, Intel Corporation + */ + +#include <linux/bpf.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <limits.h> + +#include <getopt.h> +#include <net/if.h> + +#include <bpf/bpf.h> +#include "bpf_util.h" +#include <bpf/libbpf.h> + +static struct bpf_link *tp_links[3]; +static struct bpf_object *obj; +static int map_fd[2]; +static int tp_cnt; + +static void dump_counts(int fd) +{ + __u32 key; + __u64 value; + + for (key = 0; key < 256; key++) { + if (bpf_map_lookup_elem(fd, &key, &value)) { + printf("failed to read key %u\n", key); + continue; + } + if (value) + printf("0x%02x : %llu\n", key, value); + } +} + +static void dump_all_counts(void) +{ + printf("Read 'Class : count'\n"); + dump_counts(map_fd[0]); + printf("Write 'Class : count'\n"); + dump_counts(map_fd[1]); +} + +static void dump_exit(int sig) +{ + dump_all_counts(); + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); + exit(0); +} + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h'}, + {"delay", required_argument, NULL, 'd'}, +}; + +static void usage(char *cmd) +{ + printf("eBPF test program to count packets from various IP addresses\n" + "Usage: %s <options>\n" + " --help, -h this menu\n" + " --delay, -d <delay> wait <delay> sec between prints [1 - 1000000]\n" + , cmd + ); +} + +int main(int argc, char **argv) +{ + struct bpf_program *prog; + unsigned long delay = 5; + char filename[256]; + int longindex = 0; + int opt, err = -1; + + while ((opt = getopt_long(argc, argv, "hd:rSw", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + delay = strtoul(optarg, NULL, 0); + if (delay == ULONG_MAX || delay < 0 || + delay > 1000000) { + fprintf(stderr, "ERROR: invalid delay : %s\n", + optarg); + usage(argv[0]); + return 1; + } + break; + default: + case 'h': + usage(argv[0]); + return 1; + } + } + + /* Do one final dump when exiting */ + signal(SIGINT, dump_exit); + signal(SIGTERM, dump_exit); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return err; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "read_count"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "write_count"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + tp_links[tp_cnt] = NULL; + goto cleanup; + } + tp_cnt++; + } + + while (1) { + sleep(delay); + dump_all_counts(); + } + err = 0; + +cleanup: + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); + return err; +} diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c index 18fa088473cd..4adfcbbe6ef4 100644 --- a/samples/bpf/lathist_kern.c +++ b/samples/bpf/lathist_kern.c @@ -8,7 +8,7 @@ #include <linux/version.h> #include <linux/ptrace.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define MAX_ENTRIES 20 #define MAX_CPU 4 @@ -18,12 +18,12 @@ * trace_preempt_[on|off] tracepoints hooks is not supported. */ -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u64), - .max_entries = MAX_CPU, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, u64); + __uint(max_entries, MAX_CPU); +} my_map SEC(".maps"); SEC("kprobe/trace_preempt_off") int bpf_prog1(struct pt_regs *ctx) @@ -61,12 +61,12 @@ static unsigned int log2l(unsigned long v) return log2(v); } -struct bpf_map_def SEC("maps") my_lat = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(long), - .max_entries = MAX_CPU * MAX_ENTRIES, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, long); + __uint(max_entries, MAX_CPU * MAX_ENTRIES); +} my_lat SEC(".maps"); SEC("kprobe/trace_preempt_on") int bpf_prog2(struct pt_regs *ctx) diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c index 6477bad5b4e2..7d8ff2418303 100644 --- a/samples/bpf/lathist_user.c +++ b/samples/bpf/lathist_user.c @@ -1,17 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com * Copyright (c) 2015 BMW Car IT GmbH - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/libbpf.h> +#include <bpf/bpf.h> #define MAX_ENTRIES 20 #define MAX_CPU 4 @@ -84,20 +80,51 @@ static void get_data(int fd) int main(int argc, char **argv) { + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + int map_fd, i = 0; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + map_fd = bpf_object__find_map_fd_by_name(obj, "my_lat"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } while (1) { - get_data(map_fd[1]); + get_data(map_fd); print_hist(); sleep(5); } +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c deleted file mode 100644 index e5da6cf71a3e..000000000000 --- a/samples/bpf/load_sock_ops.c +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <linux/bpf.h> -#include "libbpf.h" -#include "bpf_load.h" -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <linux/unistd.h> - -static void usage(char *pname) -{ - printf("USAGE:\n %s [-l] <cg-path> <prog filename>\n", pname); - printf("\tLoad and attach a sock_ops program to the specified " - "cgroup\n"); - printf("\tIf \"-l\" is used, the program will continue to run\n"); - printf("\tprinting the BPF log buffer\n"); - printf("\tIf the specified filename does not end in \".o\", it\n"); - printf("\tappends \"_kern.o\" to the name\n"); - printf("\n"); - printf(" %s -r <cg-path>\n", pname); - printf("\tDetaches the currently attached sock_ops program\n"); - printf("\tfrom the specified cgroup\n"); - printf("\n"); - exit(1); -} - -int main(int argc, char **argv) -{ - int logFlag = 0; - int error = 0; - char *cg_path; - char fn[500]; - char *prog; - int cg_fd; - - if (argc < 3) - usage(argv[0]); - - if (!strcmp(argv[1], "-r")) { - cg_path = argv[2]; - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - error = bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); - if (error) { - printf("ERROR: bpf_prog_detach: %d (%s)\n", - error, strerror(errno)); - return 2; - } - return 0; - } else if (!strcmp(argv[1], "-h")) { - usage(argv[0]); - } else if (!strcmp(argv[1], "-l")) { - logFlag = 1; - if (argc < 4) - usage(argv[0]); - } - - prog = argv[argc - 1]; - cg_path = argv[argc - 2]; - if (strlen(prog) > 480) { - fprintf(stderr, "ERROR: program name too long (> 480 chars)\n"); - return 3; - } - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - - if (!strcmp(prog + strlen(prog)-2, ".o")) - strcpy(fn, prog); - else - sprintf(fn, "%s_kern.o", prog); - if (logFlag) - printf("loading bpf file:%s\n", fn); - if (load_bpf_file(fn)) { - printf("ERROR: load_bpf_file failed for: %s\n", fn); - printf("%s", bpf_log_buf); - return 4; - } - if (logFlag) - printf("TCP BPF Loaded %s\n", fn); - - error = bpf_prog_attach(prog_fd[0], cg_fd, BPF_CGROUP_SOCK_OPS, 0); - if (error) { - printf("ERROR: bpf_prog_attach: %d (%s)\n", - error, strerror(errno)); - return 5; - } else if (logFlag) { - read_trace_pipe(); - } - - return error; -} diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist.bpf.c index df75383280f9..dbab80e813fe 100644 --- a/samples/bpf/lwt_len_hist_kern.c +++ b/samples/bpf/lwt_len_hist.bpf.c @@ -10,36 +10,16 @@ * General Public License for more details. */ -#include <uapi/linux/bpf.h> -#include <uapi/linux/if_ether.h> -#include <uapi/linux/ip.h> -#include <uapi/linux/in.h> -#include "bpf_helpers.h" - -# define printk(fmt, ...) \ - ({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ - }) - -struct bpf_elf_map { - __u32 type; - __u32 size_key; - __u32 size_value; - __u32 max_elem; - __u32 flags; - __u32 id; - __u32 pinning; -}; - -struct bpf_elf_map SEC("maps") lwt_len_hist_map = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .size_key = sizeof(__u64), - .size_value = sizeof(__u64), - .pinning = 2, - .max_elem = 1024, -}; +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __type(key, u64); + __type(value, u64); + __uint(pinning, LIBBPF_PIN_BY_NAME); + __uint(max_entries, 1024); +} lwt_len_hist_map SEC(".maps"); static unsigned int log2(unsigned int v) { diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh index 7d567744c7fa..381b2c634784 100644..100755 --- a/samples/bpf/lwt_len_hist.sh +++ b/samples/bpf/lwt_len_hist.sh @@ -1,12 +1,15 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 NS1=lwt_ns1 VETH0=tst_lwt1a VETH1=tst_lwt1b - -TRACE_ROOT=/sys/kernel/debug/tracing +BPF_PROG=lwt_len_hist.bpf.o +TRACE_ROOT=/sys/kernel/tracing function cleanup { + # To reset saved histogram, remove pinned map + rm /sys/fs/bpf/tc/globals/lwt_len_hist_map ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null ip link del $VETH0 2> /dev/null ip link del $VETH1 2> /dev/null @@ -27,7 +30,7 @@ ip netns exec $NS1 netserver echo 1 > ${TRACE_ROOT}/tracing_on cp /dev/null ${TRACE_ROOT}/trace -ip route add 192.168.253.2/32 encap bpf out obj lwt_len_hist_kern.o section len_hist dev $VETH0 +ip route add 192.168.253.2/32 encap bpf out obj $BPF_PROG section len_hist dev $VETH0 netperf -H 192.168.253.2 -t TCP_STREAM cat ${TRACE_ROOT}/trace | grep -v '^#' ./lwt_len_hist diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c index ec8f3bbcbef3..430a4b7e353e 100644 --- a/samples/bpf/lwt_len_hist_user.c +++ b/samples/bpf/lwt_len_hist_user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/unistd.h> #include <linux/bpf.h> @@ -8,14 +9,12 @@ #include <errno.h> #include <arpa/inet.h> -#include "libbpf.h" +#include <bpf/bpf.h> #include "bpf_util.h" #define MAX_INDEX 64 #define MAX_STARS 38 -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - static void stars(char *str, long val, long max, int width) { int i; diff --git a/samples/bpf/map_perf_test.bpf.c b/samples/bpf/map_perf_test.bpf.c new file mode 100644 index 000000000000..3cdeba2afe12 --- /dev/null +++ b/samples/bpf/map_perf_test.bpf.c @@ -0,0 +1,297 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include "vmlinux.h" +#include <errno.h> +#include <linux/version.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +#define MAX_ENTRIES 1000 +#define MAX_NR_CPUS 1024 + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, 10000); +} lru_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, 10000); + __uint(map_flags, BPF_F_NO_COMMON_LRU); +} nocommon_lru_hash_map SEC(".maps"); + +struct inner_lru { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NUMA_NODE); + __uint(numa_node, 0); +} inner_lru_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_NR_CPUS); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_lru); /* use inner_lru as inner map */ +} array_of_lru_hashs SEC(".maps") = { + /* statically initialize the first element */ + .values = { &inner_lru_hash_map }, +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(long)); + __uint(max_entries, MAX_ENTRIES); +} percpu_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} hash_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(long)); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} percpu_hash_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(long)); + __uint(max_entries, 10000); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_trie_map_alloc SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} array_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, u32); + __type(value, long); + __uint(max_entries, MAX_ENTRIES); +} lru_hash_lookup_map SEC(".maps"); + +SEC("ksyscall/getuid") +int BPF_KSYSCALL(stress_hmap) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + int i; + + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) + bpf_map_delete_elem(&hash_map, &key); + } + + return 0; +} + +SEC("ksyscall/geteuid") +int BPF_KSYSCALL(stress_percpu_hmap) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + int i; + + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map, &key); + } + return 0; +} + +SEC("ksyscall/getgid") +int BPF_KSYSCALL(stress_hmap_alloc) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + int i; + + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&hash_map_alloc, &key); + } + return 0; +} + +SEC("ksyscall/getegid") +int BPF_KSYSCALL(stress_percpu_hmap_alloc) +{ + u32 key = bpf_get_current_pid_tgid(); + long init_val = 1; + long *value; + int i; + + for (i = 0; i < 10; i++) { + bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); + value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); + if (value) + bpf_map_delete_elem(&percpu_hash_map_alloc, &key); + } + return 0; +} +SEC("ksyscall/connect") +int BPF_KSYSCALL(stress_lru_hmap_alloc, int fd, struct sockaddr_in *uservaddr, + int addrlen) +{ + char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; + union { + u16 dst6[8]; + struct { + u16 magic0; + u16 magic1; + u16 tcase; + u16 unused16; + u32 unused32; + u32 key; + }; + } test_params; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)uservaddr; + u16 test_case; + long val = 1; + u32 key = 0; + int ret; + + if (addrlen != sizeof(*in6)) + return 0; + + ret = bpf_probe_read_user(test_params.dst6, sizeof(test_params.dst6), + &in6->sin6_addr); + if (ret) + goto done; + + if (test_params.magic0 != 0xdead || + test_params.magic1 != 0xbeef) + return 0; + + test_case = test_params.tcase; + if (test_case != 3) + key = bpf_get_prandom_u32(); + + if (test_case == 0) { + ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); + } else if (test_case == 1) { + ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val, + BPF_ANY); + } else if (test_case == 2) { + void *nolocal_lru_map; + int cpu = bpf_get_smp_processor_id(); + + nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs, + &cpu); + if (!nolocal_lru_map) { + ret = -ENOENT; + goto done; + } + + ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, + BPF_ANY); + } else if (test_case == 3) { + u32 i; + + key = test_params.key; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; i++) { + bpf_map_lookup_elem(&lru_hash_lookup_map, &key); + key++; + } + } else { + ret = -EINVAL; + } + +done: + if (ret) + bpf_trace_printk(fmt, sizeof(fmt), ret); + + return 0; +} + +SEC("ksyscall/gettid") +int BPF_KSYSCALL(stress_lpm_trie_map_alloc) +{ + union { + u32 b32[2]; + u8 b8[8]; + } key; + unsigned int i; + + key.b32[0] = 32; + key.b8[4] = 192; + key.b8[5] = 168; + key.b8[6] = 0; + key.b8[7] = 1; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; ++i) + bpf_map_lookup_elem(&lpm_trie_map_alloc, &key); + + return 0; +} + +SEC("ksyscall/getpgid") +int BPF_KSYSCALL(stress_hash_map_lookup) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&hash_map, &key); + + return 0; +} + +SEC("ksyscall/getppid") +int BPF_KSYSCALL(stress_array_map_lookup) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&array_map, &key); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c deleted file mode 100644 index 245165817fbe..000000000000 --- a/samples/bpf/map_perf_test_kern.c +++ /dev/null @@ -1,249 +0,0 @@ -/* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/version.h> -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" - -#define MAX_ENTRIES 1000 -#define MAX_NR_CPUS 1024 - -struct bpf_map_def SEC("maps") hash_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, -}; - -struct bpf_map_def SEC("maps") lru_hash_map = { - .type = BPF_MAP_TYPE_LRU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = 10000, -}; - -struct bpf_map_def SEC("maps") nocommon_lru_hash_map = { - .type = BPF_MAP_TYPE_LRU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = 10000, - .map_flags = BPF_F_NO_COMMON_LRU, -}; - -struct bpf_map_def SEC("maps") inner_lru_hash_map = { - .type = BPF_MAP_TYPE_LRU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, -}; - -struct bpf_map_def SEC("maps") array_of_lru_hashs = { - .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, - .key_size = sizeof(u32), - .max_entries = MAX_NR_CPUS, -}; - -struct bpf_map_def SEC("maps") percpu_hash_map = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, -}; - -struct bpf_map_def SEC("maps") hash_map_alloc = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, - .map_flags = BPF_F_NO_PREALLOC, -}; - -struct bpf_map_def SEC("maps") percpu_hash_map_alloc = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, - .map_flags = BPF_F_NO_PREALLOC, -}; - -struct bpf_map_def SEC("maps") lpm_trie_map_alloc = { - .type = BPF_MAP_TYPE_LPM_TRIE, - .key_size = 8, - .value_size = sizeof(long), - .max_entries = 10000, - .map_flags = BPF_F_NO_PREALLOC, -}; - -struct bpf_map_def SEC("maps") array_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = MAX_ENTRIES, -}; - -SEC("kprobe/sys_getuid") -int stress_hmap(struct pt_regs *ctx) -{ - u32 key = bpf_get_current_pid_tgid(); - long init_val = 1; - long *value; - - bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&hash_map, &key); - if (value) - bpf_map_delete_elem(&hash_map, &key); - - return 0; -} - -SEC("kprobe/sys_geteuid") -int stress_percpu_hmap(struct pt_regs *ctx) -{ - u32 key = bpf_get_current_pid_tgid(); - long init_val = 1; - long *value; - - bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&percpu_hash_map, &key); - if (value) - bpf_map_delete_elem(&percpu_hash_map, &key); - return 0; -} - -SEC("kprobe/sys_getgid") -int stress_hmap_alloc(struct pt_regs *ctx) -{ - u32 key = bpf_get_current_pid_tgid(); - long init_val = 1; - long *value; - - bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&hash_map_alloc, &key); - if (value) - bpf_map_delete_elem(&hash_map_alloc, &key); - return 0; -} - -SEC("kprobe/sys_getegid") -int stress_percpu_hmap_alloc(struct pt_regs *ctx) -{ - u32 key = bpf_get_current_pid_tgid(); - long init_val = 1; - long *value; - - bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); - value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); - if (value) - bpf_map_delete_elem(&percpu_hash_map_alloc, &key); - return 0; -} - -SEC("kprobe/sys_connect") -int stress_lru_hmap_alloc(struct pt_regs *ctx) -{ - struct sockaddr_in6 *in6; - u16 test_case, dst6[8]; - int addrlen, ret; - char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n"; - long val = 1; - u32 key = bpf_get_prandom_u32(); - - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); - addrlen = (int)PT_REGS_PARM3(ctx); - - if (addrlen != sizeof(*in6)) - return 0; - - ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); - if (ret) - goto done; - - if (dst6[0] != 0xdead || dst6[1] != 0xbeef) - return 0; - - test_case = dst6[7]; - - if (test_case == 0) { - ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); - } else if (test_case == 1) { - ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val, - BPF_ANY); - } else if (test_case == 2) { - void *nolocal_lru_map; - int cpu = bpf_get_smp_processor_id(); - - nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs, - &cpu); - if (!nolocal_lru_map) { - ret = -ENOENT; - goto done; - } - - ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, - BPF_ANY); - } else { - ret = -EINVAL; - } - -done: - if (ret) - bpf_trace_printk(fmt, sizeof(fmt), ret); - - return 0; -} - -SEC("kprobe/sys_gettid") -int stress_lpm_trie_map_alloc(struct pt_regs *ctx) -{ - union { - u32 b32[2]; - u8 b8[8]; - } key; - unsigned int i; - - key.b32[0] = 32; - key.b8[4] = 192; - key.b8[5] = 168; - key.b8[6] = 0; - key.b8[7] = 1; - -#pragma clang loop unroll(full) - for (i = 0; i < 32; ++i) - bpf_map_lookup_elem(&lpm_trie_map_alloc, &key); - - return 0; -} - -SEC("kprobe/sys_getpgid") -int stress_hash_map_lookup(struct pt_regs *ctx) -{ - u32 key = 1, i; - long *value; - -#pragma clang loop unroll(full) - for (i = 0; i < 64; ++i) - value = bpf_map_lookup_elem(&hash_map, &key); - - return 0; -} - -SEC("kprobe/sys_getpgrp") -int stress_array_map_lookup(struct pt_regs *ctx) -{ - u32 key = 1, i; - long *value; - -#pragma clang loop unroll(full) - for (i = 0; i < 64; ++i) - value = bpf_map_lookup_elem(&array_map, &key); - - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 1a8894b5ac51..07ff471ed6ae 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #define _GNU_SOURCE #include <sched.h> @@ -14,15 +11,13 @@ #include <sys/wait.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> #include <string.h> #include <time.h> -#include <sys/resource.h> #include <arpa/inet.h> #include <errno.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #define TEST_BIT(t) (1U << (t)) #define MAX_NR_CPUS 1024 @@ -46,6 +41,7 @@ enum test_type { HASH_LOOKUP, ARRAY_LOOKUP, INNER_LRU_HASH_PREALLOC, + LRU_HASH_LOOKUP, NR_TESTS, }; @@ -60,14 +56,23 @@ const char *test_map_names[NR_TESTS] = { [HASH_LOOKUP] = "hash_map", [ARRAY_LOOKUP] = "array_map", [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map", + [LRU_HASH_LOOKUP] = "lru_hash_lookup_map", }; +enum map_idx { + array_of_lru_hashs_idx, + hash_map_alloc_idx, + lru_hash_lookup_idx, + NR_IDXES, +}; + +static int map_fd[NR_IDXES]; + static int test_flags = ~0; static uint32_t num_map_entries; static uint32_t inner_lru_hash_size; -static int inner_lru_hash_idx = -1; -static int array_of_lru_hashs_idx = -1; -static uint32_t max_cnt = 1000000; +static int lru_hash_lookup_test_entries = 32; +static uint32_t max_cnt = 10000; static int check_test_flags(enum test_type t) { @@ -86,6 +91,32 @@ static void test_hash_prealloc(int cpu) cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); } +static int pre_test_lru_hash_lookup(int tasks) +{ + int fd = map_fd[lru_hash_lookup_idx]; + uint32_t key; + long val = 1; + int ret; + + if (num_map_entries > lru_hash_lookup_test_entries) + lru_hash_lookup_test_entries = num_map_entries; + + /* Populate the lru_hash_map for LRU_HASH_LOOKUP perf test. + * + * It is fine that the user requests for a map with + * num_map_entries < 32 and some of the later lru hash lookup + * may return not found. For LRU map, we are not interested + * in such small map performance. + */ + for (key = 0; key < lru_hash_lookup_test_entries; key++) { + ret = bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); + if (ret) + return ret; + } + + return 0; +} + static void do_test_lru(enum test_type test, int cpu) { static int inner_lru_map_fds[MAX_NR_CPUS]; @@ -95,23 +126,33 @@ static void do_test_lru(enum test_type test, int cpu) __u64 start_time; int i, ret; - if (test == INNER_LRU_HASH_PREALLOC) { + if (test == INNER_LRU_HASH_PREALLOC && cpu) { + /* If CPU is not 0, create inner_lru hash map and insert the fd + * value into the array_of_lru_hash map. In case of CPU 0, + * 'inner_lru_hash_map' was statically inserted on the map init + */ int outer_fd = map_fd[array_of_lru_hashs_idx]; + unsigned int mycpu, mynode; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_NUMA_NODE, + ); assert(cpu < MAX_NR_CPUS); - if (cpu) { - inner_lru_map_fds[cpu] = - bpf_create_map(BPF_MAP_TYPE_LRU_HASH, - sizeof(uint32_t), sizeof(long), - inner_lru_hash_size, 0); - if (inner_lru_map_fds[cpu] == -1) { - printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", - strerror(errno), errno); - exit(1); - } - } else { - inner_lru_map_fds[cpu] = map_fd[inner_lru_hash_idx]; + ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL); + assert(!ret); + + opts.numa_node = mynode; + inner_lru_map_fds[cpu] = + bpf_map_create(BPF_MAP_TYPE_LRU_HASH, + test_map_names[INNER_LRU_HASH_PREALLOC], + sizeof(uint32_t), + sizeof(long), + inner_lru_hash_size, &opts); + if (inner_lru_map_fds[cpu] == -1) { + printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", + strerror(errno), errno); + exit(1); } ret = bpf_map_update_elem(outer_fd, &cpu, @@ -129,13 +170,17 @@ static void do_test_lru(enum test_type test, int cpu) if (test == LRU_HASH_PREALLOC) { test_name = "lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 0; + in6.sin6_addr.s6_addr16[2] = 0; } else if (test == NOCOMMON_LRU_HASH_PREALLOC) { test_name = "nocommon_lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 1; + in6.sin6_addr.s6_addr16[2] = 1; } else if (test == INNER_LRU_HASH_PREALLOC) { test_name = "inner_lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 2; + in6.sin6_addr.s6_addr16[2] = 2; + } else if (test == LRU_HASH_LOOKUP) { + test_name = "lru_hash_lookup_perf"; + in6.sin6_addr.s6_addr16[2] = 3; + in6.sin6_addr.s6_addr32[3] = 0; } else { assert(0); } @@ -144,6 +189,11 @@ static void do_test_lru(enum test_type test, int cpu) for (i = 0; i < max_cnt; i++) { ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6)); assert(ret == -1 && errno == EBADF); + if (in6.sin6_addr.s6_addr32[3] < + lru_hash_lookup_test_entries - 32) + in6.sin6_addr.s6_addr32[3] += 32; + else + in6.sin6_addr.s6_addr32[3] = 0; } printf("%d:%s pre-alloc %lld events per sec\n", cpu, test_name, @@ -165,6 +215,11 @@ static void test_inner_lru_hash_prealloc(int cpu) do_test_lru(INNER_LRU_HASH_PREALLOC, cpu); } +static void test_lru_hash_lookup(int cpu) +{ + do_test_lru(LRU_HASH_LOOKUP, cpu); +} + static void test_percpu_hash_prealloc(int cpu) { __u64 start_time; @@ -232,11 +287,16 @@ static void test_array_lookup(int cpu) start_time = time_get_ns(); for (i = 0; i < max_cnt; i++) - syscall(__NR_getpgrp, 0); + syscall(__NR_getppid, 0); printf("%d:array_lookup %lld lookups per sec\n", cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); } +typedef int (*pre_test_func)(int tasks); +const pre_test_func pre_test_funcs[] = { + [LRU_HASH_LOOKUP] = pre_test_lru_hash_lookup, +}; + typedef void (*test_func)(int cpu); const test_func test_funcs[] = { [HASH_PREALLOC] = test_hash_prealloc, @@ -249,8 +309,25 @@ const test_func test_funcs[] = { [HASH_LOOKUP] = test_hash_lookup, [ARRAY_LOOKUP] = test_array_lookup, [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc, + [LRU_HASH_LOOKUP] = test_lru_hash_lookup, }; +static int pre_test(int tasks) +{ + int i; + + for (i = 0; i < NR_TESTS; i++) { + if (pre_test_funcs[i] && check_test_flags(i)) { + int ret = pre_test_funcs[i](tasks); + + if (ret) + return ret; + } + } + + return 0; +} + static void loop(int cpu) { cpu_set_t cpuset; @@ -271,6 +348,8 @@ static void run_perf_test(int tasks) pid_t pid[tasks]; int i; + assert(!pre_test(tasks)); + for (i = 0; i < tasks; i++) { pid[i] = fork(); if (pid[i] == 0) { @@ -291,7 +370,7 @@ static void run_perf_test(int tasks) static void fill_lpm_trie(void) { - struct bpf_lpm_trie_key *key; + struct bpf_lpm_trie_key_u8 *key; unsigned long value = 0; unsigned int i; int r; @@ -305,7 +384,8 @@ static void fill_lpm_trie(void) key->data[1] = rand() & 0xff; key->data[2] = rand() & 0xff; key->data[3] = rand() & 0xff; - r = bpf_map_update_elem(map_fd[6], key, &value, 0); + r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], + key, &value, 0); assert(!r); } @@ -316,56 +396,46 @@ static void fill_lpm_trie(void) key->data[3] = 1; value = 128; - r = bpf_map_update_elem(map_fd[6], key, &value, 0); + r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], key, &value, 0); assert(!r); } -static void fixup_map(struct bpf_map_data *map, int idx) +static void fixup_map(struct bpf_object *obj) { + struct bpf_map *map; int i; - if (!strcmp("inner_lru_hash_map", map->name)) { - inner_lru_hash_idx = idx; - inner_lru_hash_size = map->def.max_entries; - } + bpf_object__for_each_map(map, obj) { + const char *name = bpf_map__name(map); - if (!strcmp("array_of_lru_hashs", map->name)) { - if (inner_lru_hash_idx == -1) { - printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n"); - exit(1); + /* Only change the max_entries for the enabled test(s) */ + for (i = 0; i < NR_TESTS; i++) { + if (!strcmp(test_map_names[i], name) && + (check_test_flags(i))) { + bpf_map__set_max_entries(map, num_map_entries); + continue; + } } - map->def.inner_map_idx = inner_lru_hash_idx; - array_of_lru_hashs_idx = idx; } - if (num_map_entries <= 0) - return; - inner_lru_hash_size = num_map_entries; - - /* Only change the max_entries for the enabled test(s) */ - for (i = 0; i < NR_TESTS; i++) { - if (!strcmp(test_map_names[i], map->name) && - (check_test_flags(i))) { - map->def.max_entries = num_map_entries; - } - } } int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + struct bpf_link *links[8]; + struct bpf_program *prog; + struct bpf_object *obj; + struct bpf_map *map; char filename[256]; - int num_cpu = 8; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); + int i = 0; if (argc > 1) test_flags = atoi(argv[1]) ? : test_flags; if (argc > 2) - num_cpu = atoi(argv[2]) ? : num_cpu; + nr_cpus = atoi(argv[2]) ? : nr_cpus; if (argc > 3) num_map_entries = atoi(argv[3]); @@ -373,14 +443,61 @@ int main(int argc, char **argv) if (argc > 4) max_cnt = atoi(argv[4]); - if (load_bpf_file_fixup_map(filename, fixup_map)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + map = bpf_object__find_map_by_name(obj, "inner_lru_hash_map"); + if (libbpf_get_error(map)) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + inner_lru_hash_size = bpf_map__max_entries(map); + if (!inner_lru_hash_size) { + fprintf(stderr, "ERROR: failed to get map attribute\n"); + goto cleanup; + } + + /* resize BPF map prior to loading */ + if (num_map_entries > 0) + fixup_map(obj); + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "array_of_lru_hashs"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "hash_map_alloc"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "lru_hash_lookup_map"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } fill_lpm_trie(); - run_perf_test(num_cpu); + run_perf_test(nr_cpus); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/net_shared.h b/samples/bpf/net_shared.h new file mode 100644 index 000000000000..88cc52461c98 --- /dev/null +++ b/samples/bpf/net_shared.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _NET_SHARED_H +#define _NET_SHARED_H + +#define AF_INET 2 +#define AF_INET6 10 + +#define ETH_ALEN 6 +#define ETH_P_802_3_MIN 0x0600 +#define ETH_P_8021Q 0x8100 +#define ETH_P_8021AD 0x88A8 +#define ETH_P_IP 0x0800 +#define ETH_P_IPV6 0x86DD +#define ETH_P_ARP 0x0806 +#define IPPROTO_ICMPV6 58 + +#define TC_ACT_OK 0 +#define TC_ACT_SHOT 2 + +#define IFNAMSIZ 16 + +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define bpf_ntohs(x) __builtin_bswap16(x) +#define bpf_htons(x) __builtin_bswap16(x) +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \ + __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define bpf_ntohs(x) (x) +#define bpf_htons(x) (x) +#else +# error "Endianness detection needs to be set up for your compiler?!" +#endif + +#endif diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime.bpf.c index e7d9a0a3d45b..4a65ba76c1b1 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime.bpf.c @@ -4,16 +4,18 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" -#include <uapi/linux/ptrace.h> -#include <uapi/linux/perf_event.h> +#include "vmlinux.h" #include <linux/version.h> -#include <linux/sched.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> -#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;}) +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif #define MINBLOCK_US 1 +#define MAX_ENTRIES 10000 struct key_t { char waker[TASK_COMM_LEN]; @@ -22,49 +24,47 @@ struct key_t { u32 tret; }; -struct bpf_map_def SEC("maps") counts = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct key_t), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, MAX_ENTRIES); +} counts SEC(".maps"); -struct bpf_map_def SEC("maps") start = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_ENTRIES); +} start SEC(".maps"); struct wokeby_t { char name[TASK_COMM_LEN]; u32 ret; }; -struct bpf_map_def SEC("maps") wokeby = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(struct wokeby_t), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, struct wokeby_t); + __uint(max_entries, MAX_ENTRIES); +} wokeby SEC(".maps"); -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, MAX_ENTRIES); +} stackmap SEC(".maps"); #define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) SEC("kprobe/try_to_wake_up") int waker(struct pt_regs *ctx) { - struct task_struct *p = (void *) PT_REGS_PARM1(ctx); + struct task_struct *p = (void *)PT_REGS_PARM1_CORE(ctx); + u32 pid = BPF_CORE_READ(p, pid); struct wokeby_t woke; - u32 pid; - - pid = _(p->pid); bpf_get_current_comm(&woke.name, sizeof(woke.name)); woke.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS); @@ -103,29 +103,19 @@ static inline int update_counts(void *ctx, u32 pid, u64 delta) } #if 1 -/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ -struct sched_switch_args { - unsigned long long pad; - char prev_comm[16]; - int prev_pid; - int prev_prio; - long long prev_state; - char next_comm[16]; - int next_pid; - int next_prio; -}; +/* taken from /sys/kernel/tracing/events/sched/sched_switch/format */ SEC("tracepoint/sched/sched_switch") -int oncpu(struct sched_switch_args *ctx) +int oncpu(struct trace_event_raw_sched_switch *ctx) { /* record previous thread sleep time */ u32 pid = ctx->prev_pid; #else -SEC("kprobe/finish_task_switch") +SEC("kprobe.multi/finish_task_switch*") int oncpu(struct pt_regs *ctx) { - struct task_struct *p = (void *) PT_REGS_PARM1(ctx); + struct task_struct *p = (void *)PT_REGS_PARM1_CORE(ctx); /* record previous thread sleep time */ - u32 pid = _(p->pid); + u32 pid = BPF_CORE_READ(p, pid); #endif u64 delta, ts, *tsp; diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c index 512f87a5fd20..5557b5393642 100644 --- a/samples/bpf/offwaketime_user.c +++ b/samples/bpf/offwaketime_user.c @@ -1,25 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> -#include <string.h> #include <linux/perf_event.h> #include <errno.h> -#include <assert.h> #include <stdbool.h> -#include <sys/resource.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "trace_helpers.h" #define PRINT_RAW_ADDR 0 +/* counts, stackmap */ +static int map_fd[2]; + static void print_ksym(__u64 addr) { struct ksym *sym; @@ -27,6 +24,11 @@ static void print_ksym(__u64 addr) if (!addr) return; sym = ksym_search(addr); + if (!sym) { + printf("ksym not found. Is kallsyms loaded?\n"); + return; + } + if (PRINT_RAW_ADDR) printf("%s/%llx;", sym->name, addr); else @@ -49,14 +51,14 @@ static void print_stack(struct key_t *key, __u64 count) int i; printf("%s;", key->target); - if (bpf_map_lookup_elem(map_fd[3], &key->tret, ip) != 0) { + if (bpf_map_lookup_elem(map_fd[1], &key->tret, ip) != 0) { printf("---;"); } else { for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) print_ksym(ip[i]); } printf("-;"); - if (bpf_map_lookup_elem(map_fd[3], &key->wret, ip) != 0) { + if (bpf_map_lookup_elem(map_fd[1], &key->wret, ip) != 0) { printf("---;"); } else { for (i = 0; i < PERF_MAX_STACK_DEPTH; i++) @@ -92,24 +94,49 @@ static void int_exit(int sig) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj = NULL; + struct bpf_link *links[2]; + struct bpf_program *prog; + int delay = 1, i = 0; char filename[256]; - int delay = 1; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); - - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return 2; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } if (argc > 1) @@ -117,5 +144,10 @@ int main(int argc, char **argv) sleep(delay); print_stacks(map_fd[0]); +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/parse_ldabs.c b/samples/bpf/parse_ldabs.c index 6db6b21fdc6d..c6f65f90a097 100644 --- a/samples/bpf/parse_ldabs.c +++ b/samples/bpf/parse_ldabs.c @@ -11,7 +11,8 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" #define DEFAULT_PKTGEN_UDP_PORT 9 #define IP_MF 0x2000 diff --git a/samples/bpf/parse_simple.c b/samples/bpf/parse_simple.c index 10af53d33cc2..4a486cb1e0df 100644 --- a/samples/bpf/parse_simple.c +++ b/samples/bpf/parse_simple.c @@ -12,7 +12,7 @@ #include <linux/udp.h> #include <uapi/linux/bpf.h> #include <net/ip.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define DEFAULT_PKTGEN_UDP_PORT 9 diff --git a/samples/bpf/parse_varlen.c b/samples/bpf/parse_varlen.c index 95c16324760c..d8623846e810 100644 --- a/samples/bpf/parse_varlen.c +++ b/samples/bpf/parse_varlen.c @@ -6,6 +6,7 @@ */ #define KBUILD_MODNAME "foo" #include <linux/if_ether.h> +#include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/in.h> @@ -13,7 +14,7 @@ #include <linux/udp.h> #include <uapi/linux/bpf.h> #include <net/ip.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define DEFAULT_PKTGEN_UDP_PORT 9 #define DEBUG 0 @@ -108,11 +109,6 @@ static int parse_ipv6(void *data, uint64_t nh_off, void *data_end) return 0; } -struct vlan_hdr { - uint16_t h_vlan_TCI; - uint16_t h_vlan_encapsulated_proto; -}; - SEC("varlen") int handle_ingress(struct __sk_buff *skb) { diff --git a/samples/bpf/run_cookie_uid_helper_example.sh b/samples/bpf/run_cookie_uid_helper_example.sh index f898cfa2b1aa..fc6bc0451ab4 100755 --- a/samples/bpf/run_cookie_uid_helper_example.sh +++ b/samples/bpf/run_cookie_uid_helper_example.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 local_dir="$(pwd)" root_dir=$local_dir/../.. mnt_dir=$(mktemp -d --tmp) diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c index ceabf31079cf..a3f8a3998e0a 100644 --- a/samples/bpf/sampleip_kern.c +++ b/samples/bpf/sampleip_kern.c @@ -4,20 +4,20 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include <linux/version.h> #include <linux/ptrace.h> #include <uapi/linux/bpf.h> #include <uapi/linux/bpf_perf_event.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #define MAX_IPS 8192 -struct bpf_map_def SEC("maps") ip_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u64), - .value_size = sizeof(u32), - .max_entries = MAX_IPS, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, u32); + __uint(max_entries, MAX_IPS); +} ip_map SEC(".maps"); SEC("perf_event") int do_sample(struct bpf_perf_event_data *ctx) diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c index 4ed690b907ff..9283f47844fb 100644 --- a/samples/bpf/sampleip_user.c +++ b/samples/bpf/sampleip_user.c @@ -1,34 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sampleip: sample instruction pointer and frequency count in a BPF map. * * Copyright 2016 Netflix, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <stdio.h> #include <stdlib.h> -#include <stdio.h> #include <unistd.h> #include <errno.h> #include <signal.h> #include <string.h> -#include <assert.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/bpf.h> -#include <sys/ioctl.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "perf-sys.h" +#include "trace_helpers.h" #define DEFAULT_FREQ 99 #define DEFAULT_SECS 5 #define MAX_IPS 8192 -#define PAGE_OFFSET 0xffff880000000000 +static int map_fd; static int nr_cpus; +static long _text_addr; static void usage(void) { @@ -37,9 +33,10 @@ static void usage(void) printf(" duration # sampling duration (seconds), default 5\n"); } -static int sampling_start(int *pmu_fd, int freq) +static int sampling_start(int freq, struct bpf_program *prog, + struct bpf_link *links[]) { - int i; + int i, pmu_fd; struct perf_event_attr pe_sample_attr = { .type = PERF_TYPE_SOFTWARE, @@ -50,26 +47,30 @@ static int sampling_start(int *pmu_fd, int freq) }; for (i = 0; i < nr_cpus; i++) { - pmu_fd[i] = sys_perf_event_open(&pe_sample_attr, -1 /* pid */, i, + pmu_fd = sys_perf_event_open(&pe_sample_attr, -1 /* pid */, i, -1 /* group_fd */, 0 /* flags */); - if (pmu_fd[i] < 0) { + if (pmu_fd < 0) { fprintf(stderr, "ERROR: Initializing perf sampling\n"); return 1; } - assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, - prog_fd[0]) == 0); - assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0); + links[i] = bpf_program__attach_perf_event(prog, pmu_fd); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: Attach perf event\n"); + links[i] = NULL; + close(pmu_fd); + return 1; + } } return 0; } -static void sampling_end(int *pmu_fd) +static void sampling_end(struct bpf_link *links[]) { int i; for (i = 0; i < nr_cpus; i++) - close(pmu_fd[i]); + bpf_link__destroy(links[i]); } struct ipcount { @@ -107,8 +108,13 @@ static void print_ip_map(int fd) /* sort and print */ qsort(counts, max, sizeof(struct ipcount), count_cmp); for (i = 0; i < max; i++) { - if (counts[i].ip > PAGE_OFFSET) { + if (counts[i].ip > _text_addr) { sym = ksym_search(counts[i].ip); + if (!sym) { + printf("ksym not found. Is kallsyms loaded?\n"); + continue; + } + printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name, counts[i].count); } else { @@ -126,14 +132,17 @@ static void print_ip_map(int fd) static void int_exit(int sig) { printf("\n"); - print_ip_map(map_fd[0]); + print_ip_map(map_fd); exit(0); } int main(int argc, char **argv) { + int opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS, error = 1; + struct bpf_object *obj = NULL; + struct bpf_program *prog; + struct bpf_link **links; char filename[256]; - int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS; /* process arguments */ while ((opt = getopt(argc, argv, "F:h")) != -1) { @@ -160,39 +169,66 @@ int main(int argc, char **argv) return 2; } + /* used to determine whether the address is kernel space */ + _text_addr = ksym_get_addr("_text"); + if (!_text_addr) { + fprintf(stderr, "ERROR: no '_text' in /proc/kallsyms\n"); + return 3; + } + /* create perf FDs for each CPU */ - nr_cpus = sysconf(_SC_NPROCESSORS_CONF); - pmu_fd = malloc(nr_cpus * sizeof(int)); - if (pmu_fd == NULL) { - fprintf(stderr, "ERROR: malloc of pmu_fd\n"); - return 1; + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + links = calloc(nr_cpus, sizeof(struct bpf_link *)); + if (!links) { + fprintf(stderr, "ERROR: malloc of links\n"); + goto cleanup; } - /* load BPF program */ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - fprintf(stderr, "ERROR: loading BPF program (errno %d):\n", - errno); - if (strcmp(bpf_log_buf, "") == 0) - fprintf(stderr, "Try: ulimit -l unlimited\n"); - else - fprintf(stderr, "%s", bpf_log_buf); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; } + + prog = bpf_object__find_program_by_name(obj, "do_sample"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "ip_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + signal(SIGINT, int_exit); signal(SIGTERM, int_exit); /* do sampling */ printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n", freq, secs); - if (sampling_start(pmu_fd, freq) != 0) - return 1; + if (sampling_start(freq, prog, links) != 0) + goto cleanup; + sleep(secs); - sampling_end(pmu_fd); - free(pmu_fd); + error = 0; +cleanup: + sampling_end(links); /* output sample counts */ - print_ip_map(map_fd[0]); + if (!error) + print_ip_map(map_fd); - return 0; + free(links); + bpf_object__close(obj); + return error; } diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index 6fc6e193ef1b..5b66f2401b96 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -9,10 +9,10 @@ * if (value) * (*(u64*)value) += 1; * - * - attaches this program to eth0 raw socket + * - attaches this program to loopback interface "lo" raw socket * * - every second user space reads map[tcp], map[udp], map[icmp] to see - * how many packets of given protocol were seen on eth0 + * how many packets of given protocol were seen on "lo" */ #include <stdio.h> #include <unistd.h> @@ -26,8 +26,10 @@ #include <linux/if_ether.h> #include <linux/ip.h> #include <stddef.h> -#include "libbpf.h" +#include <bpf/bpf.h> +#include "bpf_insn.h" #include "sock_example.h" +#include "bpf_util.h" char bpf_log_buf[BPF_LOG_BUF_SIZE]; @@ -36,8 +38,8 @@ static int test_sock(void) int sock = -1, map_fd, prog_fd, i, key; long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), - 256, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(key), sizeof(value), + 256, NULL); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); goto cleanup; @@ -53,14 +55,18 @@ static int test_sock(void) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(), }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + size_t insns_cnt = ARRAY_SIZE(prog); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); - prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt, - "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + prog, insns_cnt, &opts); if (prog_fd < 0) { printf("failed to load prog '%s'\n", strerror(errno)); goto cleanup; @@ -98,7 +104,7 @@ int main(void) { FILE *f; - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void)f; return test_sock(); diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h index d8014065d479..a27d7579bc73 100644 --- a/samples/bpf/sock_example.h +++ b/samples/bpf/sock_example.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <stdlib.h> #include <stdio.h> #include <linux/unistd.h> @@ -8,7 +9,6 @@ #include <net/if.h> #include <linux/if_packet.h> #include <arpa/inet.h> -#include "libbpf.h" static inline int open_raw_sock(const char *name) { diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c deleted file mode 100644 index 533dd11a6baa..000000000000 --- a/samples/bpf/sock_flags_kern.c +++ /dev/null @@ -1,44 +0,0 @@ -#include <uapi/linux/bpf.h> -#include <linux/socket.h> -#include <linux/net.h> -#include <uapi/linux/in.h> -#include <uapi/linux/in6.h> -#include "bpf_helpers.h" - -SEC("cgroup/sock1") -int bpf_prog1(struct bpf_sock *sk) -{ - char fmt[] = "socket: family %d type %d protocol %d\n"; - - bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol); - - /* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets - * ie., make ping6 fail - */ - if (sk->family == PF_INET6 && - sk->type == SOCK_RAW && - sk->protocol == IPPROTO_ICMPV6) - return 0; - - return 1; -} - -SEC("cgroup/sock2") -int bpf_prog2(struct bpf_sock *sk) -{ - char fmt[] = "socket: family %d type %d protocol %d\n"; - - bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol); - - /* block PF_INET, SOCK_RAW, IPPROTO_ICMP sockets - * ie., make ping fail - */ - if (sk->family == PF_INET && - sk->type == SOCK_RAW && - sk->protocol == IPPROTO_ICMP) - return 0; - - return 1; -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c index ed18e9a4909c..431c956460ad 100644 --- a/samples/bpf/sockex1_kern.c +++ b/samples/bpf/sockex1_kern.c @@ -2,14 +2,15 @@ #include <uapi/linux/if_ether.h> #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = 256, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, long); + __uint(max_entries, 256); +} my_map SEC(".maps"); SEC("socket1") int bpf_prog1(struct __sk_buff *skb) diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index 6cd2feb3e9b3..9e8d39e245c1 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -1,31 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <assert.h> #include <linux/bpf.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> int main(int ac, char **argv) { + struct bpf_object *obj; + struct bpf_program *prog; + int map_fd, prog_fd; char filename[256]; + int i, sock, err; FILE *f; - int i, sock; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) return 1; - } + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_SOCKET_FILTER); + + err = bpf_object__load(obj); + if (err) + return 1; + + prog_fd = bpf_program__fd(prog); + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, - sizeof(prog_fd[0])) == 0); + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) == 0); - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void) f; for (i = 0; i < 5; i++) { @@ -33,13 +46,13 @@ int main(int ac, char **argv) int key; key = IPPROTO_TCP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0); key = IPPROTO_UDP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &udp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0); key = IPPROTO_ICMP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0); printf("TCP %lld UDP %lld ICMP %lld bytes\n", tcp_cnt, udp_cnt, icmp_cnt); diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index f58acfc92556..f93d9145ab8a 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -1,11 +1,12 @@ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" #include <uapi/linux/in.h> #include <uapi/linux/if.h> #include <uapi/linux/if_ether.h> #include <uapi/linux/ip.h> #include <uapi/linux/ipv6.h> #include <uapi/linux/if_tunnel.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF @@ -14,7 +15,7 @@ struct vlan_hdr { __be16 h_vlan_encapsulated_proto; }; -struct bpf_flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -30,7 +31,6 @@ static inline int proto_ports_offset(__u64 proto) switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: - case IPPROTO_DCCP: case IPPROTO_ESP: case IPPROTO_SCTP: case IPPROTO_UDPLITE: @@ -59,7 +59,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) } static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, - struct bpf_flow_keys *flow) + struct flow_key_record *flow) { __u64 verlen; @@ -83,7 +83,7 @@ static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto } static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, - struct bpf_flow_keys *flow) + struct flow_key_record *flow) { *ip_proto = load_byte(skb, nhoff + offsetof(struct ipv6hdr, nexthdr)); @@ -96,7 +96,8 @@ static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_pro return nhoff; } -static inline bool flow_dissector(struct __sk_buff *skb, struct bpf_flow_keys *flow) +static inline bool flow_dissector(struct __sk_buff *skb, + struct flow_key_record *flow) { __u64 nhoff = ETH_HLEN; __u64 ip_proto; @@ -188,17 +189,17 @@ struct pair { long bytes; }; -struct bpf_map_def SEC("maps") hash_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(__be32), - .value_size = sizeof(struct pair), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, struct pair); + __uint(max_entries, 1024); +} hash_map SEC(".maps"); SEC("socket2") int bpf_prog2(struct __sk_buff *skb) { - struct bpf_flow_keys flow = {}; + struct flow_key_record flow = {}; struct pair *value; u32 key; diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index 0e0207c90841..2c18471336f0 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -1,12 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <assert.h> #include <linux/bpf.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> -#include <sys/resource.h> struct pair { __u64 packets; @@ -15,33 +15,42 @@ struct pair { int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; + struct bpf_object *obj; + int map_fd, prog_fd; char filename[256]; + int i, sock, err; FILE *f; - int i, sock; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) + return 1; + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_SOCKET_FILTER); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + err = bpf_object__load(obj); + if (err) return 1; - } + + prog_fd = bpf_program__fd(prog); + map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, - sizeof(prog_fd[0])) == 0); + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) == 0); - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void) f; for (i = 0; i < 5; i++) { int key = 0, next_key; struct pair value; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &value); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(map_fd, &next_key, &value); printf("ip %s bytes %lld packets %lld\n", inet_ntoa((struct in_addr){htonl(next_key)}), value.bytes, value.packets); diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 95907f8d2b17..822c13242251 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -5,7 +5,6 @@ * License as published by the Free Software Foundation. */ #include <uapi/linux/bpf.h> -#include "bpf_helpers.h" #include <uapi/linux/in.h> #include <uapi/linux/if.h> #include <uapi/linux/if_ether.h> @@ -13,55 +12,22 @@ #include <uapi/linux/ipv6.h> #include <uapi/linux/if_tunnel.h> #include <uapi/linux/mpls.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF -#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F - -struct bpf_map_def SEC("maps") jmp_table = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 8, -}; - #define PARSE_VLAN 1 #define PARSE_MPLS 2 #define PARSE_IP 3 #define PARSE_IPV6 4 -/* protocol dispatch routine. - * It tail-calls next BPF program depending on eth proto - * Note, we could have used: - * bpf_tail_call(skb, &jmp_table, proto); - * but it would need large prog_array - */ -static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) -{ - switch (proto) { - case ETH_P_8021Q: - case ETH_P_8021AD: - bpf_tail_call(skb, &jmp_table, PARSE_VLAN); - break; - case ETH_P_MPLS_UC: - case ETH_P_MPLS_MC: - bpf_tail_call(skb, &jmp_table, PARSE_MPLS); - break; - case ETH_P_IP: - bpf_tail_call(skb, &jmp_table, PARSE_IP); - break; - case ETH_P_IPV6: - bpf_tail_call(skb, &jmp_table, PARSE_IPV6); - break; - } -} - struct vlan_hdr { __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; -struct bpf_flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -71,6 +37,8 @@ struct bpf_flow_keys { __u32 ip_proto; }; +static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto); + static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) { return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) @@ -88,15 +56,15 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) } struct globals { - struct bpf_flow_keys flow; + struct flow_key_record flow; }; -struct bpf_map_def SEC("maps") percpu_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(__u32), - .value_size = sizeof(struct globals), - .max_entries = 32, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, struct globals); + __uint(max_entries, 32); +} percpu_map SEC(".maps"); /* user poor man's per_cpu until native support is ready */ static struct globals *this_cpu_globals(void) @@ -112,16 +80,16 @@ struct pair { __u64 bytes; }; -struct bpf_map_def SEC("maps") hash_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct bpf_flow_keys), - .value_size = sizeof(struct pair), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct flow_key_record); + __type(value, struct pair); + __uint(max_entries, 1024); +} hash_map SEC(".maps"); static void update_stats(struct __sk_buff *skb, struct globals *g) { - struct bpf_flow_keys key = g->flow; + struct flow_key_record key = g->flow; struct pair *value; value = bpf_map_lookup_elem(&hash_map, &key); @@ -186,7 +154,8 @@ static __always_inline void parse_ip_proto(struct __sk_buff *skb, } } -PROG(PARSE_IP)(struct __sk_buff *skb) +SEC("socket") +int bpf_func_ip(struct __sk_buff *skb) { struct globals *g = this_cpu_globals(); __u32 nhoff, verlen, ip_proto; @@ -214,7 +183,8 @@ PROG(PARSE_IP)(struct __sk_buff *skb) return 0; } -PROG(PARSE_IPV6)(struct __sk_buff *skb) +SEC("socket") +int bpf_func_ipv6(struct __sk_buff *skb) { struct globals *g = this_cpu_globals(); __u32 nhoff, ip_proto; @@ -237,7 +207,8 @@ PROG(PARSE_IPV6)(struct __sk_buff *skb) return 0; } -PROG(PARSE_VLAN)(struct __sk_buff *skb) +SEC("socket") +int bpf_func_vlan(struct __sk_buff *skb) { __u32 nhoff, proto; @@ -253,7 +224,8 @@ PROG(PARSE_VLAN)(struct __sk_buff *skb) return 0; } -PROG(PARSE_MPLS)(struct __sk_buff *skb) +SEC("socket") +int bpf_func_mpls(struct __sk_buff *skb) { __u32 nhoff, label; @@ -276,7 +248,49 @@ PROG(PARSE_MPLS)(struct __sk_buff *skb) return 0; } -SEC("socket/0") +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(max_entries, 8); + __array(values, u32 (void *)); +} prog_array_init SEC(".maps") = { + .values = { + [PARSE_VLAN] = (void *)&bpf_func_vlan, + [PARSE_IP] = (void *)&bpf_func_ip, + [PARSE_IPV6] = (void *)&bpf_func_ipv6, + [PARSE_MPLS] = (void *)&bpf_func_mpls, + }, +}; + +/* Protocol dispatch routine. It tail-calls next BPF program depending + * on eth proto. Note, we could have used ... + * + * bpf_tail_call(skb, &prog_array_init, proto); + * + * ... but it would need large prog_array and cannot be optimised given + * the map key is not static. + */ +static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) +{ + switch (proto) { + case ETH_P_8021Q: + case ETH_P_8021AD: + bpf_tail_call(skb, &prog_array_init, PARSE_VLAN); + break; + case ETH_P_MPLS_UC: + case ETH_P_MPLS_MC: + bpf_tail_call(skb, &prog_array_init, PARSE_MPLS); + break; + case ETH_P_IP: + bpf_tail_call(skb, &prog_array_init, PARSE_IP); + break; + case ETH_P_IPV6: + bpf_tail_call(skb, &prog_array_init, PARSE_IPV6); + break; + } +} + +SEC("socket") int main_prog(struct __sk_buff *skb) { __u32 nhoff = ETH_HLEN; diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 877ecf8fc5ac..56044acbd25d 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -1,18 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <assert.h> -#include <linux/bpf.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> -#include <sys/resource.h> -#define PARSE_IP 3 -#define PARSE_IP_PROG_FD (prog_fd[0]) -#define PROG_ARRAY_FD (map_fd[0]) - -struct bpf_flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -29,47 +24,66 @@ struct pair { int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int i, sock, fd, main_prog_fd, hash_map_fd; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; FILE *f; - int i, sock, err, id, key = PARSE_IP; - struct bpf_prog_info info = {}; - uint32_t info_len = sizeof(info); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); + if (hash_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; } - /* Test fd array lookup which returns the id of the bpf_prog */ - err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len); - assert(!err); - err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id); - assert(!err); - assert(id == info.id); + /* find BPF main program */ + main_prog_fd = 0; + bpf_object__for_each_program(prog, obj) { + fd = bpf_program__fd(prog); + + if (!strcmp(bpf_program__name(prog), "main_prog")) + main_prog_fd = fd; + } + + if (main_prog_fd == 0) { + fprintf(stderr, "ERROR: can't find main_prog\n"); + goto cleanup; + } sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4], + /* attach BPF program to socket */ + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd, sizeof(__u32)) == 0); if (argc > 1) - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); else f = popen("netperf -l 4 localhost", "r"); (void) f; for (i = 0; i < 5; i++) { - struct bpf_flow_keys key = {}, next_key; + struct flow_key_record key = {}, next_key; struct pair value; sleep(1); printf("IP src.port -> dst.port bytes packets\n"); - while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[2], &next_key, &value); + while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(hash_map_fd, &next_key, &value); printf("%s.%05d -> %s.%05d %12lld %12lld\n", inet_ntoa((struct in_addr){htonl(next_key.src)}), next_key.port16[0], @@ -79,5 +93,8 @@ int main(int argc, char **argv) key = next_key; } } + +cleanup: + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/spintest.bpf.c b/samples/bpf/spintest.bpf.c new file mode 100644 index 000000000000..cba5a9d50783 --- /dev/null +++ b/samples/bpf/spintest.bpf.c @@ -0,0 +1,60 @@ +/* Copyright (c) 2016, Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include "vmlinux.h" +#include <linux/version.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, long); + __uint(max_entries, 1024); +} my_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(long)); + __uint(value_size, sizeof(long)); + __uint(max_entries, 1024); +} my_map2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); + +#define PROG(foo) \ +int foo(struct pt_regs *ctx) \ +{ \ + long v = PT_REGS_IP(ctx), *val; \ +\ + val = bpf_map_lookup_elem(&my_map, &v); \ + bpf_map_update_elem(&my_map, &v, &v, BPF_ANY); \ + bpf_map_update_elem(&my_map2, &v, &v, BPF_ANY); \ + bpf_map_delete_elem(&my_map2, &v); \ + bpf_get_stackid(ctx, &stackmap, BPF_F_REUSE_STACKID); \ + return 0; \ +} + +/* add kprobes to all possible *spin* functions */ +SEC("kprobe.multi/spin_*lock*")PROG(spin_lock) +SEC("kprobe.multi/*_spin_on_owner")PROG(spin_on_owner) +SEC("kprobe.multi/_raw_spin_*lock*")PROG(raw_spin_lock) + +/* and to inner bpf helpers */ +SEC("kprobe/htab_map_update_elem")PROG(p15) +SEC("kprobe/__htab_percpu_map_update_elem")PROG(p16) +SEC("kprobe/htab_map_alloc")PROG(p17) + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c deleted file mode 100644 index ce0167d09cdc..000000000000 --- a/samples/bpf/spintest_kern.c +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016, Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/version.h> -#include <uapi/linux/bpf.h> -#include <uapi/linux/perf_event.h> -#include "bpf_helpers.h" - -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; -struct bpf_map_def SEC("maps") my_map2 = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; - -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; - -#define PROG(foo) \ -int foo(struct pt_regs *ctx) \ -{ \ - long v = PT_REGS_IP(ctx), *val; \ -\ - val = bpf_map_lookup_elem(&my_map, &v); \ - bpf_map_update_elem(&my_map, &v, &v, BPF_ANY); \ - bpf_map_update_elem(&my_map2, &v, &v, BPF_ANY); \ - bpf_map_delete_elem(&my_map2, &v); \ - bpf_get_stackid(ctx, &stackmap, BPF_F_REUSE_STACKID); \ - return 0; \ -} - -/* add kprobes to all possible *spin* functions */ -SEC("kprobe/spin_unlock")PROG(p1) -SEC("kprobe/spin_lock")PROG(p2) -SEC("kprobe/mutex_spin_on_owner")PROG(p3) -SEC("kprobe/rwsem_spin_on_owner")PROG(p4) -SEC("kprobe/spin_unlock_irqrestore")PROG(p5) -SEC("kprobe/_raw_spin_unlock_irqrestore")PROG(p6) -SEC("kprobe/_raw_spin_unlock_bh")PROG(p7) -SEC("kprobe/_raw_spin_unlock")PROG(p8) -SEC("kprobe/_raw_spin_lock_irqsave")PROG(p9) -SEC("kprobe/_raw_spin_trylock_bh")PROG(p10) -SEC("kprobe/_raw_spin_lock_irq")PROG(p11) -SEC("kprobe/_raw_spin_trylock")PROG(p12) -SEC("kprobe/_raw_spin_lock")PROG(p13) -SEC("kprobe/_raw_spin_lock_bh")PROG(p14) -/* and to inner bpf helpers */ -SEC("kprobe/htab_map_update_elem")PROG(p15) -SEC("kprobe/__htab_percpu_map_update_elem")PROG(p16) -SEC("kprobe/htab_map_alloc")PROG(p17) - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index 80676c25fa50..55971edb1088 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -1,50 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <unistd.h> -#include <linux/bpf.h> #include <string.h> #include <assert.h> -#include <sys/resource.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include "trace_helpers.h" int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj = NULL; + struct bpf_link *links[20]; long key, next_key, value; + struct bpf_program *prog; + int map_fd, i, j = 0; char filename[256]; struct ksym *sym; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return 2; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; } for (i = 0; i < 5; i++) { key = 0; printf("kprobing funcs:"); - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &value); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(map_fd, &next_key, &value); assert(next_key == value); sym = ksym_search(value); - printf(" %s", sym->name); key = next_key; + if (!sym) { + printf("ksym not found. Is kallsyms loaded?\n"); + continue; + } + + printf(" %s", sym->name); } if (key) printf("\n"); key = 0; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) - bpf_map_delete_elem(map_fd[0], &next_key); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) + bpf_map_delete_elem(map_fd, &next_key); sleep(1); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c index ce2a30b11248..a6e600f3d477 100644 --- a/samples/bpf/syscall_nrs.c +++ b/samples/bpf/syscall_nrs.c @@ -1,6 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 #include <uapi/linux/unistd.h> #include <linux/kbuild.h> +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-prototypes" + #define SYSNR(_NR) DEFINE(SYS ## _NR, _NR) void syscall_defines(void) @@ -8,5 +12,13 @@ void syscall_defines(void) COMMENT("Linux system call numbers."); SYSNR(__NR_write); SYSNR(__NR_read); +#ifdef __NR_mmap2 + SYSNR(__NR_mmap2); +#endif +#ifdef __NR_mmap SYSNR(__NR_mmap); +#endif + } + +#pragma GCC diagnostic pop diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c new file mode 100644 index 000000000000..58fef969a60e --- /dev/null +++ b/samples/bpf/syscall_tp_kern.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2017 Facebook + */ +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +#if !defined(__aarch64__) +struct syscalls_enter_open_args { + unsigned long long unused; + long syscall_nr; + long filename_ptr; + long flags; + long mode; +}; +#endif + +struct syscalls_exit_open_args { + unsigned long long unused; + long syscall_nr; + long ret; +}; + +struct syscalls_enter_open_at_args { + unsigned long long unused; + long syscall_nr; + long long dfd; + long filename_ptr; + long flags; + long mode; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} enter_open_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} exit_open_map SEC(".maps"); + +static __always_inline void count(void *map) +{ + u32 key = 0; + u32 *value, init_val = 1; + + value = bpf_map_lookup_elem(map, &key); + if (value) + *value += 1; + else + bpf_map_update_elem(map, &key, &init_val, BPF_NOEXIST); +} + +#if !defined(__aarch64__) +SEC("tracepoint/syscalls/sys_enter_open") +int trace_enter_open(struct syscalls_enter_open_args *ctx) +{ + count(&enter_open_map); + return 0; +} +#endif + +SEC("tracepoint/syscalls/sys_enter_openat") +int trace_enter_open_at(struct syscalls_enter_open_at_args *ctx) +{ + count(&enter_open_map); + return 0; +} + +SEC("tracepoint/syscalls/sys_enter_openat2") +int trace_enter_open_at2(struct syscalls_enter_open_at_args *ctx) +{ + count(&enter_open_map); + return 0; +} + +#if !defined(__aarch64__) +SEC("tracepoint/syscalls/sys_exit_open") +int trace_enter_exit(struct syscalls_exit_open_args *ctx) +{ + count(&exit_open_map); + return 0; +} +#endif + +SEC("tracepoint/syscalls/sys_exit_openat") +int trace_enter_exit_at(struct syscalls_exit_open_args *ctx) +{ + count(&exit_open_map); + return 0; +} + +SEC("tracepoint/syscalls/sys_exit_openat2") +int trace_enter_exit_at2(struct syscalls_exit_open_args *ctx) +{ + count(&exit_open_map); + return 0; +} diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c new file mode 100644 index 000000000000..7a09ac74fac0 --- /dev/null +++ b/samples/bpf/syscall_tp_user.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2017 Facebook + */ +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <linux/perf_event.h> +#include <errno.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> + +/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*. + * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. + */ + +static void usage(const char *cmd) +{ + printf("USAGE: %s [-i nr_tests] [-h]\n", cmd); + printf(" -i nr_tests # rounds of test to run\n"); + printf(" -h # help\n"); +} + +static void verify_map(int map_id) +{ + __u32 key = 0; + __u32 val; + + if (bpf_map_lookup_elem(map_id, &key, &val) != 0) { + fprintf(stderr, "map_lookup failed: %s\n", strerror(errno)); + return; + } + if (val == 0) { + fprintf(stderr, "failed: map #%d returns value 0\n", map_id); + return; + } + + printf("verify map:%d val: %d\n", map_id, val); + + val = 0; + if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { + fprintf(stderr, "map_update failed: %s\n", strerror(errno)); + return; + } +} + +static int test(char *filename, int nr_tests) +{ + int map0_fds[nr_tests], map1_fds[nr_tests], fd, i, j = 0; + struct bpf_link **links = NULL; + struct bpf_object *objs[nr_tests]; + struct bpf_program *prog; + + for (i = 0; i < nr_tests; i++) { + objs[i] = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(objs[i])) { + fprintf(stderr, "opening BPF object file failed\n"); + objs[i] = NULL; + goto cleanup; + } + + /* One-time initialization */ + if (!links) { + int nr_progs = 0; + + bpf_object__for_each_program(prog, objs[i]) + nr_progs += 1; + + links = calloc(nr_progs * nr_tests, sizeof(struct bpf_link *)); + + if (!links) + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(objs[i])) { + fprintf(stderr, "loading BPF object file failed\n"); + goto cleanup; + } + + map0_fds[i] = bpf_object__find_map_fd_by_name(objs[i], + "enter_open_map"); + map1_fds[i] = bpf_object__find_map_fd_by_name(objs[i], + "exit_open_map"); + if (map0_fds[i] < 0 || map1_fds[i] < 0) { + fprintf(stderr, "finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, objs[i]) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + printf("prog #%d: map ids %d %d\n", i, map0_fds[i], map1_fds[i]); + } + + /* current load_bpf_file has perf_event_open default pid = -1 + * and cpu = 0, which permits attached bpf execution on + * all cpus for all pid's. bpf program execution ignores + * cpu affinity. + */ + /* trigger some "open" operations */ + fd = open(filename, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "open failed: %s\n", strerror(errno)); + return 1; + } + close(fd); + + /* verify the map */ + for (i = 0; i < nr_tests; i++) { + verify_map(map0_fds[i]); + verify_map(map1_fds[i]); + } + +cleanup: + if (links) { + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + free(links); + } + + for (i--; i >= 0; i--) + bpf_object__close(objs[i]); + return 0; +} + +int main(int argc, char **argv) +{ + int opt, nr_tests = 1; + char filename[256]; + + while ((opt = getopt(argc, argv, "i:h")) != -1) { + switch (opt) { + case 'i': + nr_tests = atoi(optarg); + break; + case 'h': + default: + usage(argv[0]); + return 0; + } + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + return test(filename, nr_tests); +} diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c new file mode 100644 index 000000000000..186ac0a79c0a --- /dev/null +++ b/samples/bpf/task_fd_query_kern.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("kprobe/blk_mq_start_request") +int bpf_prog1(struct pt_regs *ctx) +{ + return 0; +} + +SEC("kretprobe/__blk_account_io_done") +int bpf_prog2(struct pt_regs *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c new file mode 100644 index 000000000000..1e61f2180470 --- /dev/null +++ b/samples/bpf/task_fd_query_user.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <stdint.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <linux/perf_event.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_util.h" +#include "perf-sys.h" +#include "trace_helpers.h" + +static struct bpf_program *progs[2]; +static struct bpf_link *links[2]; + +#define CHECK_PERROR_RET(condition) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("FAIL: %s:\n", __func__); \ + perror(" "); \ + return -1; \ + } \ +}) + +#define CHECK_AND_RET(condition) ({ \ + int __ret = !!(condition); \ + if (__ret) \ + return -1; \ +}) + +static __u64 ptr_to_u64(void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type" +static int bpf_find_probe_type(const char *event_type) +{ + char buf[256]; + int fd, ret; + + ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + fd = open(buf, O_RDONLY); + CHECK_PERROR_RET(fd < 0); + + ret = read(fd, buf, sizeof(buf)); + close(fd); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + errno = 0; + ret = (int)strtol(buf, NULL, 10); + CHECK_PERROR_RET(errno); + return ret; +} + +#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe" +static int bpf_get_retprobe_bit(const char *event_type) +{ + char buf[256]; + int fd, ret; + + ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + + fd = open(buf, O_RDONLY); + CHECK_PERROR_RET(fd < 0); + + ret = read(fd, buf, sizeof(buf)); + close(fd); + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); + CHECK_PERROR_RET(strlen(buf) < strlen("config:")); + + errno = 0; + ret = (int)strtol(buf + strlen("config:"), NULL, 10); + CHECK_PERROR_RET(errno); + return ret; +} + +static int test_debug_fs_kprobe(int link_idx, const char *fn_name, + __u32 expected_fd_type) +{ + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + int err, event_fd; + char buf[256]; + + len = sizeof(buf); + event_fd = bpf_link__fd(links[link_idx]); + err = bpf_task_fd_query(getpid(), event_fd, 0, buf, &len, + &prog_id, &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, for event_fd idx %d, fn_name %s\n", + __func__, link_idx, fn_name); + perror(" :"); + return -1; + } + if (strcmp(buf, fn_name) != 0 || + fd_type != expected_fd_type || + probe_offset != 0x0 || probe_addr != 0x0) { + printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", + link_idx); + printf("buf: %s, fd_type: %u, probe_offset: 0x%llx," + " probe_addr: 0x%llx\n", + buf, fd_type, probe_offset, probe_addr); + return -1; + } + return 0; +} + +static int test_nondebug_fs_kuprobe_common(const char *event_type, + const char *name, __u64 offset, __u64 addr, bool is_return, + char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, + __u64 *probe_offset, __u64 *probe_addr) +{ + int is_return_bit = bpf_get_retprobe_bit(event_type); + int type = bpf_find_probe_type(event_type); + struct perf_event_attr attr = {}; + struct bpf_link *link; + int fd, err = -1; + + if (type < 0 || is_return_bit < 0) { + printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n", + __func__, type, is_return_bit); + return err; + } + + attr.sample_period = 1; + attr.wakeup_events = 1; + if (is_return) + attr.config |= 1 << is_return_bit; + + if (name) { + attr.config1 = ptr_to_u64((void *)name); + attr.config2 = offset; + } else { + attr.config1 = 0; + attr.config2 = addr; + } + attr.size = sizeof(attr); + attr.type = type; + + fd = sys_perf_event_open(&attr, -1, 0, -1, 0); + link = bpf_program__attach_perf_event(progs[0], fd); + if (libbpf_get_error(link)) { + printf("ERROR: bpf_program__attach_perf_event failed\n"); + link = NULL; + close(fd); + goto cleanup; + } + + CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len, + prog_id, fd_type, probe_offset, probe_addr) < 0); + err = 0; + +cleanup: + bpf_link__destroy(link); + return err; +} + +static int test_nondebug_fs_probe(const char *event_type, const char *name, + __u64 offset, __u64 addr, bool is_return, + __u32 expected_fd_type, + __u32 expected_ret_fd_type, + char *buf, __u32 buf_len) +{ + __u64 probe_offset, probe_addr; + __u32 prog_id, fd_type; + int err; + + err = test_nondebug_fs_kuprobe_common(event_type, name, + offset, addr, is_return, + buf, &buf_len, &prog_id, + &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, " + "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n", + __func__, name ? name : "", offset, addr, is_return); + perror(" :"); + return -1; + } + if ((is_return && fd_type != expected_ret_fd_type) || + (!is_return && fd_type != expected_fd_type)) { + printf("FAIL: %s, incorrect fd_type %u\n", + __func__, fd_type); + return -1; + } + if (name) { + if (strcmp(name, buf) != 0) { + printf("FAIL: %s, incorrect buf %s\n", __func__, buf); + return -1; + } + if (probe_offset != offset) { + printf("FAIL: %s, incorrect probe_offset 0x%llx\n", + __func__, probe_offset); + return -1; + } + } else { + if (buf_len != 0) { + printf("FAIL: %s, incorrect buf %p\n", + __func__, buf); + return -1; + } + + if (probe_addr != addr) { + printf("FAIL: %s, incorrect probe_addr 0x%llx\n", + __func__, probe_addr); + return -1; + } + } + return 0; +} + +static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) +{ + char buf[256], event_alias[sizeof("test_1234567890")]; + const char *event_type = "uprobe"; + struct perf_event_attr attr = {}; + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + int err = -1, res, kfd, efd; + struct bpf_link *link; + ssize_t bytes; + + snprintf(buf, sizeof(buf), "/sys/kernel/tracing/%s_events", + event_type); + kfd = open(buf, O_WRONLY | O_TRUNC, 0); + CHECK_PERROR_RET(kfd < 0); + + res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid()); + CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias)); + + res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", + is_return ? 'r' : 'p', event_type, event_alias, + binary_path, offset); + CHECK_PERROR_RET(res < 0 || res >= sizeof(buf)); + CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0); + + close(kfd); + kfd = -1; + + snprintf(buf, sizeof(buf), "/sys/kernel/tracing/events/%ss/%s/id", + event_type, event_alias); + efd = open(buf, O_RDONLY, 0); + CHECK_PERROR_RET(efd < 0); + + bytes = read(efd, buf, sizeof(buf)); + CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf)); + close(efd); + buf[bytes] = '\0'; + + attr.config = strtol(buf, NULL, 0); + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_period = 1; + attr.wakeup_events = 1; + + kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + link = bpf_program__attach_perf_event(progs[0], kfd); + if (libbpf_get_error(link)) { + printf("ERROR: bpf_program__attach_perf_event failed\n"); + link = NULL; + close(kfd); + goto cleanup; + } + + len = sizeof(buf); + err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len, + &prog_id, &fd_type, &probe_offset, + &probe_addr); + if (err < 0) { + printf("FAIL: %s, binary_path %s\n", __func__, binary_path); + perror(" :"); + return -1; + } + if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) || + (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) { + printf("FAIL: %s, incorrect fd_type %u\n", __func__, + fd_type); + return -1; + } + if (strcmp(binary_path, buf) != 0) { + printf("FAIL: %s, incorrect buf %s\n", __func__, buf); + return -1; + } + if (probe_offset != offset) { + printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__, + probe_offset); + return -1; + } + err = 0; + +cleanup: + bpf_link__destroy(link); + return err; +} + +int main(int argc, char **argv) +{ + extern char __executable_start; + char filename[256], buf[256]; + __u64 uprobe_file_offset; + struct bpf_program *prog; + struct bpf_object *obj; + int i = 0, err = -1; + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + return err; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return err; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + progs[i] = prog; + links[i] = bpf_program__attach(progs[i]); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; + } + + /* test two functions in the corresponding *_kern.c file */ + CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request", + BPF_FD_TYPE_KPROBE)); + CHECK_AND_RET(test_debug_fs_kprobe(1, "__blk_account_io_done", + BPF_FD_TYPE_KRETPROBE)); + + /* test nondebug fs kprobe */ + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, + false, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); +#ifdef __x86_64__ + /* set a kprobe on "bpf_check + 0x5", which is x64 specific */ + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0, + false, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); +#endif + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, + true, BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), false, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), false, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + NULL, 0)); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), true, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, + ksym_get_addr("bpf_check"), true, + BPF_FD_TYPE_KPROBE, + BPF_FD_TYPE_KRETPROBE, + 0, 0)); + + /* test nondebug fs uprobe */ + /* the calculation of uprobe file offset is based on gcc 7.3.1 on x64 + * and the default linker script, which defines __executable_start as + * the start of the .text section. The calculation could be different + * on different systems with different compilers. The right way is + * to parse the ELF file. We took a shortcut here. + */ + uprobe_file_offset = (unsigned long)main - (unsigned long)&__executable_start; + CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], + uprobe_file_offset, 0x0, false, + BPF_FD_TYPE_UPROBE, + BPF_FD_TYPE_URETPROBE, + buf, sizeof(buf))); + CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], + uprobe_file_offset, 0x0, true, + BPF_FD_TYPE_UPROBE, + BPF_FD_TYPE_URETPROBE, + buf, sizeof(buf))); + + /* test debug fs uprobe */ + CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, + false)); + CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, + true)); + err = 0; + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); + return err; +} diff --git a/samples/bpf/tc_l2_redirect.sh b/samples/bpf/tc_l2_redirect.sh index 80a05591a140..a28a8fc99dbe 100755 --- a/samples/bpf/tc_l2_redirect.sh +++ b/samples/bpf/tc_l2_redirect.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 [[ -z $TC ]] && TC='tc' [[ -z $IP ]] && IP='ip' @@ -7,6 +8,7 @@ REDIRECT_USER='./tc_l2_redirect' REDIRECT_BPF='./tc_l2_redirect_kern.o' RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter) +IPV6_DISABLED=$(< /proc/sys/net/ipv6/conf/all/disable_ipv6) IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding) function config_common { @@ -63,6 +65,7 @@ function config_common { sysctl -q -w net.ipv4.conf.all.rp_filter=0 sysctl -q -w net.ipv6.conf.all.forwarding=1 + sysctl -q -w net.ipv6.conf.all.disable_ipv6=0 } function cleanup { @@ -76,6 +79,7 @@ function cleanup { $IP link del ip6t >& /dev/null sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING + sysctl -q -w net.ipv6.conf.all.disable_ipv6=$IPV6_DISABLED rm -f /sys/fs/bpf/tc/globals/tun_iface [[ -z $DEBUG ]] || set -x set -e diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c index 7ef2a12b25b2..b19fa9b88fe0 100644 --- a/samples/bpf/tc_l2_redirect_kern.c +++ b/samples/bpf/tc_l2_redirect_kern.c @@ -15,7 +15,7 @@ #include <uapi/linux/filter.h> #include <uapi/linux/pkt_cls.h> #include <net/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #define _htonl __builtin_bswap32 @@ -58,14 +58,11 @@ static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr) SEC("l2_to_iptun_ingress_forward") int _l2_to_iptun_ingress_forward(struct __sk_buff *skb) { - struct bpf_tunnel_key tkey = {}; void *data = (void *)(long)skb->data; struct eth_hdr *eth = data; void *data_end = (void *)(long)skb->data_end; int key = 0, *ifindex; - int ret; - if (data + sizeof(*eth) > data_end) return TC_ACT_OK; @@ -115,8 +112,6 @@ int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb) void *data_end = (void *)(long)skb->data_end; int key = 0, *ifindex; - int ret; - if (data + sizeof(*eth) > data_end) return TC_ACT_OK; @@ -205,7 +200,6 @@ int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb) SEC("drop_non_tun_vip") int _drop_non_tun_vip(struct __sk_buff *skb) { - struct bpf_tunnel_key tkey = {}; void *data = (void *)(long)skb->data; struct eth_hdr *eth = data; void *data_end = (void *)(long)skb->data_end; diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c index 28995a776560..d11a6e1e9912 100644 --- a/samples/bpf/tc_l2_redirect_user.c +++ b/samples/bpf/tc_l2_redirect_user.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <linux/unistd.h> #include <linux/bpf.h> @@ -13,7 +10,7 @@ #include <string.h> #include <errno.h> -#include "libbpf.h" +#include <bpf/bpf.h> static void usage(void) { diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c index 274c884c87fe..e9356130f84e 100644 --- a/samples/bpf/tcbpf1_kern.c +++ b/samples/bpf/tcbpf1_kern.c @@ -7,7 +7,8 @@ #include <uapi/linux/tcp.h> #include <uapi/linux/filter.h> #include <uapi/linux/pkt_cls.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" /* compiler workaround */ #define _htonl __builtin_bswap32 diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c deleted file mode 100644 index 270edcc149a1..000000000000 --- a/samples/bpf/tcbpf2_kern.c +++ /dev/null @@ -1,382 +0,0 @@ -/* Copyright (c) 2016 VMware - * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#define KBUILD_MODNAME "foo" -#include <uapi/linux/bpf.h> -#include <uapi/linux/if_ether.h> -#include <uapi/linux/if_packet.h> -#include <uapi/linux/ip.h> -#include <uapi/linux/ipv6.h> -#include <uapi/linux/in.h> -#include <uapi/linux/tcp.h> -#include <uapi/linux/filter.h> -#include <uapi/linux/pkt_cls.h> -#include <net/ipv6.h> -#include "bpf_helpers.h" - -#define _htonl __builtin_bswap32 -#define ERROR(ret) do {\ - char fmt[] = "ERROR line:%d ret:%d\n";\ - bpf_trace_printk(fmt, sizeof(fmt), __LINE__, ret); \ - } while(0) - -struct geneve_opt { - __be16 opt_class; - u8 type; - u8 length:5; - u8 r3:1; - u8 r2:1; - u8 r1:1; - u8 opt_data[8]; /* hard-coded to 8 byte */ -}; - -struct vxlan_metadata { - u32 gbp; -}; - -SEC("gre_set_tunnel") -int _gre_set_tunnel(struct __sk_buff *skb) -{ - int ret; - struct bpf_tunnel_key key; - - __builtin_memset(&key, 0x0, sizeof(key)); - key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - key.tunnel_id = 2; - key.tunnel_tos = 0; - key.tunnel_ttl = 64; - - ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - return TC_ACT_OK; -} - -SEC("gre_get_tunnel") -int _gre_get_tunnel(struct __sk_buff *skb) -{ - int ret; - struct bpf_tunnel_key key; - char fmt[] = "key %d remote ip 0x%x\n"; - - ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4); - return TC_ACT_OK; -} - -SEC("vxlan_set_tunnel") -int _vxlan_set_tunnel(struct __sk_buff *skb) -{ - int ret; - struct bpf_tunnel_key key; - struct vxlan_metadata md; - - __builtin_memset(&key, 0x0, sizeof(key)); - key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - key.tunnel_id = 2; - key.tunnel_tos = 0; - key.tunnel_ttl = 64; - - ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - md.gbp = 0x800FF; /* Set VXLAN Group Policy extension */ - ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - return TC_ACT_OK; -} - -SEC("vxlan_get_tunnel") -int _vxlan_get_tunnel(struct __sk_buff *skb) -{ - int ret; - struct bpf_tunnel_key key; - struct vxlan_metadata md; - char fmt[] = "key %d remote ip 0x%x vxlan gbp 0x%x\n"; - - ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - bpf_trace_printk(fmt, sizeof(fmt), - key.tunnel_id, key.remote_ipv4, md.gbp); - - return TC_ACT_OK; -} - -SEC("geneve_set_tunnel") -int _geneve_set_tunnel(struct __sk_buff *skb) -{ - int ret, ret2; - struct bpf_tunnel_key key; - struct geneve_opt gopt; - - __builtin_memset(&key, 0x0, sizeof(key)); - key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - key.tunnel_id = 2; - key.tunnel_tos = 0; - key.tunnel_ttl = 64; - - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = 0x102; /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - *(int *) &gopt.opt_data = 0xdeadbeef; - - ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - return TC_ACT_OK; -} - -SEC("geneve_get_tunnel") -int _geneve_get_tunnel(struct __sk_buff *skb) -{ - int ret; - struct bpf_tunnel_key key; - struct geneve_opt gopt; - char fmt[] = "key %d remote ip 0x%x geneve class 0x%x\n"; - - ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - bpf_trace_printk(fmt, sizeof(fmt), - key.tunnel_id, key.remote_ipv4, gopt.opt_class); - return TC_ACT_OK; -} - -SEC("ipip_set_tunnel") -int _ipip_set_tunnel(struct __sk_buff *skb) -{ - struct bpf_tunnel_key key = {}; - void *data = (void *)(long)skb->data; - struct iphdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); - void *data_end = (void *)(long)skb->data_end; - int ret; - - /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { - ERROR(1); - return TC_ACT_SHOT; - } - - key.tunnel_ttl = 64; - if (iph->protocol == IPPROTO_ICMP) { - key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - } else { - if (iph->protocol != IPPROTO_TCP || iph->ihl != 5) - return TC_ACT_SHOT; - - if (tcp->dest == htons(5200)) - key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - else if (tcp->dest == htons(5201)) - key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */ - else - return TC_ACT_SHOT; - } - - ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - return TC_ACT_OK; -} - -SEC("ipip_get_tunnel") -int _ipip_get_tunnel(struct __sk_buff *skb) -{ - int ret; - struct bpf_tunnel_key key; - char fmt[] = "remote ip 0x%x\n"; - - ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - bpf_trace_printk(fmt, sizeof(fmt), key.remote_ipv4); - return TC_ACT_OK; -} - -SEC("ipip6_set_tunnel") -int _ipip6_set_tunnel(struct __sk_buff *skb) -{ - struct bpf_tunnel_key key = {}; - void *data = (void *)(long)skb->data; - struct iphdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); - void *data_end = (void *)(long)skb->data_end; - int ret; - - /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { - ERROR(1); - return TC_ACT_SHOT; - } - - key.remote_ipv6[0] = _htonl(0x2401db00); - key.tunnel_ttl = 64; - - if (iph->protocol == IPPROTO_ICMP) { - key.remote_ipv6[3] = _htonl(1); - } else { - if (iph->protocol != IPPROTO_TCP || iph->ihl != 5) { - ERROR(iph->protocol); - return TC_ACT_SHOT; - } - - if (tcp->dest == htons(5200)) { - key.remote_ipv6[3] = _htonl(1); - } else if (tcp->dest == htons(5201)) { - key.remote_ipv6[3] = _htonl(2); - } else { - ERROR(tcp->dest); - return TC_ACT_SHOT; - } - } - - ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - return TC_ACT_OK; -} - -SEC("ipip6_get_tunnel") -int _ipip6_get_tunnel(struct __sk_buff *skb) -{ - int ret; - struct bpf_tunnel_key key; - char fmt[] = "remote ip6 %x::%x\n"; - - ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - bpf_trace_printk(fmt, sizeof(fmt), _htonl(key.remote_ipv6[0]), - _htonl(key.remote_ipv6[3])); - return TC_ACT_OK; -} - -SEC("ip6ip6_set_tunnel") -int _ip6ip6_set_tunnel(struct __sk_buff *skb) -{ - struct bpf_tunnel_key key = {}; - void *data = (void *)(long)skb->data; - struct ipv6hdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); - void *data_end = (void *)(long)skb->data_end; - int ret; - - /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { - ERROR(1); - return TC_ACT_SHOT; - } - - key.remote_ipv6[0] = _htonl(0x2401db00); - key.tunnel_ttl = 64; - - if (iph->nexthdr == NEXTHDR_ICMP) { - key.remote_ipv6[3] = _htonl(1); - } else { - if (iph->nexthdr != NEXTHDR_TCP) { - ERROR(iph->nexthdr); - return TC_ACT_SHOT; - } - - if (tcp->dest == htons(5200)) { - key.remote_ipv6[3] = _htonl(1); - } else if (tcp->dest == htons(5201)) { - key.remote_ipv6[3] = _htonl(2); - } else { - ERROR(tcp->dest); - return TC_ACT_SHOT; - } - } - - ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - return TC_ACT_OK; -} - -SEC("ip6ip6_get_tunnel") -int _ip6ip6_get_tunnel(struct __sk_buff *skb) -{ - int ret; - struct bpf_tunnel_key key; - char fmt[] = "remote ip6 %x::%x\n"; - - ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } - - bpf_trace_printk(fmt, sizeof(fmt), _htonl(key.remote_ipv6[0]), - _htonl(key.remote_ipv6[3])); - return TC_ACT_OK; -} - - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c new file mode 100644 index 000000000000..822b0742b815 --- /dev/null +++ b/samples/bpf/tcp_basertt_kern.c @@ -0,0 +1,71 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set base_rtt to 80us when host is running TCP-NV and + * both hosts are in the same datacenter (as determined by IPv6 prefix). + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char cong[20]; + char nv[] = "nv"; + int rv = 0, n; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check if both hosts are in the same datacenter. For this + * example they are if the 1st 5.5 bytes in the IPv6 address + * are the same. + */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_BASE_RTT: + n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + if (!n && !__builtin_memcmp(cong, nv, sizeof(nv))) { + /* Set base_rtt to 80us */ + rv = 80; + } else if (n) { + rv = n; + } else { + rv = -1; + } + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme new file mode 100644 index 000000000000..78e247f62108 --- /dev/null +++ b/samples/bpf/tcp_bpf.readme @@ -0,0 +1,28 @@ +This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops) +programs. These programs attach to a cgroupv2. The following commands create +a cgroupv2 and attach a bash shell to the group. + + mkdir -p /tmp/cgroupv2 + mount -t cgroup2 none /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2/foo + bash + echo $$ >> /tmp/cgroupv2/foo/cgroup.procs + +Anything that runs under this shell belongs to the foo cgroupv2. To load +(attach) one of the tcp_*_kern.o programs: + + bpftool prog load tcp_basertt_kern.o /sys/fs/bpf/tcp_prog + bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog + bpftool prog tracelog + +"bpftool prog tracelog" will continue to run printing the BPF log buffer. +The tcp_*_kern.o programs use special print functions to print logging +information (if enabled by the ifdef). + +If using netperf/netserver to create traffic, you need to run them under the +cgroupv2 to which the BPF programs are attached (i.e. under bash shell +attached to the cgroupv2). + +To remove (unattach) a socket_ops BPF program from a cgroupv2: + + bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c index ee83bbabd17c..6a80d08952ad 100644 --- a/samples/bpf/tcp_bufs_kern.c +++ b/samples/bpf/tcp_bufs_kern.c @@ -9,7 +9,7 @@ * doing appropriate checks that indicate the hosts are far enough * away (i.e. large RTT). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> @@ -17,18 +17,11 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_bufs(struct bpf_sock_ops *skops) { @@ -41,8 +34,10 @@ int bpf_bufs(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; @@ -61,8 +56,8 @@ int bpf_bufs(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: /* Nothing to do */ @@ -71,8 +66,8 @@ int bpf_bufs(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of passive connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; default: rv = -1; diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c index d68eadd9ca2d..e88bd9ab0695 100644 --- a/samples/bpf/tcp_clamp_kern.c +++ b/samples/bpf/tcp_clamp_kern.c @@ -9,7 +9,7 @@ * the same datacenter. For his example, we assume they are within the same * datacenter when the first 5.5 bytes of their IPv6 addresses are the same. * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> @@ -17,18 +17,11 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_clamp(struct bpf_sock_ops *skops) { @@ -41,8 +34,10 @@ int bpf_clamp(struct bpf_sock_ops *skops) /* For testing purposes, only execute rest of BPF program * if neither port numberis 55601 */ - if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) - return -1; + if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) { + skops->reply = -1; + return 0; + } op = (int) skops->op; @@ -66,9 +61,9 @@ int bpf_clamp(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_RCVBUF, &bufsize, - sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: rv = bpf_setsockopt(skops, SOL_TCP, @@ -80,12 +75,12 @@ int bpf_clamp(struct bpf_sock_ops *skops) rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SNDCWND_CLAMP, &clamp, sizeof(clamp)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_SNDBUF, &bufsize, - sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_RCVBUF, &bufsize, - sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); break; default: rv = -1; diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c index dac15bce1fa9..339415eac477 100644 --- a/samples/bpf/tcp_cong_kern.c +++ b/samples/bpf/tcp_cong_kern.c @@ -5,9 +5,9 @@ * License as published by the Free Software Foundation. * * BPF program to set congestion control to dctcp when both hosts are - * in the same datacenter (as deteremined by IPv6 prefix). + * in the same datacenter (as determined by IPv6 prefix). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> @@ -16,18 +16,11 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_cong(struct bpf_sock_ops *skops) { @@ -39,8 +32,10 @@ int bpf_cong(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c new file mode 100644 index 000000000000..e80d3afd24bd --- /dev/null +++ b/samples/bpf/tcp_dumpstats_kern.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Refer to samples/bpf/tcp_bpf.readme for the instructions on + * how to run this sample program. + */ +#include <linux/bpf.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define INTERVAL 1000000000ULL + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +struct { + __u32 type; + __u32 map_flags; + int *key; + __u64 *value; +} bpf_next_dump SEC(".maps") = { + .type = BPF_MAP_TYPE_SK_STORAGE, + .map_flags = BPF_F_NO_PREALLOC, +}; + +SEC("sockops") +int _sockops(struct bpf_sock_ops *ctx) +{ + struct bpf_tcp_sock *tcp_sk; + struct bpf_sock *sk; + __u64 *next_dump; + __u64 now; + + switch (ctx->op) { + case BPF_SOCK_OPS_TCP_CONNECT_CB: + bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG); + return 1; + case BPF_SOCK_OPS_RTT_CB: + break; + default: + return 1; + } + + sk = ctx->sk; + if (!sk) + return 1; + + next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!next_dump) + return 1; + + now = bpf_ktime_get_ns(); + if (now < *next_dump) + return 1; + + tcp_sk = bpf_tcp_sock(sk); + if (!tcp_sk) + return 1; + + *next_dump = now + INTERVAL; + + bpf_printk("dsack_dups=%u delivered=%u\n", + tcp_sk->dsack_dups, tcp_sk->delivered); + bpf_printk("delivered_ce=%u icsk_retransmits=%u\n", + tcp_sk->delivered_ce, tcp_sk->icsk_retransmits); + + return 1; +} diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c index 23c5122ef819..d1444557358e 100644 --- a/samples/bpf/tcp_iw_kern.c +++ b/samples/bpf/tcp_iw_kern.c @@ -9,7 +9,7 @@ * would usually be done after doing appropriate checks that indicate * the hosts are far enough away (i.e. large RTT). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> @@ -17,18 +17,11 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_iw(struct bpf_sock_ops *skops) { @@ -42,8 +35,10 @@ int bpf_iw(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; @@ -62,8 +57,8 @@ int bpf_iw(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw, @@ -73,8 +68,8 @@ int bpf_iw(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of passive connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; default: rv = -1; diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c index 3f2a228f81ce..223d9c23b10c 100644 --- a/samples/bpf/tcp_rwnd_kern.c +++ b/samples/bpf/tcp_rwnd_kern.c @@ -8,7 +8,7 @@ * and the first 5.5 bytes of the IPv6 addresses are not the same (in this * example that means both hosts are not the same datacenter). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> @@ -16,18 +16,11 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_rwnd(struct bpf_sock_ops *skops) { @@ -38,8 +31,10 @@ int bpf_rwnd(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != - 55601 && skops->local_port != 55601) - return -1; + 55601 && skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c index 3c3fc83d81cb..d58004eef124 100644 --- a/samples/bpf/tcp_synrto_kern.c +++ b/samples/bpf/tcp_synrto_kern.c @@ -8,7 +8,7 @@ * and the first 5.5 bytes of the IPv6 addresses are the same (in this example * that means both hosts are in the same datacenter). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> @@ -16,18 +16,11 @@ #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include <linux/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_synrto(struct bpf_sock_ops *skops) { @@ -38,8 +31,10 @@ int bpf_synrto(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c new file mode 100644 index 000000000000..953fedc79ce1 --- /dev/null +++ b/samples/bpf/tcp_tos_reflect_kern.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + * + * BPF program to automatically reflect TOS option from received syn packet + * + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <linux/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define DEBUG 1 + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char header[sizeof(struct ipv6hdr)]; + struct ipv6hdr *hdr6; + struct iphdr *hdr; + int hdr_size = 0; + int save_syn = 1; + int tos = 0; + int rv = 0; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + switch (op) { + case BPF_SOCK_OPS_TCP_LISTEN_CB: + rv = bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, + &save_syn, sizeof(save_syn)); + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + if (skops->family == AF_INET) + hdr_size = sizeof(struct iphdr); + else + hdr_size = sizeof(struct ipv6hdr); + rv = bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN, + header, hdr_size); + if (!rv) { + if (skops->family == AF_INET) { + hdr = (struct iphdr *) header; + tos = hdr->tos; + if (tos != 0) + bpf_setsockopt(skops, SOL_IP, IP_TOS, + &tos, sizeof(tos)); + } else { + hdr6 = (struct ipv6hdr *) header; + tos = ((hdr6->priority) << 4 | + (hdr6->flow_lbl[0]) >> 4); + if (tos) + bpf_setsockopt(skops, SOL_IPV6, + IPV6_TCLASS, + &tos, sizeof(tos)); + } + rv = 0; + } + break; + default: + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c deleted file mode 100644 index 8a1b8b5d8def..000000000000 --- a/samples/bpf/test_cgrp2_array_pin.c +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/unistd.h> -#include <linux/bpf.h> - -#include <stdio.h> -#include <stdint.h> -#include <unistd.h> -#include <string.h> -#include <errno.h> -#include <fcntl.h> - -#include "libbpf.h" - -static void usage(void) -{ - printf("Usage: test_cgrp2_array_pin [...]\n"); - printf(" -F <file> File to pin an BPF cgroup array\n"); - printf(" -U <file> Update an already pinned BPF cgroup array\n"); - printf(" -v <value> Full path of the cgroup2\n"); - printf(" -h Display this help\n"); -} - -int main(int argc, char **argv) -{ - const char *pinned_file = NULL, *cg2 = NULL; - int create_array = 1; - int array_key = 0; - int array_fd = -1; - int cg2_fd = -1; - int ret = -1; - int opt; - - while ((opt = getopt(argc, argv, "F:U:v:")) != -1) { - switch (opt) { - /* General args */ - case 'F': - pinned_file = optarg; - break; - case 'U': - pinned_file = optarg; - create_array = 0; - break; - case 'v': - cg2 = optarg; - break; - default: - usage(); - goto out; - } - } - - if (!cg2 || !pinned_file) { - usage(); - goto out; - } - - cg2_fd = open(cg2, O_RDONLY); - if (cg2_fd < 0) { - fprintf(stderr, "open(%s,...): %s(%d)\n", - cg2, strerror(errno), errno); - goto out; - } - - if (create_array) { - array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY, - sizeof(uint32_t), sizeof(uint32_t), - 1, 0); - if (array_fd < 0) { - fprintf(stderr, - "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n", - strerror(errno), errno); - goto out; - } - } else { - array_fd = bpf_obj_get(pinned_file); - if (array_fd < 0) { - fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", - pinned_file, strerror(errno), errno); - goto out; - } - } - - ret = bpf_map_update_elem(array_fd, &array_key, &cg2_fd, 0); - if (ret) { - perror("bpf_map_update_elem"); - goto out; - } - - if (create_array) { - ret = bpf_obj_pin(array_fd, pinned_file); - if (ret) { - fprintf(stderr, "bpf_obj_pin(..., %s): %s(%d)\n", - pinned_file, strerror(errno), errno); - goto out; - } - } - -out: - if (array_fd != -1) - close(array_fd); - if (cg2_fd != -1) - close(cg2_fd); - return ret; -} diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c deleted file mode 100644 index 4bfcaf93fcf3..000000000000 --- a/samples/bpf/test_cgrp2_attach.c +++ /dev/null @@ -1,171 +0,0 @@ -/* eBPF example program: - * - * - Creates arraymap in kernel with 4 bytes keys and 8 byte values - * - * - Loads eBPF program - * - * The eBPF program accesses the map passed in to store two pieces of - * information. The number of invocations of the program, which maps - * to the number of packets received, is stored to key 0. Key 1 is - * incremented on each iteration by the number of bytes stored in - * the skb. - * - * - Attaches the new program to a cgroup using BPF_PROG_ATTACH - * - * - Every second, reads map[0] and map[1] to see how many bytes and - * packets were seen on any socket of tasks in the given cgroup. - */ - -#define _GNU_SOURCE - -#include <stdio.h> -#include <stdlib.h> -#include <stddef.h> -#include <string.h> -#include <unistd.h> -#include <assert.h> -#include <errno.h> -#include <fcntl.h> - -#include <linux/bpf.h> - -#include "libbpf.h" - -enum { - MAP_KEY_PACKETS, - MAP_KEY_BYTES, -}; - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -static int prog_load(int map_fd, int verdict) -{ - struct bpf_insn prog[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save r6 so it's not clobbered by BPF_CALL */ - - /* Count packets */ - BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ - BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), - BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ - - /* Count bytes */ - BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ - BPF_LD_MAP_FD(BPF_REG_1, map_fd), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), - BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ - - BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ - BPF_EXIT_INSN(), - }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - - return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); -} - -static int usage(const char *argv0) -{ - printf("Usage: %s [-d] [-D] <cg-path> <egress|ingress>\n", argv0); - printf(" -d Drop Traffic\n"); - printf(" -D Detach filter, and exit\n"); - return EXIT_FAILURE; -} - -static int attach_filter(int cg_fd, int type, int verdict) -{ - int prog_fd, map_fd, ret, key; - long long pkt_cnt, byte_cnt; - - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, - sizeof(key), sizeof(byte_cnt), - 256, 0); - if (map_fd < 0) { - printf("Failed to create map: '%s'\n", strerror(errno)); - return EXIT_FAILURE; - } - - prog_fd = prog_load(map_fd, verdict); - printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); - - if (prog_fd < 0) { - printf("Failed to load prog: '%s'\n", strerror(errno)); - return EXIT_FAILURE; - } - - ret = bpf_prog_attach(prog_fd, cg_fd, type, 0); - if (ret < 0) { - printf("Failed to attach prog to cgroup: '%s'\n", - strerror(errno)); - return EXIT_FAILURE; - } - while (1) { - key = MAP_KEY_PACKETS; - assert(bpf_map_lookup_elem(map_fd, &key, &pkt_cnt) == 0); - - key = MAP_KEY_BYTES; - assert(bpf_map_lookup_elem(map_fd, &key, &byte_cnt) == 0); - - printf("cgroup received %lld packets, %lld bytes\n", - pkt_cnt, byte_cnt); - sleep(1); - } - - return EXIT_SUCCESS; -} - -int main(int argc, char **argv) -{ - int detach_only = 0, verdict = 1; - enum bpf_attach_type type; - int opt, cg_fd, ret; - - while ((opt = getopt(argc, argv, "Dd")) != -1) { - switch (opt) { - case 'd': - verdict = 0; - break; - case 'D': - detach_only = 1; - break; - default: - return usage(argv[0]); - } - } - - if (argc - optind < 2) - return usage(argv[0]); - - if (strcmp(argv[optind + 1], "ingress") == 0) - type = BPF_CGROUP_INET_INGRESS; - else if (strcmp(argv[optind + 1], "egress") == 0) - type = BPF_CGROUP_INET_EGRESS; - else - return usage(argv[0]); - - cg_fd = open(argv[optind], O_DIRECTORY | O_RDONLY); - if (cg_fd < 0) { - printf("Failed to open cgroup path: '%s'\n", strerror(errno)); - return EXIT_FAILURE; - } - - if (detach_only) { - ret = bpf_prog_detach(cg_fd, type); - printf("bpf_prog_detach() returned '%s' (%d)\n", - strerror(errno), errno); - } else - ret = attach_filter(cg_fd, type, verdict); - - return ret; -} diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c deleted file mode 100644 index 3049b1f26267..000000000000 --- a/samples/bpf/test_cgrp2_attach2.c +++ /dev/null @@ -1,196 +0,0 @@ -/* eBPF example program: - * - * - Creates arraymap in kernel with 4 bytes keys and 8 byte values - * - * - Loads eBPF program - * - * The eBPF program accesses the map passed in to store two pieces of - * information. The number of invocations of the program, which maps - * to the number of packets received, is stored to key 0. Key 1 is - * incremented on each iteration by the number of bytes stored in - * the skb. - * - * - Attaches the new program to a cgroup using BPF_PROG_ATTACH - * - * - Every second, reads map[0] and map[1] to see how many bytes and - * packets were seen on any socket of tasks in the given cgroup. - */ - -#define _GNU_SOURCE - -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> -#include <unistd.h> - -#include <linux/bpf.h> - -#include "libbpf.h" -#include "cgroup_helpers.h" - -#define FOO "/foo" -#define BAR "/foo/bar/" -#define PING_CMD "ping -c1 -w1 127.0.0.1" - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -static int prog_load(int verdict) -{ - int ret; - struct bpf_insn prog[] = { - BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ - BPF_EXIT_INSN(), - }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - - ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - - if (ret < 0) { - log_err("Loading program"); - printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); - return 0; - } - return ret; -} - - -int main(int argc, char **argv) -{ - int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0; - - allow_prog = prog_load(1); - if (!allow_prog) - goto err; - - drop_prog = prog_load(0); - if (!drop_prog) - goto err; - - if (setup_cgroup_environment()) - goto err; - - /* Create cgroup /foo, get fd, and join it */ - foo = create_and_get_cgroup(FOO); - if (!foo) - goto err; - - if (join_cgroup(FOO)) - goto err; - - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { - log_err("Attaching prog to /foo"); - goto err; - } - - printf("Attached DROP prog. This ping in cgroup /foo should fail...\n"); - assert(system(PING_CMD) != 0); - - /* Create cgroup /foo/bar, get fd, and join it */ - bar = create_and_get_cgroup(BAR); - if (!bar) - goto err; - - if (join_cgroup(BAR)) - goto err; - - printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); - assert(system(PING_CMD) != 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - printf("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); - assert(system(PING_CMD) == 0); - - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo/bar"); - goto err; - } - - printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n" - "This ping in cgroup /foo/bar should fail...\n"); - assert(system(PING_CMD) != 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - if (bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo"); - goto err; - } - - printf("Attached PASS from /foo/bar and detached DROP from /foo.\n" - "This ping in cgroup /foo/bar should pass...\n"); - assert(system(PING_CMD) == 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { - errno = 0; - log_err("Unexpected success attaching prog to /foo/bar"); - goto err; - } - - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo/bar"); - goto err; - } - - if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { - errno = 0; - log_err("Unexpected success in double detach from /foo"); - goto err; - } - - if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching non-overridable prog to /foo"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { - errno = 0; - log_err("Unexpected success attaching non-overridable prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { - errno = 0; - log_err("Unexpected success attaching overridable prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { - errno = 0; - log_err("Unexpected success attaching overridable prog to /foo"); - goto err; - } - - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching different non-overridable prog to /foo"); - goto err; - } - - goto out; - -err: - rc = 1; - -out: - close(foo); - close(bar); - cleanup_cgroup_environment(); - if (!rc) - printf("PASS\n"); - else - printf("FAIL\n"); - return rc; -} diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c deleted file mode 100644 index c3cfb23e23b5..000000000000 --- a/samples/bpf/test_cgrp2_sock.c +++ /dev/null @@ -1,86 +0,0 @@ -/* eBPF example program: - * - * - Loads eBPF program - * - * The eBPF program sets the sk_bound_dev_if index in new AF_INET{6} - * sockets opened by processes in the cgroup. - * - * - Attaches the new program to a cgroup using BPF_PROG_ATTACH - */ - -#define _GNU_SOURCE - -#include <stdio.h> -#include <stdlib.h> -#include <stddef.h> -#include <string.h> -#include <unistd.h> -#include <assert.h> -#include <errno.h> -#include <fcntl.h> -#include <net/if.h> -#include <linux/bpf.h> - -#include "libbpf.h" - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -static int prog_load(int idx) -{ - struct bpf_insn prog[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - BPF_MOV64_IMM(BPF_REG_3, idx), - BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)), - BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)), - BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */ - BPF_EXIT_INSN(), - }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - - return bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, - "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); -} - -static int usage(const char *argv0) -{ - printf("Usage: %s cg-path device-index\n", argv0); - return EXIT_FAILURE; -} - -int main(int argc, char **argv) -{ - int cg_fd, prog_fd, ret; - unsigned int idx; - - if (argc < 2) - return usage(argv[0]); - - idx = if_nametoindex(argv[2]); - if (!idx) { - printf("Invalid device name\n"); - return EXIT_FAILURE; - } - - cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); - if (cg_fd < 0) { - printf("Failed to open cgroup path: '%s'\n", strerror(errno)); - return EXIT_FAILURE; - } - - prog_fd = prog_load(idx); - printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); - - if (prog_fd < 0) { - printf("Failed to load prog: '%s'\n", strerror(errno)); - return EXIT_FAILURE; - } - - ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0); - if (ret < 0) { - printf("Failed to attach prog to cgroup: '%s'\n", - strerror(errno)); - return EXIT_FAILURE; - } - - return EXIT_SUCCESS; -} diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh deleted file mode 100755 index 925fd467c7cc..000000000000 --- a/samples/bpf/test_cgrp2_sock.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -function config_device { - ip netns add at_ns0 - ip link add veth0 type veth peer name veth0b - ip link set veth0b up - ip link set veth0 netns at_ns0 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad - ip netns exec at_ns0 ip link set dev veth0 up - ip link add foo type vrf table 1234 - ip link set foo up - ip addr add 172.16.1.101/24 dev veth0b - ip addr add 2401:db00::2/64 dev veth0b nodad - ip link set veth0b master foo -} - -function attach_bpf { - rm -rf /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2 - mount -t cgroup2 none /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2/foo - test_cgrp2_sock /tmp/cgroupv2/foo foo - echo $$ >> /tmp/cgroupv2/foo/cgroup.procs -} - -function cleanup { - set +ex - ip netns delete at_ns0 - ip link del veth0 - ip link del foo - umount /tmp/cgroupv2 - rm -rf /tmp/cgroupv2 - set -ex -} - -function do_test { - ping -c1 -w1 172.16.1.100 - ping6 -c1 -w1 2401:db00::1 -} - -cleanup 2>/dev/null -config_device -attach_bpf -do_test -cleanup -echo "*** PASS ***" diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c deleted file mode 100644 index db036077b644..000000000000 --- a/samples/bpf/test_cgrp2_sock2.c +++ /dev/null @@ -1,66 +0,0 @@ -/* eBPF example program: - * - * - Loads eBPF program - * - * The eBPF program loads a filter from file and attaches the - * program to a cgroup using BPF_PROG_ATTACH - */ - -#define _GNU_SOURCE - -#include <stdio.h> -#include <stdlib.h> -#include <stddef.h> -#include <string.h> -#include <unistd.h> -#include <assert.h> -#include <errno.h> -#include <fcntl.h> -#include <net/if.h> -#include <linux/bpf.h> - -#include "libbpf.h" -#include "bpf_load.h" - -static int usage(const char *argv0) -{ - printf("Usage: %s cg-path filter-path [filter-id]\n", argv0); - return EXIT_FAILURE; -} - -int main(int argc, char **argv) -{ - int cg_fd, ret, filter_id = 0; - - if (argc < 3) - return usage(argv[0]); - - cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); - if (cg_fd < 0) { - printf("Failed to open cgroup path: '%s'\n", strerror(errno)); - return EXIT_FAILURE; - } - - if (load_bpf_file(argv[2])) - return EXIT_FAILURE; - - printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); - - if (argc > 3) - filter_id = atoi(argv[3]); - - if (filter_id > prog_cnt) { - printf("Invalid program id; program not found in file\n"); - return EXIT_FAILURE; - } - - ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, - BPF_CGROUP_INET_SOCK_CREATE, 0); - if (ret < 0) { - printf("Failed to attach prog to cgroup: '%s'\n", - strerror(errno)); - return EXIT_FAILURE; - } - - return EXIT_SUCCESS; -} diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh deleted file mode 100755 index 891f12a0e26f..000000000000 --- a/samples/bpf/test_cgrp2_sock2.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -function config_device { - ip netns add at_ns0 - ip link add veth0 type veth peer name veth0b - ip link set veth0b up - ip link set veth0 netns at_ns0 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add 172.16.1.101/24 dev veth0b - ip addr add 2401:db00::2/64 dev veth0b nodad -} - -function config_cgroup { - rm -rf /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2 - mount -t cgroup2 none /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2/foo - echo $$ >> /tmp/cgroupv2/foo/cgroup.procs -} - - -function attach_bpf { - test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 - [ $? -ne 0 ] && exit 1 -} - -function cleanup { - ip link del veth0b - ip netns delete at_ns0 - umount /tmp/cgroupv2 - rm -rf /tmp/cgroupv2 -} - -cleanup 2>/dev/null - -set -e -config_device -config_cgroup -set +e - -# -# Test 1 - fail ping6 -# -attach_bpf 0 -ping -c1 -w1 172.16.1.100 -if [ $? -ne 0 ]; then - echo "ping failed when it should succeed" - cleanup - exit 1 -fi - -ping6 -c1 -w1 2401:db00::1 -if [ $? -eq 0 ]; then - echo "ping6 succeeded when it should not" - cleanup - exit 1 -fi - -# -# Test 2 - fail ping -# -attach_bpf 1 -ping6 -c1 -w1 2401:db00::1 -if [ $? -ne 0 ]; then - echo "ping6 failed when it should succeed" - cleanup - exit 1 -fi - -ping -c1 -w1 172.16.1.100 -if [ $? -eq 0 ]; then - echo "ping succeeded when it should not" - cleanup - exit 1 -fi - -cleanup -echo -echo "*** PASS ***" diff --git a/samples/bpf/test_cgrp2_tc.sh b/samples/bpf/test_cgrp2_tc.sh deleted file mode 100755 index 0b119eeaf85c..000000000000 --- a/samples/bpf/test_cgrp2_tc.sh +++ /dev/null @@ -1,184 +0,0 @@ -#!/bin/bash - -MY_DIR=$(dirname $0) -# Details on the bpf prog -BPF_CGRP2_ARRAY_NAME='test_cgrp2_array_pin' -BPF_PROG="$MY_DIR/test_cgrp2_tc_kern.o" -BPF_SECTION='filter' - -[ -z "$TC" ] && TC='tc' -[ -z "$IP" ] && IP='ip' - -# Names of the veth interface, net namespace...etc. -HOST_IFC='ve' -NS_IFC='vens' -NS='ns' - -find_mnt() { - cat /proc/mounts | \ - awk '{ if ($3 == "'$1'" && mnt == "") { mnt = $2 }} END { print mnt }' -} - -# Init cgroup2 vars -init_cgrp2_vars() { - CGRP2_ROOT=$(find_mnt cgroup2) - if [ -z "$CGRP2_ROOT" ] - then - CGRP2_ROOT='/mnt/cgroup2' - MOUNT_CGRP2="yes" - fi - CGRP2_TC="$CGRP2_ROOT/tc" - CGRP2_TC_LEAF="$CGRP2_TC/leaf" -} - -# Init bpf fs vars -init_bpf_fs_vars() { - local bpf_fs_root=$(find_mnt bpf) - [ -n "$bpf_fs_root" ] || return -1 - BPF_FS_TC_SHARE="$bpf_fs_root/tc/globals" -} - -setup_cgrp2() { - case $1 in - start) - if [ "$MOUNT_CGRP2" == 'yes' ] - then - [ -d $CGRP2_ROOT ] || mkdir -p $CGRP2_ROOT - mount -t cgroup2 none $CGRP2_ROOT || return $? - fi - mkdir -p $CGRP2_TC_LEAF - ;; - *) - rmdir $CGRP2_TC_LEAF && rmdir $CGRP2_TC - [ "$MOUNT_CGRP2" == 'yes' ] && umount $CGRP2_ROOT - ;; - esac -} - -setup_bpf_cgrp2_array() { - local bpf_cgrp2_array="$BPF_FS_TC_SHARE/$BPF_CGRP2_ARRAY_NAME" - case $1 in - start) - $MY_DIR/test_cgrp2_array_pin -U $bpf_cgrp2_array -v $CGRP2_TC - ;; - *) - [ -d "$BPF_FS_TC_SHARE" ] && rm -f $bpf_cgrp2_array - ;; - esac -} - -setup_net() { - case $1 in - start) - $IP link add $HOST_IFC type veth peer name $NS_IFC || return $? - $IP link set dev $HOST_IFC up || return $? - sysctl -q net.ipv6.conf.$HOST_IFC.accept_dad=0 - - $IP netns add ns || return $? - $IP link set dev $NS_IFC netns ns || return $? - $IP -n $NS link set dev $NS_IFC up || return $? - $IP netns exec $NS sysctl -q net.ipv6.conf.$NS_IFC.accept_dad=0 - $TC qdisc add dev $HOST_IFC clsact || return $? - $TC filter add dev $HOST_IFC egress bpf da obj $BPF_PROG sec $BPF_SECTION || return $? - ;; - *) - $IP netns del $NS - $IP link del $HOST_IFC - ;; - esac -} - -run_in_cgrp() { - # Fork another bash and move it under the specified cgroup. - # It makes the cgroup cleanup easier at the end of the test. - cmd='echo $$ > ' - cmd="$cmd $1/cgroup.procs; exec $2" - bash -c "$cmd" -} - -do_test() { - run_in_cgrp $CGRP2_TC_LEAF "ping -6 -c3 ff02::1%$HOST_IFC >& /dev/null" - local dropped=$($TC -s qdisc show dev $HOST_IFC | tail -3 | \ - awk '/drop/{print substr($7, 0, index($7, ",")-1)}') - if [[ $dropped -eq 0 ]] - then - echo "FAIL" - return 1 - else - echo "Successfully filtered $dropped packets" - return 0 - fi -} - -do_exit() { - if [ "$DEBUG" == "yes" ] && [ "$MODE" != 'cleanuponly' ] - then - echo "------ DEBUG ------" - echo "mount: "; mount | egrep '(cgroup2|bpf)'; echo - echo "$CGRP2_TC_LEAF: "; ls -l $CGRP2_TC_LEAF; echo - if [ -d "$BPF_FS_TC_SHARE" ] - then - echo "$BPF_FS_TC_SHARE: "; ls -l $BPF_FS_TC_SHARE; echo - fi - echo "Host net:" - $IP netns - $IP link show dev $HOST_IFC - $IP -6 a show dev $HOST_IFC - $TC -s qdisc show dev $HOST_IFC - echo - echo "$NS net:" - $IP -n $NS link show dev $NS_IFC - $IP -n $NS -6 link show dev $NS_IFC - echo "------ DEBUG ------" - echo - fi - - if [ "$MODE" != 'nocleanup' ] - then - setup_net stop - setup_bpf_cgrp2_array stop - setup_cgrp2 stop - fi -} - -init_cgrp2_vars -init_bpf_fs_vars - -while [[ $# -ge 1 ]] -do - a="$1" - case $a in - debug) - DEBUG='yes' - shift 1 - ;; - cleanup-only) - MODE='cleanuponly' - shift 1 - ;; - no-cleanup) - MODE='nocleanup' - shift 1 - ;; - *) - echo "test_cgrp2_tc [debug] [cleanup-only | no-cleanup]" - echo " debug: Print cgrp and network setup details at the end of the test" - echo " cleanup-only: Try to cleanup things from last test. No test will be run" - echo " no-cleanup: Run the test but don't do cleanup at the end" - echo "[Note: If no arg is given, it will run the test and do cleanup at the end]" - echo - exit -1 - ;; - esac -done - -trap do_exit 0 - -[ "$MODE" == 'cleanuponly' ] && exit - -setup_cgrp2 start || exit $? -setup_net start || exit $? -init_bpf_fs_vars || exit $? -setup_bpf_cgrp2_array start || exit $? -do_test -echo diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c deleted file mode 100644 index 1547b36a7b7b..000000000000 --- a/samples/bpf/test_cgrp2_tc_kern.c +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#define KBUILD_MODNAME "foo" -#include <uapi/linux/if_ether.h> -#include <uapi/linux/in6.h> -#include <uapi/linux/ipv6.h> -#include <uapi/linux/pkt_cls.h> -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" - -/* copy of 'struct ethhdr' without __packed */ -struct eth_hdr { - unsigned char h_dest[ETH_ALEN]; - unsigned char h_source[ETH_ALEN]; - unsigned short h_proto; -}; - -#define PIN_GLOBAL_NS 2 -struct bpf_elf_map { - __u32 type; - __u32 size_key; - __u32 size_value; - __u32 max_elem; - __u32 flags; - __u32 id; - __u32 pinning; -}; - -struct bpf_elf_map SEC("maps") test_cgrp2_array_pin = { - .type = BPF_MAP_TYPE_CGROUP_ARRAY, - .size_key = sizeof(uint32_t), - .size_value = sizeof(uint32_t), - .pinning = PIN_GLOBAL_NS, - .max_elem = 1, -}; - -SEC("filter") -int handle_egress(struct __sk_buff *skb) -{ - void *data = (void *)(long)skb->data; - struct eth_hdr *eth = data; - struct ipv6hdr *ip6h = data + sizeof(*eth); - void *data_end = (void *)(long)skb->data_end; - char dont_care_msg[] = "dont care %04x %d\n"; - char pass_msg[] = "pass\n"; - char reject_msg[] = "reject\n"; - - /* single length check */ - if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) - return TC_ACT_OK; - - if (eth->h_proto != htons(ETH_P_IPV6) || - ip6h->nexthdr != IPPROTO_ICMPV6) { - bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg), - eth->h_proto, ip6h->nexthdr); - return TC_ACT_OK; - } else if (bpf_skb_under_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) { - bpf_trace_printk(pass_msg, sizeof(pass_msg)); - return TC_ACT_OK; - } else { - bpf_trace_printk(reject_msg, sizeof(reject_msg)); - return TC_ACT_SHOT; - } -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_cls_bpf.sh b/samples/bpf/test_cls_bpf.sh index 0365d5ee512c..aaddd67b37ff 100755 --- a/samples/bpf/test_cls_bpf.sh +++ b/samples/bpf/test_cls_bpf.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 function pktgen { ../pktgen/pktgen_bench_xmit_mode_netif_receive.sh -i $IFC -s 64 \ diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c deleted file mode 100644 index 86b28d7d6c99..000000000000 --- a/samples/bpf/test_current_task_under_cgroup_kern.c +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ - -#include <linux/ptrace.h> -#include <uapi/linux/bpf.h> -#include <linux/version.h> -#include "bpf_helpers.h" -#include <uapi/linux/utsname.h> - -struct bpf_map_def SEC("maps") cgroup_map = { - .type = BPF_MAP_TYPE_CGROUP_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 1, -}; - -struct bpf_map_def SEC("maps") perf_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 1, -}; - -/* Writes the last PID that called sync to a map at index 0 */ -SEC("kprobe/sys_sync") -int bpf_prog1(struct pt_regs *ctx) -{ - u64 pid = bpf_get_current_pid_tgid(); - int idx = 0; - - if (!bpf_current_task_under_cgroup(&cgroup_map, 0)) - return 0; - - bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY); - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c deleted file mode 100644 index 65b5fb51c1db..000000000000 --- a/samples/bpf/test_current_task_under_cgroup_user.c +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ - -#define _GNU_SOURCE -#include <stdio.h> -#include <linux/bpf.h> -#include <unistd.h> -#include "libbpf.h" -#include "bpf_load.h" -#include <linux/bpf.h> -#include "cgroup_helpers.h" - -#define CGROUP_PATH "/my-cgroup" - -int main(int argc, char **argv) -{ - pid_t remote_pid, local_pid = getpid(); - int cg2, idx = 0, rc = 0; - char filename[256]; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - - if (setup_cgroup_environment()) - goto err; - - cg2 = create_and_get_cgroup(CGROUP_PATH); - - if (!cg2) - goto err; - - if (bpf_map_update_elem(map_fd[0], &idx, &cg2, BPF_ANY)) { - log_err("Adding target cgroup to map"); - goto err; - } - - if (join_cgroup(CGROUP_PATH)) - goto err; - - /* - * The installed helper program catched the sync call, and should - * write it to the map. - */ - - sync(); - bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid); - - if (local_pid != remote_pid) { - fprintf(stderr, - "BPF Helper didn't write correct PID to map, but: %d\n", - remote_pid); - goto err; - } - - /* Verify the negative scenario; leave the cgroup */ - if (join_cgroup("/")) - goto err; - - remote_pid = 0; - bpf_map_update_elem(map_fd[1], &idx, &remote_pid, BPF_ANY); - - sync(); - bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid); - - if (local_pid == remote_pid) { - fprintf(stderr, "BPF cgroup negative test did not work\n"); - goto err; - } - - goto out; -err: - rc = 1; - -out: - close(cg2); - cleanup_cgroup_environment(); - return rc; -} diff --git a/samples/bpf/test_ipip.sh b/samples/bpf/test_ipip.sh deleted file mode 100755 index 196925403ab4..000000000000 --- a/samples/bpf/test_ipip.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/bin/bash - -function config_device { - ip netns add at_ns0 - ip netns add at_ns1 - ip netns add at_ns2 - ip link add veth0 type veth peer name veth0b - ip link add veth1 type veth peer name veth1b - ip link add veth2 type veth peer name veth2b - ip link set veth0b up - ip link set veth1b up - ip link set veth2b up - ip link set dev veth0b mtu 1500 - ip link set dev veth1b mtu 1500 - ip link set dev veth2b mtu 1500 - ip link set veth0 netns at_ns0 - ip link set veth1 netns at_ns1 - ip link set veth2 netns at_ns2 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad - ip netns exec at_ns0 ip link set dev veth0 up - ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1 - ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad - ip netns exec at_ns1 ip link set dev veth1 up - ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2 - ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad - ip netns exec at_ns2 ip link set dev veth2 up - ip link add br0 type bridge - ip link set br0 up - ip link set dev br0 mtu 1500 - ip link set veth0b master br0 - ip link set veth1b master br0 - ip link set veth2b master br0 -} - -function add_ipip_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ipip local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS type ipip local 172.16.1.101 remote 172.16.1.200 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 - - ip netns exec at_ns2 ip link add dev $DEV type ipip external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 -} - -function add_ipip6_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::1/64 remote 2401:db00::3/64 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::2/64 remote 2401:db00::3/64 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 - - ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 -} - -function add_ip6ip6_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::1/64 remote 2401:db00::3/64 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 2601:646::1/64 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::2/64 remote 2401:db00::3/64 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 2601:646::1/64 - - ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64 -} - -function attach_bpf { - DEV=$1 - SET_TUNNEL=$2 - GET_TUNNEL=$3 - ip netns exec at_ns2 tc qdisc add dev $DEV clsact - ip netns exec at_ns2 tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL - ip netns exec at_ns2 tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL -} - -function test_ipip { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ipip_tunnel - attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel - - ip netns exec at_ns0 ping -c 1 10.1.1.200 - ip netns exec at_ns2 ping -c 1 10.1.1.100 - ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 - cleanup -} - -# IPv4 over IPv6 tunnel -function test_ipip6 { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ipip6_tunnel - attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel - - ip netns exec at_ns0 ping -c 1 10.1.1.200 - ip netns exec at_ns2 ping -c 1 10.1.1.100 - ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 - cleanup -} - -# IPv6 over IPv6 tunnel -function test_ip6ip6 { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ip6ip6_tunnel - attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel - - ip netns exec at_ns0 ping -6 -c 1 2601:646::2 - ip netns exec at_ns2 ping -6 -c 1 2601:646::1 - ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200 - ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201 - cleanup -} - -function cleanup { - set +ex - pkill iperf - ip netns delete at_ns0 - ip netns delete at_ns1 - ip netns delete at_ns2 - ip link del veth0 - ip link del veth1 - ip link del veth2 - ip link del br0 - pkill tcpdump - pkill cat - set -ex -} - -cleanup -echo "Testing IP tunnels..." -test_ipip -test_ipip6 -test_ip6ip6 -echo "*** PASS ***" diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c index 73c357142268..1c161276d57b 100644 --- a/samples/bpf/test_lru_dist.c +++ b/samples/bpf/test_lru_dist.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #define _GNU_SOURCE #include <linux/types.h> @@ -16,12 +13,11 @@ #include <sched.h> #include <sys/wait.h> #include <sys/stat.h> -#include <sys/resource.h> #include <fcntl.h> #include <stdlib.h> #include <time.h> -#include "libbpf.h" +#include <bpf/bpf.h> #include "bpf_util.h" #define min(a, b) ((a) < (b) ? (a) : (b)) @@ -46,11 +42,6 @@ static inline void INIT_LIST_HEAD(struct list_head *list) list->prev = list; } -static inline int list_empty(const struct list_head *head) -{ - return head->next == head; -} - static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) @@ -108,10 +99,10 @@ struct pfect_lru { static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size, unsigned int nr_possible_elems) { - lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, + lru->map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(unsigned long long), sizeof(struct pfect_lru_node *), - nr_possible_elems, 0); + nr_possible_elems, NULL); assert(lru->map_fd != -1); lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node)); @@ -210,10 +201,13 @@ static unsigned int read_keys(const char *dist_file, static int create_map(int map_type, int map_flags, unsigned int size) { + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = map_flags, + ); int map_fd; - map_fd = bpf_create_map(map_type, sizeof(unsigned long long), - sizeof(unsigned long long), size, map_flags); + map_fd = bpf_map_create(map_type, NULL, sizeof(unsigned long long), + sizeof(unsigned long long), size, &opts); if (map_fd == -1) perror("bpf_create_map"); @@ -492,7 +486,6 @@ static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int map_flags[] = {0, BPF_F_NO_COMMON_LRU}; const char *dist_file; int nr_tasks = 1; @@ -511,8 +504,6 @@ int main(int argc, char **argv) setbuf(stdout, NULL); - assert(!setrlimit(RLIMIT_MEMLOCK, &r)); - srand(time(NULL)); nr_cpus = bpf_num_possible_cpus(); @@ -527,7 +518,7 @@ int main(int argc, char **argv) return -1; } - for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) { + for (f = 0; f < ARRAY_SIZE(map_flags); f++) { test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]); test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]); test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f], diff --git a/samples/bpf/test_lwt_bpf.c b/samples/bpf/test_lwt_bpf.c index bacc8013436b..9a13dbb81847 100644 --- a/samples/bpf/test_lwt_bpf.c +++ b/samples/bpf/test_lwt_bpf.c @@ -10,17 +10,9 @@ * General Public License for more details. */ -#include <stdint.h> -#include <stddef.h> -#include <linux/bpf.h> -#include <linux/ip.h> -#include <linux/in.h> -#include <linux/in6.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmpv6.h> -#include <linux/if_ether.h> -#include "bpf_helpers.h" +#include "vmlinux.h" +#include "net_shared.h" +#include <bpf/bpf_helpers.h> #include <string.h> # define printk(fmt, ...) \ @@ -44,9 +36,9 @@ SEC("test_ctx") int do_test_ctx(struct __sk_buff *skb) { skb->cb[0] = CB_MAGIC; - printk("len %d hash %d protocol %d\n", skb->len, skb->hash, + printk("len %d hash %d protocol %d", skb->len, skb->hash, skb->protocol); - printk("cb %d ingress_ifindex %d ifindex %d\n", skb->cb[0], + printk("cb %d ingress_ifindex %d ifindex %d", skb->cb[0], skb->ingress_ifindex, skb->ifindex); return BPF_OK; @@ -56,9 +48,9 @@ int do_test_ctx(struct __sk_buff *skb) SEC("test_cb") int do_test_cb(struct __sk_buff *skb) { - printk("cb0: %x cb1: %x cb2: %x\n", skb->cb[0], skb->cb[1], + printk("cb0: %x cb1: %x cb2: %x", skb->cb[0], skb->cb[1], skb->cb[2]); - printk("cb3: %x cb4: %x\n", skb->cb[3], skb->cb[4]); + printk("cb3: %x cb4: %x", skb->cb[3], skb->cb[4]); return BPF_OK; } @@ -72,11 +64,11 @@ int do_test_data(struct __sk_buff *skb) struct iphdr *iph = data; if (data + sizeof(*iph) > data_end) { - printk("packet truncated\n"); + printk("packet truncated"); return BPF_DROP; } - printk("src: %x dst: %x\n", iph->saddr, iph->daddr); + printk("src: %x dst: %x", iph->saddr, iph->daddr); return BPF_OK; } @@ -97,7 +89,7 @@ static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip, ret = bpf_skb_load_bytes(skb, IP_PROTO_OFF, &proto, 1); if (ret < 0) { - printk("bpf_l4_csum_replace failed: %d\n", ret); + printk("bpf_l4_csum_replace failed: %d", ret); return BPF_DROP; } @@ -120,14 +112,14 @@ static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip, ret = bpf_l4_csum_replace(skb, off, old_ip, new_ip, flags | sizeof(new_ip)); if (ret < 0) { - printk("bpf_l4_csum_replace failed: %d\n"); + printk("bpf_l4_csum_replace failed: %d"); return BPF_DROP; } } ret = bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); if (ret < 0) { - printk("bpf_l3_csum_replace failed: %d\n", ret); + printk("bpf_l3_csum_replace failed: %d", ret); return BPF_DROP; } @@ -137,7 +129,7 @@ static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip, ret = bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0); if (ret < 0) { - printk("bpf_skb_store_bytes() failed: %d\n", ret); + printk("bpf_skb_store_bytes() failed: %d", ret); return BPF_DROP; } @@ -153,12 +145,12 @@ int do_test_rewrite(struct __sk_buff *skb) ret = bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, 4); if (ret < 0) { - printk("bpf_skb_load_bytes failed: %d\n", ret); + printk("bpf_skb_load_bytes failed: %d", ret); return BPF_DROP; } if (old_ip == 0x2fea8c0) { - printk("out: rewriting from %x to %x\n", old_ip, new_ip); + printk("out: rewriting from %x to %x", old_ip, new_ip); return rewrite(skb, old_ip, new_ip, 1); } @@ -173,16 +165,16 @@ static inline int __do_push_ll_and_redirect(struct __sk_buff *skb) ret = bpf_skb_change_head(skb, 14, 0); if (ret < 0) { - printk("skb_change_head() failed: %d\n", ret); + printk("skb_change_head() failed: %d", ret); } - ehdr.h_proto = __constant_htons(ETH_P_IP); + ehdr.h_proto = bpf_htons(ETH_P_IP); memcpy(&ehdr.h_source, &smac, 6); memcpy(&ehdr.h_dest, &dmac, 6); ret = bpf_skb_store_bytes(skb, 0, &ehdr, sizeof(ehdr), 0); if (ret < 0) { - printk("skb_store_bytes() failed: %d\n", ret); + printk("skb_store_bytes() failed: %d", ret); return BPF_DROP; } @@ -202,7 +194,7 @@ int do_push_ll_and_redirect(struct __sk_buff *skb) ret = __do_push_ll_and_redirect(skb); if (ret >= 0) - printk("redirected to %d\n", ifindex); + printk("redirected to %d", ifindex); return ret; } @@ -229,7 +221,7 @@ SEC("fill_garbage") int do_fill_garbage(struct __sk_buff *skb) { __fill_garbage(skb); - printk("Set initial 96 bytes of header to FF\n"); + printk("Set initial 96 bytes of header to FF"); return BPF_OK; } @@ -238,7 +230,7 @@ int do_fill_garbage_and_redirect(struct __sk_buff *skb) { int ifindex = DST_IFINDEX; __fill_garbage(skb); - printk("redirected to %d\n", ifindex); + printk("redirected to %d", ifindex); return bpf_redirect(ifindex, 0); } @@ -246,7 +238,7 @@ int do_fill_garbage_and_redirect(struct __sk_buff *skb) SEC("drop_all") int do_drop_all(struct __sk_buff *skb) { - printk("dropping with: %d\n", BPF_DROP); + printk("dropping with: %d", BPF_DROP); return BPF_DROP; } diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh index a695ae268c40..148e2df6cdce 100644..100755 --- a/samples/bpf/test_lwt_bpf.sh +++ b/samples/bpf/test_lwt_bpf.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 # Uncomment to see generated bytecode #VERBOSE=verbose @@ -18,7 +19,10 @@ IPVETH3="192.168.111.2" IP_LOCAL="192.168.99.1" -TRACE_ROOT=/sys/kernel/debug/tracing +PROG_SRC="test_lwt_bpf.c" +BPF_PROG="test_lwt_bpf.o" +TRACE_ROOT=/sys/kernel/tracing +CONTEXT_INFO=$(cat ${TRACE_ROOT}/trace_options | grep context) function lookup_mac() { @@ -35,7 +39,7 @@ function lookup_mac() function cleanup { set +ex - rm test_lwt_bpf.o 2> /dev/null + rm $BPF_PROG 2> /dev/null ip link del $VETH0 2> /dev/null ip link del $VETH1 2> /dev/null ip link del $VETH2 2> /dev/null @@ -75,7 +79,7 @@ function install_test { cleanup_routes cp /dev/null ${TRACE_ROOT}/trace - OPTS="encap bpf headroom 14 $1 obj test_lwt_bpf.o section $2 $VERBOSE" + OPTS="encap bpf headroom 14 $1 obj $BPF_PROG section $2 $VERBOSE" if [ "$1" == "in" ]; then ip route add table local local ${IP_LOCAL}/32 $OPTS dev lo @@ -95,7 +99,7 @@ function remove_prog { function filter_trace { # Add newline to allow starting EXPECT= variables on newline NL=$'\n' - echo "${NL}$*" | sed -e 's/^.*: : //g' + echo "${NL}$*" | sed -e 's/bpf_trace_printk: //g' } function expect_fail { @@ -159,11 +163,11 @@ function test_ctx_out { failure "test_ctx out: packets are dropped" } match_trace "$(get_trace)" " -len 84 hash 0 protocol 0 +len 84 hash 0 protocol 8 cb 1234 ingress_ifindex 0 ifindex 0 -len 84 hash 0 protocol 0 +len 84 hash 0 protocol 8 cb 1234 ingress_ifindex 0 ifindex 0 -len 84 hash 0 protocol 0 +len 84 hash 0 protocol 8 cb 1234 ingress_ifindex 0 ifindex 0" || exit 1 remove_prog out } @@ -366,14 +370,15 @@ setup_one_veth $NS1 $VETH0 $VETH1 $IPVETH0 $IPVETH1 $IPVETH1b setup_one_veth $NS2 $VETH2 $VETH3 $IPVETH2 $IPVETH3 ip netns exec $NS1 netserver echo 1 > ${TRACE_ROOT}/tracing_on +echo nocontext-info > ${TRACE_ROOT}/trace_options DST_MAC=$(lookup_mac $VETH1 $NS1) SRC_MAC=$(lookup_mac $VETH0) DST_IFINDEX=$(cat /sys/class/net/$VETH0/ifindex) -CLANG_OPTS="-O2 -target bpf -I ../include/" +CLANG_OPTS="-O2 --target=bpf -I ../include/" CLANG_OPTS+=" -DSRC_MAC=$SRC_MAC -DDST_MAC=$DST_MAC -DDST_IFINDEX=$DST_IFINDEX" -clang $CLANG_OPTS -c test_lwt_bpf.c -o test_lwt_bpf.o +clang $CLANG_OPTS -c $PROG_SRC -o $BPF_PROG test_ctx_xmit test_ctx_out @@ -396,4 +401,5 @@ test_netperf_redirect cleanup echo 0 > ${TRACE_ROOT}/tracing_on +echo $CONTEXT_INFO > ${TRACE_ROOT}/trace_options exit 0 diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map.bpf.c index 42c44d091dd1..9f030f9c4e1b 100644 --- a/samples/bpf/test_map_in_map_kern.c +++ b/samples/bpf/test_map_in_map.bpf.c @@ -6,69 +6,72 @@ * License as published by the Free Software Foundation. */ #define KBUILD_MODNAME "foo" -#include <linux/ptrace.h> +#include "vmlinux.h" #include <linux/version.h> -#include <uapi/linux/bpf.h> -#include <uapi/linux/in6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> #define MAX_NR_PORTS 65536 +#define EINVAL 22 +#define ENOENT 2 + /* map #0 */ -struct bpf_map_def SEC("maps") port_a = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(int), - .max_entries = MAX_NR_PORTS, -}; +struct inner_a { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, int); + __uint(max_entries, MAX_NR_PORTS); +} port_a SEC(".maps"); /* map #1 */ -struct bpf_map_def SEC("maps") port_h = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(int), - .max_entries = 1, -}; +struct inner_h { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} port_h SEC(".maps"); /* map #2 */ -struct bpf_map_def SEC("maps") reg_result_h = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(int), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} reg_result_h SEC(".maps"); /* map #3 */ -struct bpf_map_def SEC("maps") inline_result_h = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(int), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, int); + __uint(max_entries, 1); +} inline_result_h SEC(".maps"); /* map #4 */ /* Test case #0 */ -struct bpf_map_def SEC("maps") a_of_port_a = { - .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, - .key_size = sizeof(u32), - .inner_map_idx = 0, /* map_fd[0] is port_a */ - .max_entries = MAX_NR_PORTS, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_NR_PORTS); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_a); /* use inner_a as inner map */ +} a_of_port_a SEC(".maps"); /* map #5 */ /* Test case #1 */ -struct bpf_map_def SEC("maps") h_of_port_a = { - .type = BPF_MAP_TYPE_HASH_OF_MAPS, - .key_size = sizeof(u32), - .inner_map_idx = 0, /* map_fd[0] is port_a */ - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_a); /* use inner_a as inner map */ +} h_of_port_a SEC(".maps"); /* map #6 */ /* Test case #2 */ -struct bpf_map_def SEC("maps") h_of_port_h = { - .type = BPF_MAP_TYPE_HASH_OF_MAPS, - .key_size = sizeof(u32), - .inner_map_idx = 1, /* map_fd[1] is port_h */ - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __uint(key_size, sizeof(u32)); + __array(values, struct inner_h); /* use inner_h as inner map */ +} h_of_port_h SEC(".maps"); static __always_inline int do_reg_lookup(void *inner_map, u32 port) { @@ -100,23 +103,19 @@ static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) return result ? *result : -ENOENT; } -SEC("kprobe/sys_connect") -int trace_sys_connect(struct pt_regs *ctx) +SEC("ksyscall/connect") +int BPF_KSYSCALL(trace_sys_connect, unsigned int fd, struct sockaddr_in6 *in6, int addrlen) { - struct sockaddr_in6 *in6; u16 test_case, port, dst6[8]; - int addrlen, ret, inline_ret, ret_key = 0; + int ret, inline_ret, ret_key = 0; u32 port_key; void *outer_map, *inner_map; bool inline_hash = false; - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); - addrlen = (int)PT_REGS_PARM3(ctx); - if (addrlen != sizeof(*in6)) return 0; - ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); + ret = bpf_probe_read_user(dst6, sizeof(dst6), &in6->sin6_addr); if (ret) { inline_ret = ret; goto done; @@ -127,7 +126,7 @@ int trace_sys_connect(struct pt_regs *ctx) test_case = dst6[7]; - ret = bpf_probe_read(&port, sizeof(port), &in6->sin6_port); + ret = bpf_probe_read_user(&port, sizeof(port), &in6->sin6_port); if (ret) { inline_ret = ret; goto done; diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c index 1aca18539d8d..55dca43f3723 100644 --- a/samples/bpf/test_map_in_map_user.c +++ b/samples/bpf/test_map_in_map_user.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ -#include <sys/resource.h> #include <sys/socket.h> #include <arpa/inet.h> #include <stdint.h> @@ -13,8 +9,12 @@ #include <errno.h> #include <stdlib.h> #include <stdio.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_util.h" + +static int map_fd[7]; #define PORT_A (map_fd[0]) #define PORT_H (map_fd[1]) @@ -30,7 +30,7 @@ static const char * const test_names[] = { "Hash of Hash", }; -#define NR_TESTS (sizeof(test_names) / sizeof(*test_names)) +#define NR_TESTS ARRAY_SIZE(test_names) static void check_map_id(int inner_map_fd, int map_in_map_fd, uint32_t key) { @@ -38,7 +38,7 @@ static void check_map_id(int inner_map_fd, int map_in_map_fd, uint32_t key) uint32_t info_len = sizeof(info); int ret, id; - ret = bpf_obj_get_info_by_fd(inner_map_fd, &info, &info_len); + ret = bpf_map_get_info_by_fd(inner_map_fd, &info, &info_len); assert(!ret); ret = bpf_map_lookup_elem(map_in_map_fd, &key, &id); @@ -115,19 +115,54 @@ static void test_map_in_map(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - assert(!setrlimit(RLIMIT_MEMLOCK, &r)); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "trace_sys_connect"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "port_a"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "port_h"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "reg_result_h"); + map_fd[3] = bpf_object__find_map_fd_by_name(obj, "inline_result_h"); + map_fd[4] = bpf_object__find_map_fd_by_name(obj, "a_of_port_a"); + map_fd[5] = bpf_object__find_map_fd_by_name(obj, "h_of_port_a"); + map_fd[6] = bpf_object__find_map_fd_by_name(obj, "h_of_port_h"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0 || + map_fd[3] < 0 || map_fd[4] < 0 || map_fd[5] < 0 || map_fd[6] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } test_map_in_map(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c deleted file mode 100644 index 468a66a92ef9..000000000000 --- a/samples/bpf/test_overhead_kprobe_kern.c +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/version.h> -#include <linux/ptrace.h> -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" - -#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) - -SEC("kprobe/__set_task_comm") -int prog(struct pt_regs *ctx) -{ - struct signal_struct *signal; - struct task_struct *tsk; - char oldcomm[16] = {}; - char newcomm[16] = {}; - u16 oom_score_adj; - u32 pid; - - tsk = (void *)PT_REGS_PARM1(ctx); - - pid = _(tsk->pid); - bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm); - bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx)); - signal = _(tsk->signal); - oom_score_adj = _(signal->oom_score_adj); - return 0; -} - -SEC("kprobe/urandom_read") -int prog2(struct pt_regs *ctx) -{ - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c deleted file mode 100644 index 38f5c0b9da9f..000000000000 --- a/samples/bpf/test_overhead_tp_kern.c +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" - -/* from /sys/kernel/debug/tracing/events/task/task_rename/format */ -struct task_rename { - __u64 pad; - __u32 pid; - char oldcomm[16]; - char newcomm[16]; - __u16 oom_score_adj; -}; -SEC("tracepoint/task/task_rename") -int prog(struct task_rename *ctx) -{ - return 0; -} - -/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */ -struct urandom_read { - __u64 pad; - int got_bits; - int pool_left; - int input_left; -}; -SEC("tracepoint/random/urandom_read") -int prog2(struct urandom_read *ctx) -{ - return 0; -} -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c deleted file mode 100644 index d291167fd3c7..000000000000 --- a/samples/bpf/test_overhead_user.c +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#define _GNU_SOURCE -#include <sched.h> -#include <stdio.h> -#include <sys/types.h> -#include <asm/unistd.h> -#include <fcntl.h> -#include <unistd.h> -#include <assert.h> -#include <sys/wait.h> -#include <stdlib.h> -#include <signal.h> -#include <linux/bpf.h> -#include <string.h> -#include <time.h> -#include <sys/resource.h> -#include "libbpf.h" -#include "bpf_load.h" - -#define MAX_CNT 1000000 - -static __u64 time_get_ns(void) -{ - struct timespec ts; - - clock_gettime(CLOCK_MONOTONIC, &ts); - return ts.tv_sec * 1000000000ull + ts.tv_nsec; -} - -static void test_task_rename(int cpu) -{ - __u64 start_time; - char buf[] = "test\n"; - int i, fd; - - fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); - if (fd < 0) { - printf("couldn't open /proc\n"); - exit(1); - } - start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) - write(fd, buf, sizeof(buf)); - printf("task_rename:%d: %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); - close(fd); -} - -static void test_urandom_read(int cpu) -{ - __u64 start_time; - char buf[4]; - int i, fd; - - fd = open("/dev/urandom", O_RDONLY); - if (fd < 0) { - printf("couldn't open /dev/urandom\n"); - exit(1); - } - start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) - read(fd, buf, sizeof(buf)); - printf("urandom_read:%d: %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); - close(fd); -} - -static void loop(int cpu, int flags) -{ - cpu_set_t cpuset; - - CPU_ZERO(&cpuset); - CPU_SET(cpu, &cpuset); - sched_setaffinity(0, sizeof(cpuset), &cpuset); - - if (flags & 1) - test_task_rename(cpu); - if (flags & 2) - test_urandom_read(cpu); -} - -static void run_perf_test(int tasks, int flags) -{ - pid_t pid[tasks]; - int i; - - for (i = 0; i < tasks; i++) { - pid[i] = fork(); - if (pid[i] == 0) { - loop(i, flags); - exit(0); - } else if (pid[i] == -1) { - printf("couldn't spawn #%d process\n", i); - exit(1); - } - } - for (i = 0; i < tasks; i++) { - int status; - - assert(waitpid(pid[i], &status, 0) == pid[i]); - assert(status == 0); - } -} - -static void unload_progs(void) -{ - close(prog_fd[0]); - close(prog_fd[1]); - close(event_fd[0]); - close(event_fd[1]); -} - -int main(int argc, char **argv) -{ - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - char filename[256]; - int num_cpu = 8; - int test_flags = ~0; - - setrlimit(RLIMIT_MEMLOCK, &r); - - if (argc > 1) - test_flags = atoi(argv[1]) ? : test_flags; - if (argc > 2) - num_cpu = atoi(argv[2]) ? : num_cpu; - - if (test_flags & 0x3) { - printf("BASE\n"); - run_perf_test(num_cpu, test_flags); - } - - if (test_flags & 0xC) { - snprintf(filename, sizeof(filename), - "%s_kprobe_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - printf("w/KPROBE\n"); - run_perf_test(num_cpu, test_flags >> 2); - unload_progs(); - } - - if (test_flags & 0x30) { - snprintf(filename, sizeof(filename), - "%s_tp_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - printf("w/TRACEPOINT\n"); - run_perf_test(num_cpu, test_flags >> 4); - unload_progs(); - } - - return 0; -} diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c deleted file mode 100644 index 3a677c807044..000000000000 --- a/samples/bpf/test_probe_write_user_kern.c +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <uapi/linux/bpf.h> -#include <linux/version.h> -#include "bpf_helpers.h" - -struct bpf_map_def SEC("maps") dnat_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct sockaddr_in), - .value_size = sizeof(struct sockaddr_in), - .max_entries = 256, -}; - -/* kprobe is NOT a stable ABI - * kernel functions can be removed, renamed or completely change semantics. - * Number of arguments and their positions can change, etc. - * In such case this bpf+kprobe example will no longer be meaningful - * - * This example sits on a syscall, and the syscall ABI is relatively stable - * of course, across platforms, and over time, the ABI may change. - */ -SEC("kprobe/sys_connect") -int bpf_prog1(struct pt_regs *ctx) -{ - struct sockaddr_in new_addr, orig_addr = {}; - struct sockaddr_in *mapped_addr; - void *sockaddr_arg = (void *)PT_REGS_PARM2(ctx); - int sockaddr_len = (int)PT_REGS_PARM3(ctx); - - if (sockaddr_len > sizeof(orig_addr)) - return 0; - - if (bpf_probe_read(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0) - return 0; - - mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr); - if (mapped_addr != NULL) { - memcpy(&new_addr, mapped_addr, sizeof(new_addr)); - bpf_probe_write_user(sockaddr_arg, &new_addr, - sizeof(new_addr)); - } - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c deleted file mode 100644 index b5bf178a6ecc..000000000000 --- a/samples/bpf/test_probe_write_user_user.c +++ /dev/null @@ -1,78 +0,0 @@ -#include <stdio.h> -#include <assert.h> -#include <linux/bpf.h> -#include <unistd.h> -#include "libbpf.h" -#include "bpf_load.h" -#include <sys/socket.h> -#include <string.h> -#include <netinet/in.h> -#include <arpa/inet.h> - -int main(int ac, char **argv) -{ - int serverfd, serverconnfd, clientfd; - socklen_t sockaddr_len; - struct sockaddr serv_addr, mapped_addr, tmp_addr; - struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in; - char filename[256]; - char *ip; - - serv_addr_in = (struct sockaddr_in *)&serv_addr; - mapped_addr_in = (struct sockaddr_in *)&mapped_addr; - tmp_addr_in = (struct sockaddr_in *)&tmp_addr; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - - assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); - assert((clientfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); - - /* Bind server to ephemeral port on lo */ - memset(&serv_addr, 0, sizeof(serv_addr)); - serv_addr_in->sin_family = AF_INET; - serv_addr_in->sin_port = 0; - serv_addr_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - - assert(bind(serverfd, &serv_addr, sizeof(serv_addr)) == 0); - - sockaddr_len = sizeof(serv_addr); - assert(getsockname(serverfd, &serv_addr, &sockaddr_len) == 0); - ip = inet_ntoa(serv_addr_in->sin_addr); - printf("Server bound to: %s:%d\n", ip, ntohs(serv_addr_in->sin_port)); - - memset(&mapped_addr, 0, sizeof(mapped_addr)); - mapped_addr_in->sin_family = AF_INET; - mapped_addr_in->sin_port = htons(5555); - mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255"); - - assert(!bpf_map_update_elem(map_fd[0], &mapped_addr, &serv_addr, BPF_ANY)); - - assert(listen(serverfd, 5) == 0); - - ip = inet_ntoa(mapped_addr_in->sin_addr); - printf("Client connecting to: %s:%d\n", - ip, ntohs(mapped_addr_in->sin_port)); - assert(connect(clientfd, &mapped_addr, sizeof(mapped_addr)) == 0); - - sockaddr_len = sizeof(tmp_addr); - ip = inet_ntoa(tmp_addr_in->sin_addr); - assert((serverconnfd = accept(serverfd, &tmp_addr, &sockaddr_len)) > 0); - printf("Server received connection from: %s:%d\n", - ip, ntohs(tmp_addr_in->sin_port)); - - sockaddr_len = sizeof(tmp_addr); - assert(getpeername(clientfd, &tmp_addr, &sockaddr_len) == 0); - ip = inet_ntoa(tmp_addr_in->sin_addr); - printf("Client's peer address: %s:%d\n", - ip, ntohs(tmp_addr_in->sin_port)); - - /* Is the server's getsockname = the socket getpeername */ - assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0); - - return 0; -} diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh deleted file mode 100755 index a70d2ea90313..000000000000 --- a/samples/bpf/test_tunnel_bpf.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash -# In Namespace 0 (at_ns0) using native tunnel -# Overlay IP: 10.1.1.100 -# local 192.16.1.100 remote 192.16.1.200 -# veth0 IP: 172.16.1.100, tunnel dev <type>00 - -# Out of Namespace using BPF set/get on lwtunnel -# Overlay IP: 10.1.1.200 -# local 172.16.1.200 remote 172.16.1.100 -# veth1 IP: 172.16.1.200, tunnel dev <type>11 - -function config_device { - ip netns add at_ns0 - ip link add veth0 type veth peer name veth1 - ip link set veth0 netns at_ns0 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip link set dev veth1 up mtu 1500 - ip addr add dev veth1 172.16.1.200/24 -} - -function add_gre_tunnel { - # in namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE key 2 local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # out of namespace - ip link add dev $DEV type $TYPE key 2 external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - -function add_vxlan_tunnel { - # Set static ARP entry here because iptables set-mark works - # on L3 packet, as a result not applying to ARP packets, - # causing errors at get_tunnel_{key/opt}. - - # in namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE id 2 dstport 4789 gbp remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS address 52:54:00:d9:01:00 up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 arp -s 10.1.1.200 52:54:00:d9:02:00 - ip netns exec at_ns0 iptables -A OUTPUT -j MARK --set-mark 0x800FF - - # out of namespace - ip link add dev $DEV type $TYPE external gbp dstport 4789 - ip link set dev $DEV address 52:54:00:d9:02:00 up - ip addr add dev $DEV 10.1.1.200/24 - arp -s 10.1.1.100 52:54:00:d9:01:00 -} - -function add_geneve_tunnel { - # in namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE id 2 dstport 6081 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # out of namespace - ip link add dev $DEV type $TYPE dstport 6081 external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - -function add_ipip_tunnel { - # in namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # out of namespace - ip link add dev $DEV type $TYPE external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - -function attach_bpf { - DEV=$1 - SET_TUNNEL=$2 - GET_TUNNEL=$3 - tc qdisc add dev $DEV clsact - tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL - tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL -} - -function test_gre { - TYPE=gretap - DEV_NS=gretap00 - DEV=gretap11 - config_device - add_gre_tunnel - attach_bpf $DEV gre_set_tunnel gre_get_tunnel - ping -c 1 10.1.1.100 - ip netns exec at_ns0 ping -c 1 10.1.1.200 - cleanup -} - -function test_vxlan { - TYPE=vxlan - DEV_NS=vxlan00 - DEV=vxlan11 - config_device - add_vxlan_tunnel - attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel - ping -c 1 10.1.1.100 - ip netns exec at_ns0 ping -c 1 10.1.1.200 - cleanup -} - -function test_geneve { - TYPE=geneve - DEV_NS=geneve00 - DEV=geneve11 - config_device - add_geneve_tunnel - attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel - ping -c 1 10.1.1.100 - ip netns exec at_ns0 ping -c 1 10.1.1.200 - cleanup -} - -function test_ipip { - TYPE=ipip - DEV_NS=ipip00 - DEV=ipip11 - config_device - tcpdump -nei veth1 & - cat /sys/kernel/debug/tracing/trace_pipe & - add_ipip_tunnel - ethtool -K veth1 gso off gro off rx off tx off - ip link set dev veth1 mtu 1500 - attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel - ping -c 1 10.1.1.100 - ip netns exec at_ns0 ping -c 1 10.1.1.200 - ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null - sleep 0.2 - iperf -c 10.1.1.100 -n 5k -p 5200 - cleanup -} - -function cleanup { - set +ex - pkill iperf - ip netns delete at_ns0 - ip link del veth1 - ip link del ipip11 - ip link del gretap11 - ip link del vxlan11 - ip link del geneve11 - pkill tcpdump - pkill cat - set -ex -} - -cleanup -echo "Testing GRE tunnel..." -test_gre -echo "Testing VXLAN tunnel..." -test_vxlan -echo "Testing GENEVE tunnel..." -test_geneve -echo "Testing IPIP tunnel..." -test_ipip -echo "*** PASS ***" diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index 41b6115a32eb..0bba5fcd7d24 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -5,11 +5,11 @@ * License as published by the Free Software Foundation. */ #include <linux/ptrace.h> -#include <linux/version.h> #include <uapi/linux/bpf.h> #include <uapi/linux/bpf_perf_event.h> #include <uapi/linux/perf_event.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct key_t { char comm[TASK_COMM_LEN]; @@ -17,19 +17,19 @@ struct key_t { u32 userstack; }; -struct bpf_map_def SEC("maps") counts = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct key_t), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, 10000); +} counts SEC(".maps"); -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); #define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) #define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) @@ -37,10 +37,15 @@ struct bpf_map_def SEC("maps") stackmap = { SEC("perf_event") int bpf_prog1(struct bpf_perf_event_data *ctx) { + char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; + char time_fmt2[] = "Get Time Failed, ErrCode: %d"; + char addr_fmt[] = "Address recorded on event: %llx"; char fmt[] = "CPU-%d period %lld ip %llx"; u32 cpu = bpf_get_smp_processor_id(); + struct bpf_perf_event_value value_buf; struct key_t key; u64 *val, one = 1; + int ret; if (ctx->sample_period < 10000) /* ignore warmup */ @@ -54,6 +59,15 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) return 0; } + ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value)); + if (!ret) + bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running); + else + bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); + + if (ctx->addr != 0) + bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr); + val = bpf_map_lookup_elem(&counts, &key); if (val) (*val)++; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index 7bd827b84a67..9664749bf618 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -1,29 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <stdbool.h> #include <string.h> -#include <fcntl.h> -#include <poll.h> -#include <sys/ioctl.h> #include <linux/perf_event.h> #include <linux/bpf.h> #include <signal.h> -#include <assert.h> #include <errno.h> #include <sys/resource.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "perf-sys.h" +#include "trace_helpers.h" #define SAMPLE_FREQ 50 +static int pid; +/* counts, stackmap */ +static int map_fd[2]; +struct bpf_program *prog; static bool sys_read_seen, sys_write_seen; static void print_ksym(__u64 addr) @@ -33,10 +31,15 @@ static void print_ksym(__u64 addr) if (!addr) return; sym = ksym_search(addr); + if (!sym) { + printf("ksym not found. Is kallsyms loaded?\n"); + return; + } + printf("%s;", sym->name); - if (!strcmp(sym->name, "sys_read")) + if (!strstr(sym->name, "sys_read")) sys_read_seen = true; - else if (!strcmp(sym->name, "sys_write")) + else if (!strstr(sym->name, "sys_write")) sys_write_seen = true; } @@ -88,10 +91,10 @@ static void print_stack(struct key_t *key, __u64 count) } } -static void int_exit(int sig) +static void err_exit(int err) { - kill(0, SIGKILL); - exit(0); + kill(pid, SIGKILL); + exit(err); } static void print_stacks(void) @@ -99,7 +102,7 @@ static void print_stacks(void) struct key_t key = {}, next_key; __u64 value; __u32 stackid = 0, next_id; - int fd = map_fd[0], stack_map = map_fd[1]; + int error = 1, fd = map_fd[0], stack_map = map_fd[1]; sys_read_seen = sys_write_seen = false; while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { @@ -111,7 +114,7 @@ static void print_stacks(void) printf("\n"); if (!sys_read_seen || !sys_write_seen) { printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n"); - int_exit(0); + err_exit(error); } /* clear stack map */ @@ -121,51 +124,93 @@ static void print_stacks(void) } } +static inline int generate_load(void) +{ + if (system("dd if=/dev/zero of=/dev/null count=5000k status=none") < 0) { + printf("failed to generate some load with dd: %s\n", strerror(errno)); + return -1; + } + + return 0; +} + static void test_perf_event_all_cpu(struct perf_event_attr *attr) { - int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); - int *pmu_fd = malloc(nr_cpus * sizeof(int)); - int i, error = 0; + int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + struct bpf_link **links = calloc(nr_cpus, sizeof(struct bpf_link *)); + int i, pmu_fd, error = 1; + + if (!links) { + printf("malloc of links failed\n"); + goto err; + } + + /* system wide perf event, no need to inherit */ + attr->inherit = 0; /* open perf_event on all cpus */ for (i = 0; i < nr_cpus; i++) { - pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0); - if (pmu_fd[i] < 0) { + pmu_fd = sys_perf_event_open(attr, -1, i, -1, 0); + if (pmu_fd < 0) { printf("sys_perf_event_open failed\n"); - error = 1; goto all_cpu_err; } - assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); - assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE) == 0); + links[i] = bpf_program__attach_perf_event(prog, pmu_fd); + if (libbpf_get_error(links[i])) { + printf("bpf_program__attach_perf_event failed\n"); + links[i] = NULL; + close(pmu_fd); + goto all_cpu_err; + } } - system("dd if=/dev/zero of=/dev/null count=5000k status=none"); + + if (generate_load() < 0) + goto all_cpu_err; + print_stacks(); + error = 0; all_cpu_err: - for (i--; i >= 0; i--) { - ioctl(pmu_fd[i], PERF_EVENT_IOC_DISABLE); - close(pmu_fd[i]); - } - free(pmu_fd); + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); +err: + free(links); if (error) - int_exit(0); + err_exit(error); } static void test_perf_event_task(struct perf_event_attr *attr) { - int pmu_fd; + struct bpf_link *link = NULL; + int pmu_fd, error = 1; + + /* per task perf event, enable inherit so the "dd ..." command can be traced properly. + * Enabling inherit will cause bpf_perf_prog_read_time helper failure. + */ + attr->inherit = 1; /* open task bound event */ pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0); if (pmu_fd < 0) { printf("sys_perf_event_open failed\n"); - int_exit(0); + goto err; } - assert(ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); - assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE) == 0); - system("dd if=/dev/zero of=/dev/null count=5000k status=none"); + link = bpf_program__attach_perf_event(prog, pmu_fd); + if (libbpf_get_error(link)) { + printf("bpf_program__attach_perf_event failed\n"); + link = NULL; + close(pmu_fd); + goto err; + } + + if (generate_load() < 0) + goto err; + print_stacks(); - ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE); - close(pmu_fd); + error = 0; +err: + bpf_link__destroy(link); + if (error) + err_exit(error); } static void test_bpf_perf_event(void) @@ -175,14 +220,12 @@ static void test_bpf_perf_event(void) .freq = 1, .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .inherit = 1, }; struct perf_event_attr attr_type_sw = { .sample_freq = SAMPLE_FREQ, .freq = 1, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, - .inherit = 1, }; struct perf_event_attr attr_hw_cache_l1d = { .sample_freq = SAMPLE_FREQ, @@ -192,7 +235,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), - .inherit = 1, }; struct perf_event_attr attr_hw_cache_branch_miss = { .sample_freq = SAMPLE_FREQ, @@ -202,7 +244,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_BPU | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), - .inherit = 1, }; struct perf_event_attr attr_type_raw = { .sample_freq = SAMPLE_FREQ, @@ -210,7 +251,17 @@ static void test_bpf_perf_event(void) .type = PERF_TYPE_RAW, /* Intel Instruction Retired */ .config = 0xc0, - .inherit = 1, + }; + struct perf_event_attr attr_type_raw_lock_load = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_RAW, + /* Intel MEM_UOPS_RETIRED.LOCK_LOADS */ + .config = 0x21d0, + /* Request to record lock address from PEBS */ + .sample_type = PERF_SAMPLE_ADDR, + /* Record address value requires precise event */ + .precise_ip = 2, }; printf("Test HW_CPU_CYCLES\n"); @@ -233,36 +284,69 @@ static void test_bpf_perf_event(void) test_perf_event_all_cpu(&attr_type_raw); test_perf_event_task(&attr_type_raw); + printf("Test Lock Load\n"); + test_perf_event_all_cpu(&attr_type_raw_lock_load); + test_perf_event_task(&attr_type_raw_lock_load); + printf("*** PASS ***\n"); } int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj = NULL; char filename[256]; + int error = 1; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + signal(SIGINT, err_exit); + signal(SIGTERM, err_exit); if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); - return 1; + goto cleanup; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 2; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + printf("opening BPF object file failed\n"); + obj = NULL; + goto cleanup; } - if (fork() == 0) { + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + printf("finding a counts/stackmap map in obj file failed\n"); + goto cleanup; + } + + pid = fork(); + if (pid == 0) { read_trace_pipe(); return 0; + } else if (pid == -1) { + printf("couldn't spawn process\n"); + goto cleanup; } + test_bpf_perf_event(); - int_exit(0); - return 0; + error = 0; + +cleanup: + bpf_object__close(obj); + err_exit(error); } diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output.bpf.c index 9b96f4fb8cea..565a73b51b04 100644 --- a/samples/bpf/trace_output_kern.c +++ b/samples/bpf/trace_output.bpf.c @@ -1,16 +1,15 @@ -#include <linux/ptrace.h> +#include "vmlinux.h" #include <linux/version.h> -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = 2, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 2); +} my_map SEC(".maps"); -SEC("kprobe/sys_write") +SEC("ksyscall/write") int bpf_prog1(struct pt_regs *ctx) { struct S { diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index ccca1e348017..d316fd2c8e24 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -1,119 +1,10 @@ -/* This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ +// SPDX-License-Identifier: GPL-2.0-only #include <stdio.h> -#include <unistd.h> -#include <stdlib.h> -#include <stdbool.h> -#include <string.h> #include <fcntl.h> #include <poll.h> -#include <linux/perf_event.h> -#include <linux/bpf.h> -#include <errno.h> -#include <assert.h> -#include <sys/syscall.h> -#include <sys/ioctl.h> -#include <sys/mman.h> #include <time.h> #include <signal.h> -#include "libbpf.h" -#include "bpf_load.h" -#include "perf-sys.h" - -static int pmu_fd; - -int page_size; -int page_cnt = 8; -volatile struct perf_event_mmap_page *header; - -typedef void (*print_fn)(void *data, int size); - -static int perf_event_mmap(int fd) -{ - void *base; - int mmap_size; - - page_size = getpagesize(); - mmap_size = page_size * (page_cnt + 1); - - base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (base == MAP_FAILED) { - printf("mmap err\n"); - return -1; - } - - header = base; - return 0; -} - -static int perf_event_poll(int fd) -{ - struct pollfd pfd = { .fd = fd, .events = POLLIN }; - - return poll(&pfd, 1, 1000); -} - -struct perf_event_sample { - struct perf_event_header header; - __u32 size; - char data[]; -}; - -static void perf_event_read(print_fn fn) -{ - __u64 data_tail = header->data_tail; - __u64 data_head = header->data_head; - __u64 buffer_size = page_cnt * page_size; - void *base, *begin, *end; - char buf[256]; - - asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ - if (data_head == data_tail) - return; - - base = ((char *)header) + page_size; - - begin = base + data_tail % buffer_size; - end = base + data_head % buffer_size; - - while (begin != end) { - struct perf_event_sample *e; - - e = begin; - if (begin + e->header.size > base + buffer_size) { - long len = base + buffer_size - begin; - - assert(len < e->header.size); - memcpy(buf, begin, len); - memcpy(buf + len, base, e->header.size - len); - e = (void *) buf; - begin = base + e->header.size - len; - } else if (begin + e->header.size == base + buffer_size) { - begin = base; - } else { - begin += e->header.size; - } - - if (e->header.type == PERF_RECORD_SAMPLE) { - fn(e->data, e->size); - } else if (e->header.type == PERF_RECORD_LOST) { - struct { - struct perf_event_header header; - __u64 id; - __u64 lost; - } *lost = (void *) e; - printf("lost %lld events\n", lost->lost); - } else { - printf("unknown event type=%d size=%d\n", - e->header.type, e->header.size); - } - } - - __sync_synchronize(); /* smp_mb() */ - header->data_tail = data_head; -} +#include <bpf/libbpf.h> static __u64 time_get_ns(void) { @@ -124,12 +15,12 @@ static __u64 time_get_ns(void) } static __u64 start_time; +static __u64 cnt; #define MAX_CNT 100000ll -static void print_bpf_output(void *data, int size) +static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) { - static __u64 cnt; struct { __u64 pid; __u64 cookie; @@ -138,7 +29,7 @@ static void print_bpf_output(void *data, int size) if (e->cookie != 0x12345678) { printf("BUG pid %llx cookie %llx sized %d\n", e->pid, e->cookie, size); - kill(0, SIGINT); + return; } cnt++; @@ -146,51 +37,69 @@ static void print_bpf_output(void *data, int size) if (cnt == MAX_CNT) { printf("recv %lld events per sec\n", MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); - kill(0, SIGINT); + return; } } -static void test_bpf_perf_event(void) -{ - struct perf_event_attr attr = { - .sample_type = PERF_SAMPLE_RAW, - .type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_BPF_OUTPUT, - }; - int key = 0; - - pmu_fd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); - - assert(pmu_fd >= 0); - assert(bpf_map_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY) == 0); - ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); -} - int main(int argc, char **argv) { + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct perf_buffer *pb; + struct bpf_object *obj; + int map_fd, ret = 0; char filename[256]; FILE *f; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; } - test_bpf_perf_event(); + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } - if (perf_event_mmap(pmu_fd) < 0) + pb = perf_buffer__new(map_fd, 8, print_bpf_output, NULL, NULL, NULL); + ret = libbpf_get_error(pb); + if (ret) { + printf("failed to setup perf_buffer: %d\n", ret); return 1; + } f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r"); (void) f; start_time = time_get_ns(); - for (;;) { - perf_event_poll(pmu_fd); - perf_event_read(print_bpf_output); + while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) { } + kill(0, SIGINT); - return 0; +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return ret; } diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1.bpf.c index 107da148820f..ceedf0b1d479 100644 --- a/samples/bpf/tracex1_kern.c +++ b/samples/bpf/tracex1.bpf.c @@ -4,36 +4,35 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <uapi/linux/bpf.h> +#include "vmlinux.h" +#include "net_shared.h" #include <linux/version.h> -#include "bpf_helpers.h" - -#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_tracing.h> /* kprobe is NOT a stable ABI * kernel functions can be removed, renamed or completely change semantics. * Number of arguments and their positions can change, etc. * In such case this bpf+kprobe example will no longer be meaningful */ -SEC("kprobe/__netif_receive_skb_core") +SEC("kprobe.multi/__netif_receive_skb_core*") int bpf_prog1(struct pt_regs *ctx) { - /* attaches to kprobe netif_receive_skb, - * looks for packets on loobpack device and prints them + /* attaches to kprobe __netif_receive_skb_core, + * looks for packets on loopback device and prints them + * (wildcard is used for avoiding symbol mismatch due to optimization) */ char devname[IFNAMSIZ]; struct net_device *dev; struct sk_buff *skb; int len; - /* non-portable! works for the given kernel only */ - skb = (struct sk_buff *) PT_REGS_PARM1(ctx); - dev = _(skb->dev); - len = _(skb->len); + bpf_core_read(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)); + dev = BPF_CORE_READ(skb, dev); + len = BPF_CORE_READ(skb, len); - bpf_probe_read(devname, sizeof(devname), dev->name); + BPF_CORE_READ_STR_INTO(&devname, dev, name); if (devname[0] == 'l' && devname[1] == 'o') { char fmt[] = "skb %p len %d\n"; diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c index 31a48183beea..8c3d9043a2b6 100644 --- a/samples/bpf/tracex1_user.c +++ b/samples/bpf/tracex1_user.c @@ -1,19 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> -#include <linux/bpf.h> #include <unistd.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/libbpf.h> +#include "trace_helpers.h" int main(int ac, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + FILE *f; + + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } f = popen("taskset 1 ping -c5 localhost", "r"); @@ -21,5 +43,8 @@ int main(int ac, char **argv) read_trace_pipe(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c deleted file mode 100644 index 5e11c20ce5ec..000000000000 --- a/samples/bpf/tracex2_kern.c +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/version.h> -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" - -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; - -/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe - * example will no longer be meaningful - */ -SEC("kprobe/kfree_skb") -int bpf_prog2(struct pt_regs *ctx) -{ - long loc = 0; - long init_val = 1; - long *value; - - /* read ip of kfree_skb caller. - * non-portable version of __builtin_return_address(0) - */ - BPF_KPROBE_READ_RET_IP(loc, ctx); - - value = bpf_map_lookup_elem(&my_map, &loc); - if (value) - *value += 1; - else - bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY); - return 0; -} - -static unsigned int log2(unsigned int v) -{ - unsigned int r; - unsigned int shift; - - r = (v > 0xFFFF) << 4; v >>= r; - shift = (v > 0xFF) << 3; v >>= shift; r |= shift; - shift = (v > 0xF) << 2; v >>= shift; r |= shift; - shift = (v > 0x3) << 1; v >>= shift; r |= shift; - r |= (v >> 1); - return r; -} - -static unsigned int log2l(unsigned long v) -{ - unsigned int hi = v >> 32; - if (hi) - return log2(hi) + 32; - else - return log2(v); -} - -struct hist_key { - char comm[16]; - u64 pid_tgid; - u64 uid_gid; - u64 index; -}; - -struct bpf_map_def SEC("maps") my_hist_map = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(struct hist_key), - .value_size = sizeof(long), - .max_entries = 1024, -}; - -SEC("kprobe/sys_write") -int bpf_prog3(struct pt_regs *ctx) -{ - long write_size = PT_REGS_PARM3(ctx); - long init_val = 1; - long *value; - struct hist_key key; - - key.index = log2l(write_size); - key.pid_tgid = bpf_get_current_pid_tgid(); - key.uid_gid = bpf_get_current_uid_gid(); - bpf_get_current_comm(&key.comm, sizeof(key.comm)); - - value = bpf_map_lookup_elem(&my_hist_map, &key); - if (value) - __sync_fetch_and_add(value, 1); - else - bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY); - return 0; -} -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c deleted file mode 100644 index 7321a3f253c9..000000000000 --- a/samples/bpf/tracex2_user.c +++ /dev/null @@ -1,159 +0,0 @@ -#include <stdio.h> -#include <unistd.h> -#include <stdlib.h> -#include <signal.h> -#include <linux/bpf.h> -#include <string.h> -#include <sys/resource.h> - -#include "libbpf.h" -#include "bpf_load.h" -#include "bpf_util.h" - -#define MAX_INDEX 64 -#define MAX_STARS 38 - -static void stars(char *str, long val, long max, int width) -{ - int i; - - for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) - str[i] = '*'; - if (val > max) - str[i - 1] = '+'; - str[i] = '\0'; -} - -struct task { - char comm[16]; - __u64 pid_tgid; - __u64 uid_gid; -}; - -struct hist_key { - struct task t; - __u32 index; -}; - -#define SIZE sizeof(struct task) - -static void print_hist_for_pid(int fd, void *task) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct hist_key key = {}, next_key; - long values[nr_cpus]; - char starstr[MAX_STARS]; - long value; - long data[MAX_INDEX] = {}; - int max_ind = -1; - long max_value = 0; - int i, ind; - - while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { - if (memcmp(&next_key, task, SIZE)) { - key = next_key; - continue; - } - bpf_map_lookup_elem(fd, &next_key, values); - value = 0; - for (i = 0; i < nr_cpus; i++) - value += values[i]; - ind = next_key.index; - data[ind] = value; - if (value && ind > max_ind) - max_ind = ind; - if (value > max_value) - max_value = value; - key = next_key; - } - - printf(" syscall write() stats\n"); - printf(" byte_size : count distribution\n"); - for (i = 1; i <= max_ind + 1; i++) { - stars(starstr, data[i - 1], max_value, MAX_STARS); - printf("%8ld -> %-8ld : %-8ld |%-*s|\n", - (1l << i) >> 1, (1l << i) - 1, data[i - 1], - MAX_STARS, starstr); - } -} - -static void print_hist(int fd) -{ - struct hist_key key = {}, next_key; - static struct task tasks[1024]; - int task_cnt = 0; - int i; - - while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { - int found = 0; - - for (i = 0; i < task_cnt; i++) - if (memcmp(&tasks[i], &next_key, SIZE) == 0) - found = 1; - if (!found) - memcpy(&tasks[task_cnt++], &next_key, SIZE); - key = next_key; - } - - for (i = 0; i < task_cnt; i++) { - printf("\npid %d cmd %s uid %d\n", - (__u32) tasks[i].pid_tgid, - tasks[i].comm, - (__u32) tasks[i].uid_gid); - print_hist_for_pid(fd, &tasks[i]); - } - -} - -static void int_exit(int sig) -{ - print_hist(map_fd[1]); - exit(0); -} - -int main(int ac, char **argv) -{ - struct rlimit r = {1024*1024, RLIM_INFINITY}; - char filename[256]; - long key, next_key, value; - FILE *f; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - - /* start 'ping' in the background to have some kfree_skb events */ - f = popen("ping -c5 localhost", "r"); - (void) f; - - /* start 'dd' in the background to have plenty of 'write' syscalls */ - f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r"); - (void) f; - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - - for (i = 0; i < 5; i++) { - key = 0; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &value); - printf("location 0x%lx count %ld\n", next_key, value); - key = next_key; - } - if (key) - printf("\n"); - sleep(1); - } - print_hist(map_fd[1]); - - return 0; -} diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3.bpf.c index 9974c3d7c18b..41f37966f5f5 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3.bpf.c @@ -4,29 +4,35 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include <linux/skbuff.h> -#include <linux/netdevice.h> +#include "vmlinux.h" #include <linux/version.h> -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" - -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(u64), - .max_entries = 4096, +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct start_key { + dev_t dev; + u32 _pad; + sector_t sector; }; -/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe - * example will no longer be meaningful - */ -SEC("kprobe/blk_start_request") -int bpf_prog1(struct pt_regs *ctx) +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, u64); + __uint(max_entries, 4096); +} my_map SEC(".maps"); + +/* from /sys/kernel/tracing/events/block/block_io_start/format */ +SEC("tracepoint/block/block_io_start") +int bpf_prog1(struct trace_event_raw_block_rq *ctx) { - long rq = PT_REGS_PARM1(ctx); u64 val = bpf_ktime_get_ns(); + struct start_key key = { + .dev = ctx->dev, + .sector = ctx->sector + }; - bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY); + bpf_map_update_elem(&my_map, &key, &val, BPF_ANY); return 0; } @@ -41,28 +47,33 @@ static unsigned int log2l(unsigned long long n) #define SLOTS 100 -struct bpf_map_def SEC("maps") lat_map = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = SLOTS, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, SLOTS); +} lat_map SEC(".maps"); -SEC("kprobe/blk_account_io_completion") -int bpf_prog2(struct pt_regs *ctx) +/* from /sys/kernel/tracing/events/block/block_io_done/format */ +SEC("tracepoint/block/block_io_done") +int bpf_prog2(struct trace_event_raw_block_rq *ctx) { - long rq = PT_REGS_PARM1(ctx); + struct start_key key = { + .dev = ctx->dev, + .sector = ctx->sector + }; + u64 *value, l, base; u32 index; - value = bpf_map_lookup_elem(&my_map, &rq); + value = bpf_map_lookup_elem(&my_map, &key); if (!value) return 0; u64 cur_time = bpf_ktime_get_ns(); u64 delta = cur_time - *value; - bpf_map_delete_elem(&my_map, &rq); + bpf_map_delete_elem(&my_map, &key); /* the lines below are computing index = log10(delta)*10 * using integer arithmetic diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index fe372239d505..1002eb0323b4 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <stdio.h> #include <stdlib.h> @@ -10,15 +7,11 @@ #include <unistd.h> #include <stdbool.h> #include <string.h> -#include <linux/bpf.h> -#include <sys/resource.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "bpf_util.h" -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) - #define SLOTS 100 static void clear_stats(int fd) @@ -113,21 +106,11 @@ static void print_hist(int fd) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } + int map_fd, i, j = 0; for (i = 1; i < ac; i++) { if (strcmp(argv[i], "-a") == 0) { @@ -142,6 +125,35 @@ int main(int ac, char **argv) } } + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + printf(" heatmap of IO latency\n"); if (text_only) printf(" %s", sym[num_colors - 1]); @@ -158,9 +170,14 @@ int main(int ac, char **argv) for (i = 0; ; i++) { if (i % 20 == 0) print_banner(); - print_hist(map_fd[1]); + print_hist(map_fd); sleep(2); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4.bpf.c index 6dd8e384de96..d786492fd926 100644 --- a/samples/bpf/tracex4_kern.c +++ b/samples/bpf/tracex4.bpf.c @@ -4,22 +4,22 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include <linux/ptrace.h> +#include "vmlinux.h" #include <linux/version.h> -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct pair { u64 val; u64 ip; }; -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(struct pair), - .max_entries = 1000000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, struct pair); + __uint(max_entries, 1000000); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful @@ -33,13 +33,13 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } -SEC("kretprobe/kmem_cache_alloc_node") +SEC("kretprobe/kmem_cache_alloc_node_noprof") int bpf_prog2(struct pt_regs *ctx) { long ptr = PT_REGS_RC(ctx); long ip = 0; - /* get ip address of kmem_cache_alloc_node() caller */ + /* get ip address of kmem_cache_alloc_node_noprof() caller */ BPF_KRETPROBE_READ_RET_IP(ip, ctx); struct pair v = { diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index 22c644f1f4c3..a5145ad72cbf 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2015 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <stdio.h> #include <stdlib.h> @@ -11,11 +8,9 @@ #include <stdbool.h> #include <string.h> #include <time.h> -#include <linux/bpf.h> -#include <sys/resource.h> -#include "libbpf.h" -#include "bpf_load.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> struct pair { long long val; @@ -36,11 +31,11 @@ static void print_old_objects(int fd) __u64 key, next_key; struct pair v; - key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */ + key = write(1, "\e[1;1H\e[2J", 11); /* clear screen */ key = -1; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &v); + while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(fd, &next_key, &v); key = next_key; if (val - v.val < 1000000000ll) /* object was allocated more then 1 sec ago */ @@ -52,26 +47,50 @@ static void print_old_objects(int fd) int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - int i; + int map_fd, j = 0; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); - return 1; + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; } - for (i = 0; ; i++) { - print_old_objects(map_fd[1]); + while (1) { + print_old_objects(map_fd); sleep(1); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5.bpf.c index f57f4e1ea1ec..4d3d6c9b25fa 100644 --- a/samples/bpf/tracex5_kern.c +++ b/samples/bpf/tracex5.bpf.c @@ -4,26 +4,27 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include <linux/ptrace.h> +#include "vmlinux.h" +#include "syscall_nrs.h" #include <linux/version.h> -#include <uapi/linux/bpf.h> -#include <uapi/linux/seccomp.h> #include <uapi/linux/unistd.h> -#include "syscall_nrs.h" -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#define __stringify(x) #x #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F -struct bpf_map_def SEC("maps") progs = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); #ifdef __mips__ - .max_entries = 6000, /* MIPS n64 syscalls start at 5000 */ + __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */ #else - .max_entries = 1024, + __uint(max_entries, 1024); #endif -}; +} progs SEC(".maps"); SEC("kprobe/__seccomp_filter") int bpf_prog1(struct pt_regs *ctx) @@ -46,7 +47,7 @@ PROG(SYS__NR_write)(struct pt_regs *ctx) { struct seccomp_data sd; - bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); + bpf_core_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); if (sd.args[2] == 512) { char fmt[] = "write(fd=%d, buf=%p, size=%d)\n"; bpf_trace_printk(fmt, sizeof(fmt), @@ -59,7 +60,7 @@ PROG(SYS__NR_read)(struct pt_regs *ctx) { struct seccomp_data sd; - bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); + bpf_core_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); if (sd.args[2] > 128 && sd.args[2] <= 1024) { char fmt[] = "read(fd=%d, buf=%p, size=%d)\n"; bpf_trace_printk(fmt, sizeof(fmt), @@ -68,12 +69,25 @@ PROG(SYS__NR_read)(struct pt_regs *ctx) return 0; } +#ifdef __NR_mmap2 +PROG(SYS__NR_mmap2)(struct pt_regs *ctx) +{ + char fmt[] = "mmap2\n"; + + bpf_trace_printk(fmt, sizeof(fmt)); + return 0; +} +#endif + +#ifdef __NR_mmap PROG(SYS__NR_mmap)(struct pt_regs *ctx) { char fmt[] = "mmap\n"; + bpf_trace_printk(fmt, sizeof(fmt)); return 0; } +#endif char _license[] SEC("license") = "GPL"; u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index 36b5925bb137..7e2d8397fb98 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -1,12 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> -#include <linux/bpf.h> +#include <stdlib.h> #include <unistd.h> #include <linux/filter.h> #include <linux/seccomp.h> #include <sys/prctl.h> -#include "libbpf.h" -#include "bpf_load.h" -#include <sys/resource.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "trace_helpers.h" +#include "bpf_util.h" + +#ifdef __mips__ +#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */ +#else +#define MAX_ENTRIES 1024 +#endif /* install fake seccomp program to enable seccomp code path inside the kernel, * so that our kprobe attached to seccomp_phase1() can be triggered @@ -17,7 +25,7 @@ static void install_accept_all_seccomp(void) BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog = { - .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .len = (unsigned short)ARRAY_SIZE(filter), .filter = filter, }; if (prctl(PR_SET_SECCOMP, 2, &prog)) @@ -26,16 +34,54 @@ static void install_accept_all_seccomp(void) int main(int ac, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + int key, fd, progs_fd; + const char *section; char filename[256]; - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + FILE *f; + + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + progs_fd = bpf_object__find_map_fd_by_name(obj, "progs"); + if (progs_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); + bpf_object__for_each_program(prog, obj) { + section = bpf_program__section_name(prog); + /* register only syscalls to PROG_ARRAY */ + if (sscanf(section, "kprobe/%d", &key) != 1) + continue; - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + fd = bpf_program__fd(prog); + bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY); } install_accept_all_seccomp(); @@ -45,5 +91,8 @@ int main(int ac, char **argv) read_trace_pipe(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex6.bpf.c b/samples/bpf/tracex6.bpf.c new file mode 100644 index 000000000000..9b23b4737cfb --- /dev/null +++ b/samples/bpf/tracex6.bpf.c @@ -0,0 +1,81 @@ +#include "vmlinux.h" +#include <linux/version.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 64); +} counters SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, u64); + __uint(max_entries, 64); +} values SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct bpf_perf_event_value); + __uint(max_entries, 64); +} values2 SEC(".maps"); + +SEC("kprobe/htab_map_get_next_key") +int bpf_prog1(struct pt_regs *ctx) +{ + u32 key = bpf_get_smp_processor_id(); + u64 count, *val; + s64 error; + + count = bpf_perf_event_read(&counters, key); + error = (s64)count; + if (error <= -2 && error >= -22) + return 0; + + val = bpf_map_lookup_elem(&values, &key); + if (val) + *val = count; + else + bpf_map_update_elem(&values, &key, &count, BPF_NOEXIST); + + return 0; +} + +/* + * Since *_map_lookup_elem can't be expected to trigger bpf programs + * due to potential deadlocks (bpf_disable_instrumentation), this bpf + * program will be attached to bpf_map_copy_value (which is called + * from map_lookup_elem) and will only filter the hashtable type. + */ +SEC("kprobe/bpf_map_copy_value") +int BPF_KPROBE(bpf_prog2, struct bpf_map *map) +{ + u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *val, buf; + enum bpf_map_type type; + int error; + + type = BPF_CORE_READ(map, map_type); + if (type != BPF_MAP_TYPE_HASH) + return 0; + + error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf)); + if (error) + return 0; + + val = bpf_map_lookup_elem(&values2, &key); + if (val) + *val = buf; + else + bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c deleted file mode 100644 index e7d180305974..000000000000 --- a/samples/bpf/tracex6_kern.c +++ /dev/null @@ -1,41 +0,0 @@ -#include <linux/ptrace.h> -#include <linux/version.h> -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" - -struct bpf_map_def SEC("maps") counters = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = 64, -}; -struct bpf_map_def SEC("maps") values = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(int), - .value_size = sizeof(u64), - .max_entries = 64, -}; - -SEC("kprobe/htab_map_get_next_key") -int bpf_prog1(struct pt_regs *ctx) -{ - u32 key = bpf_get_smp_processor_id(); - u64 count, *val; - s64 error; - - count = bpf_perf_event_read(&counters, key); - error = (s64)count; - if (error <= -2 && error >= -22) - return 0; - - val = bpf_map_lookup_elem(&values, &key); - if (val) - *val = count; - else - bpf_map_update_elem(&values, &key, &count, BPF_NOEXIST); - - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index a05a99a0752f..ae811ac83bc2 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -1,27 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 #define _GNU_SOURCE #include <assert.h> #include <fcntl.h> #include <linux/perf_event.h> -#include <linux/bpf.h> #include <sched.h> #include <stdio.h> #include <stdlib.h> #include <sys/ioctl.h> -#include <sys/resource.h> #include <sys/time.h> #include <sys/types.h> #include <sys/wait.h> #include <unistd.h> -#include "bpf_load.h" -#include "libbpf.h" +#include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "perf-sys.h" #define SAMPLE_PERIOD 0x7fffffffffffffffULL +/* counters, values, values2 */ +static int map_fd[3]; + static void check_on_cpu(int cpu, struct perf_event_attr *attr) { + struct bpf_perf_event_value value2; int pmu_fd, error = 0; cpu_set_t set; __u64 value; @@ -46,8 +49,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) fprintf(stderr, "Value missing for CPU %d\n", cpu); error = 1; goto on_exit; + } else { + fprintf(stderr, "CPU %d: %llu\n", cpu, value); + } + /* The above bpf_map_lookup_elem should trigger the second kprobe */ + if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) { + fprintf(stderr, "Value2 missing for CPU %d\n", cpu); + error = 1; + goto on_exit; + } else { + fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu, + value2.counter, value2.enabled, value2.running); } - fprintf(stderr, "CPU %d: %llu\n", cpu, value); on_exit: assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error); @@ -161,17 +174,49 @@ static void test_bpf_perf_event(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + int i = 0; + + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } - setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } test_bpf_perf_event(); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c deleted file mode 100644 index 219742106bfd..000000000000 --- a/samples/bpf/xdp1_kern.c +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PLUMgrid - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#define KBUILD_MODNAME "foo" -#include <uapi/linux/bpf.h> -#include <linux/in.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/if_vlan.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include "bpf_helpers.h" - -struct bpf_map_def SEC("maps") rxcnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = 256, -}; - -static int parse_ipv4(void *data, u64 nh_off, void *data_end) -{ - struct iphdr *iph = data + nh_off; - - if (iph + 1 > data_end) - return 0; - return iph->protocol; -} - -static int parse_ipv6(void *data, u64 nh_off, void *data_end) -{ - struct ipv6hdr *ip6h = data + nh_off; - - if (ip6h + 1 > data_end) - return 0; - return ip6h->nexthdr; -} - -SEC("xdp1") -int xdp_prog1(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - int rc = XDP_DROP; - long *value; - u16 h_proto; - u64 nh_off; - u32 ipproto; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - h_proto = eth->h_proto; - - if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { - struct vlan_hdr *vhdr; - - vhdr = data + nh_off; - nh_off += sizeof(struct vlan_hdr); - if (data + nh_off > data_end) - return rc; - h_proto = vhdr->h_vlan_encapsulated_proto; - } - if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { - struct vlan_hdr *vhdr; - - vhdr = data + nh_off; - nh_off += sizeof(struct vlan_hdr); - if (data + nh_off > data_end) - return rc; - h_proto = vhdr->h_vlan_encapsulated_proto; - } - - if (h_proto == htons(ETH_P_IP)) - ipproto = parse_ipv4(data, nh_off, data_end); - else if (h_proto == htons(ETH_P_IPV6)) - ipproto = parse_ipv6(data, nh_off, data_end); - else - ipproto = 0; - - value = bpf_map_lookup_elem(&rxcnt, &ipproto); - if (value) - *value += 1; - - return rc; -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c deleted file mode 100644 index 2431c0321b71..000000000000 --- a/samples/bpf/xdp1_user.c +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2016 PLUMgrid - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <linux/bpf.h> -#include <linux/if_link.h> -#include <assert.h> -#include <errno.h> -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <libgen.h> - -#include "bpf_load.h" -#include "bpf_util.h" -#include "libbpf.h" - -static int ifindex; -static __u32 xdp_flags; - -static void int_exit(int sig) -{ - set_link_xdp_fd(ifindex, -1, xdp_flags); - exit(0); -} - -/* simple per-protocol drop counter - */ -static void poll_stats(int interval) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - const unsigned int nr_keys = 256; - __u64 values[nr_cpus], prev[nr_keys][nr_cpus]; - __u32 key; - int i; - - memset(prev, 0, sizeof(prev)); - - while (1) { - sleep(interval); - - for (key = 0; key < nr_keys; key++) { - __u64 sum = 0; - - assert(bpf_map_lookup_elem(map_fd[0], &key, values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[key][i]); - if (sum) - printf("proto %u: %10llu pkt/s\n", - key, sum / interval); - memcpy(prev[key], values, sizeof(values)); - } - } -} - -static void usage(const char *prog) -{ - fprintf(stderr, - "usage: %s [OPTS] IFINDEX\n\n" - "OPTS:\n" - " -S use skb-mode\n" - " -N enforce native mode\n", - prog); -} - -int main(int argc, char **argv) -{ - const char *optstr = "SN"; - char filename[256]; - int opt; - - while ((opt = getopt(argc, argv, optstr)) != -1) { - switch (opt) { - case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; - break; - case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; - break; - default: - usage(basename(argv[0])); - return 1; - } - } - - if (optind == argc) { - usage(basename(argv[0])); - return 1; - } - ifindex = strtoul(argv[optind], NULL, 0); - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - - if (!prog_fd[0]) { - printf("load_bpf_file: %s\n", strerror(errno)); - return 1; - } - - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - - if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { - printf("link set xdp fd failed\n"); - return 1; - } - - poll_stats(2); - - return 0; -} diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c deleted file mode 100644 index e01288867d15..000000000000 --- a/samples/bpf/xdp2_kern.c +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PLUMgrid - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#define KBUILD_MODNAME "foo" -#include <uapi/linux/bpf.h> -#include <linux/in.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/if_vlan.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include "bpf_helpers.h" - -struct bpf_map_def SEC("maps") rxcnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(long), - .max_entries = 256, -}; - -static void swap_src_dst_mac(void *data) -{ - unsigned short *p = data; - unsigned short dst[3]; - - dst[0] = p[0]; - dst[1] = p[1]; - dst[2] = p[2]; - p[0] = p[3]; - p[1] = p[4]; - p[2] = p[5]; - p[3] = dst[0]; - p[4] = dst[1]; - p[5] = dst[2]; -} - -static int parse_ipv4(void *data, u64 nh_off, void *data_end) -{ - struct iphdr *iph = data + nh_off; - - if (iph + 1 > data_end) - return 0; - return iph->protocol; -} - -static int parse_ipv6(void *data, u64 nh_off, void *data_end) -{ - struct ipv6hdr *ip6h = data + nh_off; - - if (ip6h + 1 > data_end) - return 0; - return ip6h->nexthdr; -} - -SEC("xdp1") -int xdp_prog1(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - int rc = XDP_DROP; - long *value; - u16 h_proto; - u64 nh_off; - u32 ipproto; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - h_proto = eth->h_proto; - - if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { - struct vlan_hdr *vhdr; - - vhdr = data + nh_off; - nh_off += sizeof(struct vlan_hdr); - if (data + nh_off > data_end) - return rc; - h_proto = vhdr->h_vlan_encapsulated_proto; - } - if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { - struct vlan_hdr *vhdr; - - vhdr = data + nh_off; - nh_off += sizeof(struct vlan_hdr); - if (data + nh_off > data_end) - return rc; - h_proto = vhdr->h_vlan_encapsulated_proto; - } - - if (h_proto == htons(ETH_P_IP)) - ipproto = parse_ipv4(data, nh_off, data_end); - else if (h_proto == htons(ETH_P_IPV6)) - ipproto = parse_ipv6(data, nh_off, data_end); - else - ipproto = 0; - - value = bpf_map_lookup_elem(&rxcnt, &ipproto); - if (value) - *value += 1; - - if (ipproto == IPPROTO_UDP) { - swap_src_dst_mac(data); - rc = XDP_TX; - } - - return rc; -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh new file mode 100755 index 000000000000..4bde9d066c46 --- /dev/null +++ b/samples/bpf/xdp2skb_meta.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. +# +# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load +# eBPF programs, both for XDP and clsbpf. Shell script function +# wrappers and even long options parsing is illustrated, for ease of +# use. +# +# Related to sample/bpf/xdp2skb_meta_kern.c, which contains BPF-progs +# that need to collaborate between XDP and TC hooks. Thus, it is +# convenient that the same tool load both programs that need to work +# together. +# +BPF_FILE=xdp2skb_meta_kern.o +DIR=$(dirname $0) + +[ -z "$TC" ] && TC=tc +[ -z "$IP" ] && IP=ip + +function usage() { + echo "" + echo "Usage: $0 [-vfh] --dev ethX" + echo " -d | --dev : Network device (required)" + echo " --flush : Cleanup flush TC and XDP progs" + echo " --list : (\$LIST) List TC and XDP progs" + echo " -v | --verbose : (\$VERBOSE) Verbose" + echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)" + echo "" +} + +## -- General shell logging cmds -- +function err() { + local exitcode=$1 + shift + echo "ERROR: $@" >&2 + exit $exitcode +} + +function info() { + if [[ -n "$VERBOSE" ]]; then + echo "# $@" + fi +} + +## -- Helper function calls -- + +# Wrapper call for TC and IP +# - Will display the offending command on failure +function _call_cmd() { + local cmd="$1" + local allow_fail="$2" + shift 2 + if [[ -n "$VERBOSE" ]]; then + echo "$cmd $@" + fi + if [[ -n "$DRYRUN" ]]; then + return + fi + $cmd "$@" + local status=$? + if (( $status != 0 )); then + if [[ "$allow_fail" == "" ]]; then + err 2 "Exec error($status) occurred cmd: \"$cmd $@\"" + fi + fi +} +function call_tc() { + _call_cmd "$TC" "" "$@" +} +function call_tc_allow_fail() { + _call_cmd "$TC" "allow_fail" "$@" +} +function call_ip() { + _call_cmd "$IP" "" "$@" +} + +## --- Parse command line arguments / parameters --- +# Using external program "getopt" to get --long-options +OPTIONS=$(getopt -o vfhd: \ + --long verbose,flush,help,list,dev:,dry-run -- "$@") +if (( $? != 0 )); then + err 4 "Error calling getopt" +fi +eval set -- "$OPTIONS" + +unset DEV +unset FLUSH +while true; do + case "$1" in + -d | --dev ) # device + DEV=$2 + info "Device set to: DEV=$DEV" >&2 + shift 2 + ;; + -v | --verbose) + VERBOSE=yes + # info "Verbose mode: VERBOSE=$VERBOSE" >&2 + shift + ;; + --dry-run ) + DRYRUN=yes + VERBOSE=yes + info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2 + shift + ;; + -f | --flush ) + FLUSH=yes + shift + ;; + --list ) + LIST=yes + shift + ;; + -- ) + shift + break + ;; + -h | --help ) + usage; + exit 0 + ;; + * ) + shift + break + ;; + esac +done + +FILE="$DIR/$BPF_FILE" +if [[ ! -e $FILE ]]; then + err 3 "Missing BPF object file ($FILE)" +fi + +if [[ -z $DEV ]]; then + usage + err 2 "Please specify network device -- required option --dev" +fi + +## -- Function calls -- + +function list_tc() +{ + local device="$1" + shift + info "Listing current TC ingress rules" + call_tc filter show dev $device ingress +} + +function list_xdp() +{ + local device="$1" + shift + info "Listing current XDP device($device) setting" + call_ip link show dev $device | grep --color=auto xdp +} + +function flush_tc() +{ + local device="$1" + shift + info "Flush TC on device: $device" + call_tc_allow_fail filter del dev $device ingress + call_tc_allow_fail qdisc del dev $device clsact +} + +function flush_xdp() +{ + local device="$1" + shift + info "Flush XDP on device: $device" + call_ip link set dev $device xdp off +} + +function attach_tc_mark() +{ + local device="$1" + local file="$2" + local prog="tc_mark" + shift 2 + + # Re-attach clsact to clear/flush existing role + call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null + call_tc qdisc add dev $device clsact + + # Attach BPF prog + call_tc filter add dev $device ingress \ + prio 1 handle 1 bpf da obj $file sec $prog +} + +function attach_xdp_mark() +{ + local device="$1" + local file="$2" + local prog="xdp_mark" + shift 2 + + # Remove XDP prog in-case it's already loaded + # TODO: Need ip-link option to override/replace existing XDP prog + flush_xdp $device + + # Attach XDP/BPF prog + call_ip link set dev $device xdp obj $file sec $prog +} + +if [[ -n $FLUSH ]]; then + flush_tc $DEV + flush_xdp $DEV + exit 0 +fi + +if [[ -n $LIST ]]; then + list_tc $DEV + list_xdp $DEV + exit 0 +fi + +attach_tc_mark $DEV $FILE +attach_xdp_mark $DEV $FILE diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c new file mode 100644 index 000000000000..3c36c25d9902 --- /dev/null +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. + * + * Example howto transfer info from XDP to SKB, e.g. skb->mark + * ----------------------------------------------------------- + * This uses the XDP data_meta infrastructure, and is a cooperation + * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. + * + * Notice: This example does not use the BPF C-loader, + * but instead rely on the iproute2 TC tool for loading BPF-objects. + */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/pkt_cls.h> + +#include <bpf/bpf_helpers.h> + +/* + * This struct is stored in the XDP 'data_meta' area, which is located + * just in-front-of the raw packet payload data. The meaning is + * specific to these two BPF programs that use it as a communication + * channel. XDP adjust/increase the area via a bpf-helper, and TC use + * boundary checks to see if data have been provided. + * + * The struct must be 4 byte aligned, which here is enforced by the + * struct __attribute__((aligned(4))). + */ +struct meta_info { + __u32 mark; +} __attribute__((aligned(4))); + +SEC("xdp_mark") +int _xdp_mark(struct xdp_md *ctx) +{ + struct meta_info *meta; + void *data; + int ret; + + /* Reserve space in-front of data pointer for our meta info. + * (Notice drivers not supporting data_meta will fail here!) + */ + ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); + if (ret < 0) + return XDP_ABORTED; + + /* Notice: Kernel-side verifier requires that loading of + * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(), + * as pkt-data pointers are invalidated. Helpers that require + * this are determined/marked by bpf_helper_changes_pkt_data() + */ + data = (void *)(unsigned long)ctx->data; + + /* Check data_meta have room for meta_info struct */ + meta = (void *)(unsigned long)ctx->data_meta; + if (meta + 1 > data) + return XDP_ABORTED; + + meta->mark = 42; + + return XDP_PASS; +} + +SEC("tc_mark") +int _tc_mark(struct __sk_buff *ctx) +{ + void *data = (void *)(unsigned long)ctx->data; + void *data_meta = (void *)(unsigned long)ctx->data_meta; + struct meta_info *meta = data_meta; + + /* Check XDP gave us some data_meta */ + if (meta + 1 > data) { + ctx->mark = 41; + /* Skip "accept" if no data_meta is avail */ + return TC_ACT_OK; + } + + /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */ + ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */ + + return TC_ACT_OK; +} + +/* Manually attaching these programs: +export DEV=ixgbe2 +export FILE=xdp2skb_meta_kern.o + +# via TC command +tc qdisc del dev $DEV clsact 2> /dev/null +tc qdisc add dev $DEV clsact +tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark +tc filter show dev $DEV ingress + +# XDP via IP command: +ip link set dev $DEV xdp off +ip link set dev $DEV xdp obj $FILE sec xdp_mark + +# Use iptable to "see" if SKBs are marked +iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29 +iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a + +# Hint: catch XDP_ABORTED errors via +perf record -e xdp:* +perf script + +*/ diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c new file mode 100644 index 000000000000..da67bcad1c63 --- /dev/null +++ b/samples/bpf/xdp_adjust_tail_kern.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program shows how to use bpf_xdp_adjust_tail() by + * generating ICMPv4 "packet to big" (unreachable/ df bit set frag needed + * to be more preice in case of v4)" where receiving packets bigger then + * 600 bytes. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <bpf/bpf_helpers.h> + +#define DEFAULT_TTL 64 +#define MAX_PCKT_SIZE 600 +#define ICMP_TOOBIG_SIZE 98 +#define ICMP_TOOBIG_PAYLOAD_SIZE 92 + +/* volatile to prevent compiler optimizations */ +static volatile __u32 max_pcktsz = MAX_PCKT_SIZE; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 1); +} icmpcnt SEC(".maps"); + +static __always_inline void count_icmp(void) +{ + u64 key = 0; + u64 *icmp_count; + + icmp_count = bpf_map_lookup_elem(&icmpcnt, &key); + if (icmp_count) + *icmp_count += 1; +} + +static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth) +{ + struct ethhdr *eth; + + eth = data; + memcpy(eth->h_source, orig_eth->h_dest, ETH_ALEN); + memcpy(eth->h_dest, orig_eth->h_source, ETH_ALEN); + eth->h_proto = orig_eth->h_proto; +} + +static __always_inline __u16 csum_fold_helper(__u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + return ~((csum & 0xffff) + (csum >> 16)); +} + +static __always_inline void ipv4_csum(void *data_start, int data_size, + __u32 *csum) +{ + *csum = bpf_csum_diff(0, 0, data_start, data_size, *csum); + *csum = csum_fold_helper(*csum); +} + +static __always_inline int send_icmp4_too_big(struct xdp_md *xdp) +{ + int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr); + + if (bpf_xdp_adjust_head(xdp, 0 - headroom)) + return XDP_DROP; + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + + if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end) + return XDP_DROP; + + struct iphdr *iph, *orig_iph; + struct icmphdr *icmp_hdr; + struct ethhdr *orig_eth; + __u32 csum = 0; + __u64 off = 0; + + orig_eth = data + headroom; + swap_mac(data, orig_eth); + off += sizeof(struct ethhdr); + iph = data + off; + off += sizeof(struct iphdr); + icmp_hdr = data + off; + off += sizeof(struct icmphdr); + orig_iph = data + off; + icmp_hdr->type = ICMP_DEST_UNREACH; + icmp_hdr->code = ICMP_FRAG_NEEDED; + icmp_hdr->un.frag.mtu = htons(max_pcktsz - sizeof(struct ethhdr)); + icmp_hdr->checksum = 0; + ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum); + icmp_hdr->checksum = csum; + iph->ttl = DEFAULT_TTL; + iph->daddr = orig_iph->saddr; + iph->saddr = orig_iph->daddr; + iph->version = 4; + iph->ihl = 5; + iph->protocol = IPPROTO_ICMP; + iph->tos = 0; + iph->tot_len = htons( + ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr)); + iph->check = 0; + csum = 0; + ipv4_csum(iph, sizeof(struct iphdr), &csum); + iph->check = csum; + count_icmp(); + return XDP_TX; +} + + +static __always_inline int handle_ipv4(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + int pckt_size = data_end - data; + int offset; + + if (pckt_size > max(max_pcktsz, ICMP_TOOBIG_SIZE)) { + offset = pckt_size - ICMP_TOOBIG_SIZE; + if (bpf_xdp_adjust_tail(xdp, 0 - offset)) + return XDP_PASS; + return send_icmp4_too_big(xdp); + } + return XDP_PASS; +} + +SEC("xdp_icmp") +int _xdp_icmp(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct ethhdr *eth = data; + __u16 h_proto; + + if (eth + 1 > data_end) + return XDP_DROP; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_IP)) + return handle_ipv4(xdp); + else + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c new file mode 100644 index 000000000000..e9426bd65420 --- /dev/null +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <net/if.h> +#include <arpa/inet.h> +#include <netinet/ether.h> +#include <unistd.h> +#include <time.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#define STATS_INTERVAL_S 2U +#define MAX_PCKT_SIZE 600 + +static int ifindex = -1; +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static __u32 prog_id; + +static void int_exit(int sig) +{ + __u32 curr_prog_id = 0; + + if (ifindex > -1) { + if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) { + printf("bpf_xdp_query_id failed\n"); + exit(1); + } + if (prog_id == curr_prog_id) + bpf_xdp_detach(ifindex, xdp_flags, NULL); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given iface\n"); + else + printf("program on interface changed, not removing\n"); + } + exit(0); +} + +/* simple "icmp packet too big sent" counter + */ +static void poll_stats(unsigned int map_fd, unsigned int kill_after_s) +{ + time_t started_at = time(NULL); + __u64 value = 0; + int key = 0; + + + while (!kill_after_s || time(NULL) - started_at <= kill_after_s) { + sleep(STATS_INTERVAL_S); + + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + + printf("icmp \"packet too big\" sent: %10llu pkts\n", value); + } +} + +static void usage(const char *cmd) +{ + printf("Start a XDP prog which send ICMP \"packet too big\" \n" + "messages if ingress packet is bigger then MAX_SIZE bytes\n"); + printf("Usage: %s [...]\n", cmd); + printf(" -i <ifname|ifindex> Interface\n"); + printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n"); + printf(" -P <MAX_PCKT_SIZE> Default: %u\n", MAX_PCKT_SIZE); + printf(" -S use skb-mode\n"); + printf(" -N enforce native mode\n"); + printf(" -F force loading prog\n"); + printf(" -h Display this help\n"); +} + +int main(int argc, char **argv) +{ + unsigned char opt_flags[256] = {}; + const char *optstr = "i:T:P:SNFh"; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + unsigned int kill_after_s = 0; + int i, prog_fd, map_fd, opt; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 max_pckt_size = 0; + __u32 key = 0; + char filename[256]; + int err; + + for (i = 0; i < strlen(optstr); i++) + if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z') + opt_flags[(unsigned char)optstr[i]] = 1; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + + switch (opt) { + case 'i': + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = atoi(optarg); + break; + case 'T': + kill_after_s = atoi(optarg); + break; + case 'P': + max_pckt_size = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + /* default, set below */ + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + default: + usage(argv[0]); + return 1; + } + opt_flags[opt] = 0; + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + for (i = 0; i < strlen(optstr); i++) { + if (opt_flags[(unsigned int)optstr[i]]) { + fprintf(stderr, "Missing argument -%c\n", optstr[i]); + usage(argv[0]); + return 1; + } + } + + if (!ifindex) { + fprintf(stderr, "Invalid ifname\n"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) + return 1; + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + + err = bpf_object__load(obj); + if (err) + return 1; + + prog_fd = bpf_program__fd(prog); + + /* static global var 'max_pcktsz' is accessible from .data section */ + if (max_pckt_size) { + map_fd = bpf_object__find_map_fd_by_name(obj, "xdp_adju.data"); + if (map_fd < 0) { + printf("finding a max_pcktsz map in obj file failed\n"); + return 1; + } + bpf_map_update_elem(map_fd, &key, &max_pckt_size, BPF_ANY); + } + + /* fetch icmpcnt map */ + map_fd = bpf_object__find_map_fd_by_name(obj, "icmpcnt"); + if (map_fd < 0) { + printf("finding a icmpcnt map in obj file failed\n"); + return 1; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) { + printf("link set xdp fd failed\n"); + return 1; + } + + err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len); + if (err) { + printf("can't get prog info - %s\n", strerror(errno)); + return 1; + } + prog_id = info.id; + + poll_stats(map_fd, kill_after_s); + int_exit(0); + + return 0; +} diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c new file mode 100644 index 000000000000..54c099cbd639 --- /dev/null +++ b/samples/bpf/xdp_fwd_kern.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> + +#include <bpf/bpf_helpers.h> + +#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 64); +} xdp_tx_ports SEC(".maps"); + +/* from include/net/ip.h */ +static __always_inline int ip_decrease_ttl(struct iphdr *iph) +{ + u32 check = (__force u32)iph->check; + + check += (__force u32)htons(0x0100); + iph->check = (__force __sum16)(check + (check >= 0xFFFF)); + return --iph->ttl; +} + +static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct bpf_fib_lookup fib_params; + struct ethhdr *eth = data; + struct ipv6hdr *ip6h; + struct iphdr *iph; + u16 h_proto; + u64 nh_off; + int rc; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + __builtin_memset(&fib_params, 0, sizeof(fib_params)); + + h_proto = eth->h_proto; + if (h_proto == htons(ETH_P_IP)) { + iph = data + nh_off; + + if (iph + 1 > data_end) + return XDP_DROP; + + if (iph->ttl <= 1) + return XDP_PASS; + + fib_params.family = AF_INET; + fib_params.tos = iph->tos; + fib_params.l4_protocol = iph->protocol; + fib_params.sport = 0; + fib_params.dport = 0; + fib_params.tot_len = ntohs(iph->tot_len); + fib_params.ipv4_src = iph->saddr; + fib_params.ipv4_dst = iph->daddr; + } else if (h_proto == htons(ETH_P_IPV6)) { + struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src; + struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst; + + ip6h = data + nh_off; + if (ip6h + 1 > data_end) + return XDP_DROP; + + if (ip6h->hop_limit <= 1) + return XDP_PASS; + + fib_params.family = AF_INET6; + fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; + fib_params.l4_protocol = ip6h->nexthdr; + fib_params.sport = 0; + fib_params.dport = 0; + fib_params.tot_len = ntohs(ip6h->payload_len); + *src = ip6h->saddr; + *dst = ip6h->daddr; + } else { + return XDP_PASS; + } + + fib_params.ifindex = ctx->ingress_ifindex; + + rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); + /* + * Some rc (return codes) from bpf_fib_lookup() are important, + * to understand how this XDP-prog interacts with network stack. + * + * BPF_FIB_LKUP_RET_NO_NEIGH: + * Even if route lookup was a success, then the MAC-addresses are also + * needed. This is obtained from arp/neighbour table, but if table is + * (still) empty then BPF_FIB_LKUP_RET_NO_NEIGH is returned. To avoid + * doing ARP lookup directly from XDP, then send packet to normal + * network stack via XDP_PASS and expect it will do ARP resolution. + * + * BPF_FIB_LKUP_RET_FWD_DISABLED: + * The bpf_fib_lookup respect sysctl net.ipv{4,6}.conf.all.forwarding + * setting, and will return BPF_FIB_LKUP_RET_FWD_DISABLED if not + * enabled this on ingress device. + */ + if (rc == BPF_FIB_LKUP_RET_SUCCESS) { + /* Verify egress index has been configured as TX-port. + * (Note: User can still have inserted an egress ifindex that + * doesn't support XDP xmit, which will result in packet drops). + * + * Note: lookup in devmap supported since 0cdbb4b09a0. + * If not supported will fail with: + * cannot pass map_type 14 into func bpf_map_lookup_elem#1: + */ + if (!bpf_map_lookup_elem(&xdp_tx_ports, &fib_params.ifindex)) + return XDP_PASS; + + if (h_proto == htons(ETH_P_IP)) + ip_decrease_ttl(iph); + else if (h_proto == htons(ETH_P_IPV6)) + ip6h->hop_limit--; + + memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); + memcpy(eth->h_source, fib_params.smac, ETH_ALEN); + return bpf_redirect_map(&xdp_tx_ports, fib_params.ifindex, 0); + } + + return XDP_PASS; +} + +SEC("xdp_fwd") +int xdp_fwd_prog(struct xdp_md *ctx) +{ + return xdp_fwd_flags(ctx, 0); +} + +SEC("xdp_fwd_direct") +int xdp_fwd_direct_prog(struct xdp_md *ctx) +{ + return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT); +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c new file mode 100644 index 000000000000..193b3b79b31f --- /dev/null +++ b/samples/bpf/xdp_fwd_user.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/bpf.h> +#include <linux/if_link.h> +#include <linux/limits.h> +#include <net/if.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <libgen.h> + +#include <bpf/libbpf.h> +#include <bpf/bpf.h> + +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + +static int do_attach(int idx, int prog_fd, int map_fd, const char *name) +{ + int err; + + err = bpf_xdp_attach(idx, prog_fd, xdp_flags, NULL); + if (err < 0) { + printf("ERROR: failed to attach program to %s\n", name); + return err; + } + + /* Adding ifindex as a possible egress TX port */ + err = bpf_map_update_elem(map_fd, &idx, &idx, 0); + if (err) + printf("ERROR: failed using device %s as TX-port\n", name); + + return err; +} + +static int do_detach(int ifindex, const char *ifname, const char *app_name) +{ + LIBBPF_OPTS(bpf_xdp_attach_opts, opts); + struct bpf_prog_info prog_info = {}; + char prog_name[BPF_OBJ_NAME_LEN]; + __u32 info_len, curr_prog_id; + int prog_fd; + int err = 1; + + if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) { + printf("ERROR: bpf_xdp_query_id failed (%s)\n", + strerror(errno)); + return err; + } + + if (!curr_prog_id) { + printf("ERROR: flags(0x%x) xdp prog is not attached to %s\n", + xdp_flags, ifname); + return err; + } + + info_len = sizeof(prog_info); + prog_fd = bpf_prog_get_fd_by_id(curr_prog_id); + if (prog_fd < 0) { + printf("ERROR: bpf_prog_get_fd_by_id failed (%s)\n", + strerror(errno)); + return prog_fd; + } + + err = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &info_len); + if (err) { + printf("ERROR: bpf_prog_get_info_by_fd failed (%s)\n", + strerror(errno)); + goto close_out; + } + snprintf(prog_name, sizeof(prog_name), "%s_prog", app_name); + prog_name[BPF_OBJ_NAME_LEN - 1] = '\0'; + + if (strcmp(prog_info.name, prog_name)) { + printf("ERROR: %s isn't attached to %s\n", app_name, ifname); + err = 1; + goto close_out; + } + + opts.old_prog_fd = prog_fd; + err = bpf_xdp_detach(ifindex, xdp_flags, &opts); + if (err < 0) + printf("ERROR: failed to detach program from %s (%s)\n", + ifname, strerror(errno)); + /* TODO: Remember to cleanup map, when adding use of shared map + * bpf_map_delete_elem((map_fd, &idx); + */ +close_out: + close(prog_fd); + return err; +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [OPTS] interface-list\n" + "\nOPTS:\n" + " -d detach program\n" + " -S use skb-mode\n" + " -F force loading prog\n" + " -D direct table lookups (skip fib rules)\n", + prog); +} + +int main(int argc, char **argv) +{ + const char *prog_name = "xdp_fwd"; + struct bpf_program *prog = NULL; + struct bpf_program *pos; + const char *sec_name; + int prog_fd = -1, map_fd = -1; + char filename[PATH_MAX]; + struct bpf_object *obj; + int opt, i, idx, err; + int attach = 1; + int ret = 0; + + while ((opt = getopt(argc, argv, ":dDSF")) != -1) { + switch (opt) { + case 'd': + attach = 0; + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + break; + case 'D': + prog_name = "xdp_fwd_direct"; + break; + default: + usage(basename(argv[0])); + return 1; + } + } + + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + + if (optind == argc) { + usage(basename(argv[0])); + return 1; + } + + if (attach) { + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (access(filename, O_RDONLY) < 0) { + printf("error accessing file %s: %s\n", + filename, strerror(errno)); + return 1; + } + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) + return 1; + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + + err = bpf_object__load(obj); + if (err) { + printf("Does kernel support devmap lookup?\n"); + /* If not, the error message will be: + * "cannot pass map_type 14 into func bpf_map_lookup_elem#1" + */ + return 1; + } + + bpf_object__for_each_program(pos, obj) { + sec_name = bpf_program__section_name(pos); + if (sec_name && !strcmp(sec_name, prog_name)) { + prog = pos; + break; + } + } + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + printf("program not found: %s\n", strerror(prog_fd)); + return 1; + } + map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj, + "xdp_tx_ports")); + if (map_fd < 0) { + printf("map not found: %s\n", strerror(map_fd)); + return 1; + } + } + + for (i = optind; i < argc; ++i) { + idx = if_nametoindex(argv[i]); + if (!idx) + idx = strtoul(argv[i], NULL, 0); + + if (!idx) { + fprintf(stderr, "Invalid arg\n"); + return 1; + } + if (!attach) { + err = do_detach(idx, argv[i], prog_name); + if (err) + ret = err; + } else { + err = do_attach(idx, prog_fd, map_fd, argv[i]); + if (err) + ret = err; + } + } + + return ret; +} diff --git a/samples/bpf/xdp_router_ipv4.bpf.c b/samples/bpf/xdp_router_ipv4.bpf.c new file mode 100644 index 000000000000..0643330d1d2e --- /dev/null +++ b/samples/bpf/xdp_router_ipv4.bpf.c @@ -0,0 +1,189 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + */ + +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +#define ETH_ALEN 6 +#define ETH_P_8021Q 0x8100 +#define ETH_P_8021AD 0x88A8 + +struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; +}; + +/* Key for lpm_trie */ +union key_4 { + u32 b32[2]; + u8 b8[8]; +}; + +struct arp_entry { + __be64 mac; + __be32 dst; +}; + +struct direct_map { + struct arp_entry arp; + int ifindex; + __be64 mac; +}; + +/* Map for trie implementation */ +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(struct trie_value)); + __uint(max_entries, 50); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_map SEC(".maps"); + +/* Map for ARP table */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, __be64); + __uint(max_entries, 50); +} arp_table SEC(".maps"); + +/* Map to keep the exact match entries in the route table */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, struct direct_map); + __uint(max_entries, 50); +} exact_match SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 100); +} tx_port SEC(".maps"); + +SEC("xdp") +int xdp_router_ipv4_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off = sizeof(*eth); + struct datarec *rec; + __be16 h_proto; + u32 key = 0; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (rec) + NO_TEAR_INC(rec->processed); + + if (data + nh_off > data_end) + goto drop; + + h_proto = eth->h_proto; + if (h_proto == bpf_htons(ETH_P_8021Q) || + h_proto == bpf_htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + goto drop; + + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + switch (bpf_ntohs(h_proto)) { + case ETH_P_ARP: + if (rec) + NO_TEAR_INC(rec->xdp_pass); + return XDP_PASS; + case ETH_P_IP: { + struct iphdr *iph = data + nh_off; + struct direct_map *direct_entry; + __be64 *dest_mac, *src_mac; + int forward_to; + + if (iph + 1 > data_end) + goto drop; + + direct_entry = bpf_map_lookup_elem(&exact_match, &iph->daddr); + + /* Check for exact match, this would give a faster lookup */ + if (direct_entry && direct_entry->mac && + direct_entry->arp.mac) { + src_mac = &direct_entry->mac; + dest_mac = &direct_entry->arp.mac; + forward_to = direct_entry->ifindex; + } else { + struct trie_value *prefix_value; + union key_4 key4; + + /* Look up in the trie for lpm */ + key4.b32[0] = 32; + key4.b8[4] = iph->daddr & 0xff; + key4.b8[5] = (iph->daddr >> 8) & 0xff; + key4.b8[6] = (iph->daddr >> 16) & 0xff; + key4.b8[7] = (iph->daddr >> 24) & 0xff; + + prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); + if (!prefix_value) + goto drop; + + forward_to = prefix_value->ifindex; + src_mac = &prefix_value->value; + if (!src_mac) + goto drop; + + dest_mac = bpf_map_lookup_elem(&arp_table, &iph->daddr); + if (!dest_mac) { + if (!prefix_value->gw) + goto drop; + + dest_mac = bpf_map_lookup_elem(&arp_table, + &prefix_value->gw); + if (!dest_mac) { + /* Forward the packet to the kernel in + * order to trigger ARP discovery for + * the default gw. + */ + if (rec) + NO_TEAR_INC(rec->xdp_pass); + return XDP_PASS; + } + } + } + + if (src_mac && dest_mac) { + int ret; + + __builtin_memcpy(eth->h_dest, dest_mac, ETH_ALEN); + __builtin_memcpy(eth->h_source, src_mac, ETH_ALEN); + + ret = bpf_redirect_map(&tx_port, forward_to, 0); + if (ret == XDP_REDIRECT) { + if (rec) + NO_TEAR_INC(rec->xdp_redirect); + return ret; + } + } + } + default: + break; + } +drop: + if (rec) + NO_TEAR_INC(rec->xdp_drop); + + return XDP_DROP; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c new file mode 100644 index 000000000000..266fdd0b025d --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -0,0 +1,699 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2017 Cavium, Inc. + */ +#include <linux/bpf.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include <bpf/bpf.h> +#include <arpa/inet.h> +#include <fcntl.h> +#include <poll.h> +#include <net/if.h> +#include <netdb.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include "bpf_util.h" +#include <bpf/libbpf.h> +#include <libgen.h> +#include <getopt.h> +#include <pthread.h> +#include "xdp_sample_user.h" +#include "xdp_router_ipv4.skel.h" + +static const char *__doc__ = +"XDP IPv4 router implementation\n" +"Usage: xdp_router_ipv4 <IFNAME-0> ... <IFNAME-N>\n"; + +static char buf[8192]; +static int lpm_map_fd; +static int arp_table_map_fd; +static int exact_match_map_fd; +static int tx_port_map_fd; + +static bool routes_thread_exit; +static int interval = 5; + +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_EXCEPTION_CNT; + +DEFINE_SAMPLE_INIT(xdp_router_ipv4); + +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "force", no_argument, NULL, 'F' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + { "stats", no_argument, NULL, 's' }, + {} +}; + +static int get_route_table(int rtm_family); + +static int recv_msg(struct sockaddr_nl sock_addr, int sock) +{ + struct nlmsghdr *nh; + int len, nll = 0; + char *buf_ptr; + + buf_ptr = buf; + while (1) { + len = recv(sock, buf_ptr, sizeof(buf) - nll, 0); + if (len < 0) + return len; + + nh = (struct nlmsghdr *)buf_ptr; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + buf_ptr += len; + nll += len; + if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH) + break; + + if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE) + break; + } + return nll; +} + +/* Function to parse the route entry returned by netlink + * Updates the route entry related map entries + */ +static void read_route(struct nlmsghdr *nh, int nll) +{ + char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24]; + struct bpf_lpm_trie_key_u8 *prefix_key; + struct rtattr *rt_attr; + struct rtmsg *rt_msg; + int rtm_family; + int rtl; + int i; + struct route_table { + int dst_len, iface, metric; + __be32 dst, gw; + __be64 mac; + } route; + struct arp_table { + __be64 mac; + __be32 dst; + }; + + struct direct_map { + struct arp_table arp; + int ifindex; + __be64 mac; + } direct_entry; + + memset(&route, 0, sizeof(route)); + for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { + rt_msg = (struct rtmsg *)NLMSG_DATA(nh); + rtm_family = rt_msg->rtm_family; + if (rtm_family == AF_INET) + if (rt_msg->rtm_table != RT_TABLE_MAIN) + continue; + rt_attr = (struct rtattr *)RTM_RTA(rt_msg); + rtl = RTM_PAYLOAD(nh); + + for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { + switch (rt_attr->rta_type) { + case NDA_DST: + sprintf(dsts, "%u", + (*((__be32 *)RTA_DATA(rt_attr)))); + break; + case RTA_GATEWAY: + sprintf(gws, "%u", + *((__be32 *)RTA_DATA(rt_attr))); + break; + case RTA_OIF: + sprintf(ifs, "%u", + *((int *)RTA_DATA(rt_attr))); + break; + case RTA_METRICS: + sprintf(metrics, "%u", + *((int *)RTA_DATA(rt_attr))); + default: + break; + } + } + sprintf(dsts_len, "%d", rt_msg->rtm_dst_len); + route.dst = atoi(dsts); + route.dst_len = atoi(dsts_len); + route.gw = atoi(gws); + route.iface = atoi(ifs); + route.metric = atoi(metrics); + assert(get_mac_addr(route.iface, &route.mac) == 0); + assert(bpf_map_update_elem(tx_port_map_fd, + &route.iface, &route.iface, 0) == 0); + if (rtm_family == AF_INET) { + struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; + } *prefix_value; + + prefix_key = alloca(sizeof(*prefix_key) + 4); + prefix_value = alloca(sizeof(*prefix_value)); + + prefix_key->prefixlen = 32; + prefix_key->prefixlen = route.dst_len; + direct_entry.mac = route.mac & 0xffffffffffff; + direct_entry.ifindex = route.iface; + direct_entry.arp.mac = 0; + direct_entry.arp.dst = 0; + if (route.dst_len == 32) { + if (nh->nlmsg_type == RTM_DELROUTE) { + assert(bpf_map_delete_elem(exact_match_map_fd, + &route.dst) == 0); + } else { + if (bpf_map_lookup_elem(arp_table_map_fd, + &route.dst, + &direct_entry.arp.mac) == 0) + direct_entry.arp.dst = route.dst; + assert(bpf_map_update_elem(exact_match_map_fd, + &route.dst, + &direct_entry, 0) == 0); + } + } + for (i = 0; i < 4; i++) + prefix_key->data[i] = (route.dst >> i * 8) & 0xff; + + if (bpf_map_lookup_elem(lpm_map_fd, prefix_key, + prefix_value) < 0) { + for (i = 0; i < 4; i++) + prefix_value->prefix[i] = prefix_key->data[i]; + prefix_value->value = route.mac & 0xffffffffffff; + prefix_value->ifindex = route.iface; + prefix_value->gw = route.gw; + prefix_value->metric = route.metric; + + assert(bpf_map_update_elem(lpm_map_fd, + prefix_key, + prefix_value, 0 + ) == 0); + } else { + if (nh->nlmsg_type == RTM_DELROUTE) { + assert(bpf_map_delete_elem(lpm_map_fd, + prefix_key + ) == 0); + /* Rereading the route table to check if + * there is an entry with the same + * prefix but a different metric as the + * deleted entry. + */ + get_route_table(AF_INET); + } else if (prefix_key->data[0] == + prefix_value->prefix[0] && + prefix_key->data[1] == + prefix_value->prefix[1] && + prefix_key->data[2] == + prefix_value->prefix[2] && + prefix_key->data[3] == + prefix_value->prefix[3] && + route.metric >= prefix_value->metric) { + continue; + } else { + for (i = 0; i < 4; i++) + prefix_value->prefix[i] = + prefix_key->data[i]; + prefix_value->value = + route.mac & 0xffffffffffff; + prefix_value->ifindex = route.iface; + prefix_value->gw = route.gw; + prefix_value->metric = route.metric; + assert(bpf_map_update_elem(lpm_map_fd, + prefix_key, + prefix_value, + 0) == 0); + } + } + } + memset(&route, 0, sizeof(route)); + memset(dsts, 0, sizeof(dsts)); + memset(dsts_len, 0, sizeof(dsts_len)); + memset(gws, 0, sizeof(gws)); + memset(ifs, 0, sizeof(ifs)); + memset(&route, 0, sizeof(route)); + } +} + +/* Function to read the existing route table when the process is launched*/ +static int get_route_table(int rtm_family) +{ + struct sockaddr_nl sa; + struct nlmsghdr *nh; + int sock, seq = 0; + struct msghdr msg; + struct iovec iov; + int ret = 0; + int nll; + + struct { + struct nlmsghdr nl; + struct rtmsg rt; + char buf[8192]; + } req; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return -errno; + } + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + ret = -errno; + goto cleanup; + } + memset(&req, 0, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.nl.nlmsg_type = RTM_GETROUTE; + + req.rt.rtm_family = rtm_family; + req.rt.rtm_table = RT_TABLE_MAIN; + req.nl.nlmsg_pid = 0; + req.nl.nlmsg_seq = ++seq; + memset(&msg, 0, sizeof(msg)); + iov.iov_base = (void *)&req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = sendmsg(sock, &msg, 0); + if (ret < 0) { + fprintf(stderr, "send to netlink: %s\n", strerror(errno)); + ret = -errno; + goto cleanup; + } + memset(buf, 0, sizeof(buf)); + nll = recv_msg(sa, sock); + if (nll < 0) { + fprintf(stderr, "recv from netlink: %s\n", strerror(nll)); + ret = nll; + goto cleanup; + } + nh = (struct nlmsghdr *)buf; + read_route(nh, nll); +cleanup: + close(sock); + return ret; +} + +/* Function to parse the arp entry returned by netlink + * Updates the arp entry related map entries + */ +static void read_arp(struct nlmsghdr *nh, int nll) +{ + struct rtattr *rt_attr; + char dsts[24], mac[24]; + struct ndmsg *rt_msg; + int rtl, ndm_family; + + struct arp_table { + __be64 mac; + __be32 dst; + } arp_entry; + struct direct_map { + struct arp_table arp; + int ifindex; + __be64 mac; + } direct_entry; + + for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { + rt_msg = (struct ndmsg *)NLMSG_DATA(nh); + rt_attr = (struct rtattr *)RTM_RTA(rt_msg); + ndm_family = rt_msg->ndm_family; + rtl = RTM_PAYLOAD(nh); + for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { + switch (rt_attr->rta_type) { + case NDA_DST: + sprintf(dsts, "%u", + *((__be32 *)RTA_DATA(rt_attr))); + break; + case NDA_LLADDR: + sprintf(mac, "%lld", + *((__be64 *)RTA_DATA(rt_attr))); + break; + default: + break; + } + } + arp_entry.dst = atoi(dsts); + arp_entry.mac = atol(mac); + + if (ndm_family == AF_INET) { + if (bpf_map_lookup_elem(exact_match_map_fd, + &arp_entry.dst, + &direct_entry) == 0) { + if (nh->nlmsg_type == RTM_DELNEIGH) { + direct_entry.arp.dst = 0; + direct_entry.arp.mac = 0; + } else if (nh->nlmsg_type == RTM_NEWNEIGH) { + direct_entry.arp.dst = arp_entry.dst; + direct_entry.arp.mac = arp_entry.mac; + } + assert(bpf_map_update_elem(exact_match_map_fd, + &arp_entry.dst, + &direct_entry, 0 + ) == 0); + memset(&direct_entry, 0, sizeof(direct_entry)); + } + if (nh->nlmsg_type == RTM_DELNEIGH) { + assert(bpf_map_delete_elem(arp_table_map_fd, + &arp_entry.dst) == 0); + } else if (nh->nlmsg_type == RTM_NEWNEIGH) { + assert(bpf_map_update_elem(arp_table_map_fd, + &arp_entry.dst, + &arp_entry.mac, 0 + ) == 0); + } + } + memset(&arp_entry, 0, sizeof(arp_entry)); + memset(dsts, 0, sizeof(dsts)); + } +} + +/* Function to read the existing arp table when the process is launched*/ +static int get_arp_table(int rtm_family) +{ + struct sockaddr_nl sa; + struct nlmsghdr *nh; + int sock, seq = 0; + struct msghdr msg; + struct iovec iov; + int ret = 0; + int nll; + struct { + struct nlmsghdr nl; + struct ndmsg rt; + char buf[8192]; + } req; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return -errno; + } + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + ret = -errno; + goto cleanup; + } + memset(&req, 0, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.nl.nlmsg_type = RTM_GETNEIGH; + req.rt.ndm_state = NUD_REACHABLE; + req.rt.ndm_family = rtm_family; + req.nl.nlmsg_pid = 0; + req.nl.nlmsg_seq = ++seq; + memset(&msg, 0, sizeof(msg)); + iov.iov_base = (void *)&req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = sendmsg(sock, &msg, 0); + if (ret < 0) { + fprintf(stderr, "send to netlink: %s\n", strerror(errno)); + ret = -errno; + goto cleanup; + } + memset(buf, 0, sizeof(buf)); + nll = recv_msg(sa, sock); + if (nll < 0) { + fprintf(stderr, "recv from netlink: %s\n", strerror(nll)); + ret = nll; + goto cleanup; + } + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); +cleanup: + close(sock); + return ret; +} + +/* Function to keep track and update changes in route and arp table + * Give regular statistics of packets forwarded + */ +static void *monitor_routes_thread(void *arg) +{ + struct pollfd fds_route, fds_arp; + struct sockaddr_nl la, lr; + int sock, sock_arp, nll; + struct nlmsghdr *nh; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return NULL; + } + + fcntl(sock, F_SETFL, O_NONBLOCK); + memset(&lr, 0, sizeof(lr)); + lr.nl_family = AF_NETLINK; + lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY; + if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) { + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + close(sock); + return NULL; + } + + fds_route.fd = sock; + fds_route.events = POLL_IN; + + sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock_arp < 0) { + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + close(sock); + return NULL; + } + + fcntl(sock_arp, F_SETFL, O_NONBLOCK); + memset(&la, 0, sizeof(la)); + la.nl_family = AF_NETLINK; + la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY; + if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) { + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + goto cleanup; + } + + fds_arp.fd = sock_arp; + fds_arp.events = POLL_IN; + + /* dump route and arp tables */ + if (get_arp_table(AF_INET) < 0) { + fprintf(stderr, "Failed reading arp table\n"); + goto cleanup; + } + + if (get_route_table(AF_INET) < 0) { + fprintf(stderr, "Failed reading route table\n"); + goto cleanup; + } + + while (!routes_thread_exit) { + memset(buf, 0, sizeof(buf)); + if (poll(&fds_route, 1, 3) == POLL_IN) { + nll = recv_msg(lr, sock); + if (nll < 0) { + fprintf(stderr, "recv from netlink: %s\n", + strerror(nll)); + goto cleanup; + } + + nh = (struct nlmsghdr *)buf; + read_route(nh, nll); + } + + memset(buf, 0, sizeof(buf)); + if (poll(&fds_arp, 1, 3) == POLL_IN) { + nll = recv_msg(la, sock_arp); + if (nll < 0) { + fprintf(stderr, "recv from netlink: %s\n", + strerror(nll)); + goto cleanup; + } + + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); + } + + sleep(interval); + } + +cleanup: + close(sock_arp); + close(sock); + return NULL; +} + +static void usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error, + struct bpf_object *obj) +{ + sample_usage(argv, long_options, doc, mask, error); +} + +int main(int argc, char **argv) +{ + bool error = true, generic = false, force = false; + int opt, ret = EXIT_FAIL_BPF; + struct xdp_router_ipv4 *skel; + int i, total_ifindex = argc - 1; + char **ifname_list = argv + 1; + pthread_t routes_thread; + int longindex = 0; + + if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) { + fprintf(stderr, "Failed to set libbpf strict mode: %s\n", + strerror(errno)); + goto end; + } + + skel = xdp_router_ipv4__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_router_ipv4__open: %s\n", + strerror(errno)); + goto end; + } + + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", + strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + ret = xdp_router_ipv4__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_router_ipv4__load: %s\n", + strerror(errno)); + goto end_destroy; + } + + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + + while ((opt = getopt_long(argc, argv, "si:SFvh", + long_options, &longindex)) != -1) { + switch (opt) { + case 's': + mask |= SAMPLE_REDIRECT_MAP_CNT; + total_ifindex--; + ifname_list++; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + total_ifindex -= 2; + ifname_list += 2; + break; + case 'S': + generic = true; + total_ifindex--; + ifname_list++; + break; + case 'F': + force = true; + total_ifindex--; + ifname_list++; + break; + case 'v': + sample_switch_mode(); + total_ifindex--; + ifname_list++; + break; + case 'h': + error = false; + default: + usage(argv, long_options, __doc__, mask, error, skel->obj); + goto end_destroy; + } + } + + ret = EXIT_FAIL_OPTION; + if (optind == argc) { + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_destroy; + } + + lpm_map_fd = bpf_map__fd(skel->maps.lpm_map); + if (lpm_map_fd < 0) { + fprintf(stderr, "Failed loading lpm_map %s\n", + strerror(-lpm_map_fd)); + goto end_destroy; + } + arp_table_map_fd = bpf_map__fd(skel->maps.arp_table); + if (arp_table_map_fd < 0) { + fprintf(stderr, "Failed loading arp_table_map_fd %s\n", + strerror(-arp_table_map_fd)); + goto end_destroy; + } + exact_match_map_fd = bpf_map__fd(skel->maps.exact_match); + if (exact_match_map_fd < 0) { + fprintf(stderr, "Failed loading exact_match_map_fd %s\n", + strerror(-exact_match_map_fd)); + goto end_destroy; + } + tx_port_map_fd = bpf_map__fd(skel->maps.tx_port); + if (tx_port_map_fd < 0) { + fprintf(stderr, "Failed loading tx_port_map_fd %s\n", + strerror(-tx_port_map_fd)); + goto end_destroy; + } + + ret = EXIT_FAIL_XDP; + for (i = 0; i < total_ifindex; i++) { + int index = if_nametoindex(ifname_list[i]); + + if (!index) { + fprintf(stderr, "Interface %s not found %s\n", + ifname_list[i], strerror(-tx_port_map_fd)); + goto end_destroy; + } + if (sample_install_xdp(skel->progs.xdp_router_ipv4_prog, + index, generic, force) < 0) + goto end_destroy; + } + + ret = pthread_create(&routes_thread, NULL, monitor_routes_thread, NULL); + if (ret) { + fprintf(stderr, "Failed creating routes_thread: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + + ret = sample_run(interval, NULL, NULL); + routes_thread_exit = true; + + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_thread_wait; + } + ret = EXIT_OK; + +end_thread_wait: + pthread_join(routes_thread, NULL); +end_destroy: + xdp_router_ipv4__destroy(skel); +end: + sample_exit(ret); +} diff --git a/samples/bpf/xdp_sample.bpf.c b/samples/bpf/xdp_sample.bpf.c new file mode 100644 index 000000000000..0eb7e1dcae22 --- /dev/null +++ b/samples/bpf/xdp_sample.bpf.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 +/* GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ +#include "xdp_sample.bpf.h" + +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +array_map rx_cnt SEC(".maps"); +array_map redir_err_cnt SEC(".maps"); +array_map cpumap_enqueue_cnt SEC(".maps"); +array_map cpumap_kthread_cnt SEC(".maps"); +array_map exception_cnt SEC(".maps"); +array_map devmap_xmit_cnt SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 32 * 32); + __type(key, u64); + __type(value, struct datarec); +} devmap_xmit_cnt_multi SEC(".maps"); + +const volatile int nr_cpus = 0; + +/* These can be set before loading so that redundant comparisons can be DCE'd by + * the verifier, and only actual matches are tried after loading tp_btf program. + * This allows sample to filter tracepoint stats based on net_device. + */ +const volatile int from_match[32] = {}; +const volatile int to_match[32] = {}; + +int cpumap_map_id = 0; + +/* Find if b is part of set a, but if a is empty set then evaluate to true */ +#define IN_SET(a, b) \ + ({ \ + bool __res = !(a)[0]; \ + for (int i = 0; i < ARRAY_SIZE(a) && (a)[i]; i++) { \ + __res = (a)[i] == (b); \ + if (__res) \ + break; \ + } \ + __res; \ + }) + +static __always_inline __u32 xdp_get_err_key(int err) +{ + switch (err) { + case 0: + return 0; + case -EINVAL: + return 2; + case -ENETDOWN: + return 3; + case -EMSGSIZE: + return 4; + case -EOPNOTSUPP: + return 5; + case -ENOSPC: + return 6; + default: + return 1; + } +} + +static __always_inline int xdp_redirect_collect_stat(int from, int err) +{ + u32 cpu = bpf_get_smp_processor_id(); + u32 key = XDP_REDIRECT_ERROR; + struct datarec *rec; + u32 idx; + + if (!IN_SET(from_match, from)) + return 0; + + key = xdp_get_err_key(err); + + idx = key * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&redir_err_cnt, &idx); + if (!rec) + return 0; + if (key) + NO_TEAR_INC(rec->dropped); + else + NO_TEAR_INC(rec->processed); + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in-practice as stopping perf-record also unload this + * bpf_prog. Plus, there is additional overhead of doing so. + */ +} + +SEC("tp_btf/xdp_redirect_err") +int BPF_PROG(tp_xdp_redirect_err, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect_map_err") +int BPF_PROG(tp_xdp_redirect_map_err, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect") +int BPF_PROG(tp_xdp_redirect, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect_map") +int BPF_PROG(tp_xdp_redirect_map, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_cpumap_enqueue") +int BPF_PROG(tp_xdp_cpumap_enqueue, int map_id, unsigned int processed, + unsigned int drops, int to_cpu) +{ + u32 cpu = bpf_get_smp_processor_id(); + struct datarec *rec; + u32 idx; + + if (cpumap_map_id && cpumap_map_id != map_id) + return 0; + + idx = to_cpu * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &idx); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, processed); + NO_TEAR_ADD(rec->dropped, drops); + /* Record bulk events, then userspace can calc average bulk size */ + if (processed > 0) + NO_TEAR_INC(rec->issue); + /* Inception: It's possible to detect overload situations, via + * this tracepoint. This can be used for creating a feedback + * loop to XDP, which can take appropriate actions to mitigate + * this overload situation. + */ + return 0; +} + +SEC("tp_btf/xdp_cpumap_kthread") +int BPF_PROG(tp_xdp_cpumap_kthread, int map_id, unsigned int processed, + unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats) +{ + struct datarec *rec; + u32 cpu; + + if (cpumap_map_id && cpumap_map_id != map_id) + return 0; + + cpu = bpf_get_smp_processor_id(); + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &cpu); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, processed); + NO_TEAR_ADD(rec->dropped, drops); + NO_TEAR_ADD(rec->xdp_pass, xdp_stats->pass); + NO_TEAR_ADD(rec->xdp_drop, xdp_stats->drop); + NO_TEAR_ADD(rec->xdp_redirect, xdp_stats->redirect); + /* Count times kthread yielded CPU via schedule call */ + if (sched) + NO_TEAR_INC(rec->issue); + return 0; +} + +SEC("tp_btf/xdp_exception") +int BPF_PROG(tp_xdp_exception, const struct net_device *dev, + const struct bpf_prog *xdp, u32 act) +{ + u32 cpu = bpf_get_smp_processor_id(); + struct datarec *rec; + u32 key = act, idx; + + if (!IN_SET(from_match, dev->ifindex)) + return 0; + if (!IN_SET(to_match, dev->ifindex)) + return 0; + + if (key > XDP_REDIRECT) + key = XDP_REDIRECT + 1; + + idx = key * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&exception_cnt, &idx); + if (!rec) + return 0; + NO_TEAR_INC(rec->dropped); + + return 0; +} + +SEC("tp_btf/xdp_devmap_xmit") +int BPF_PROG(tp_xdp_devmap_xmit, const struct net_device *from_dev, + const struct net_device *to_dev, int sent, int drops, int err) +{ + struct datarec *rec; + int idx_in, idx_out; + u32 cpu; + + idx_in = from_dev->ifindex; + idx_out = to_dev->ifindex; + + if (!IN_SET(from_match, idx_in)) + return 0; + if (!IN_SET(to_match, idx_out)) + return 0; + + cpu = bpf_get_smp_processor_id(); + rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &cpu); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, sent); + NO_TEAR_ADD(rec->dropped, drops); + /* Record bulk events, then userspace can calc average bulk size */ + NO_TEAR_INC(rec->info); + /* Record error cases, where no frame were sent */ + /* Catch API error of drv ndo_xdp_xmit sent more than count */ + if (err || drops < 0) + NO_TEAR_INC(rec->issue); + return 0; +} + +SEC("tp_btf/xdp_devmap_xmit") +int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device *from_dev, + const struct net_device *to_dev, int sent, int drops, int err) +{ + struct datarec empty = {}; + struct datarec *rec; + int idx_in, idx_out; + u64 idx; + + idx_in = from_dev->ifindex; + idx_out = to_dev->ifindex; + idx = idx_in; + idx = idx << 32 | idx_out; + + if (!IN_SET(from_match, idx_in)) + return 0; + if (!IN_SET(to_match, idx_out)) + return 0; + + bpf_map_update_elem(&devmap_xmit_cnt_multi, &idx, &empty, BPF_NOEXIST); + rec = bpf_map_lookup_elem(&devmap_xmit_cnt_multi, &idx); + if (!rec) + return 0; + + NO_TEAR_ADD(rec->processed, sent); + NO_TEAR_ADD(rec->dropped, drops); + NO_TEAR_INC(rec->info); + if (err || drops < 0) + NO_TEAR_INC(rec->issue); + return 0; +} diff --git a/samples/bpf/xdp_sample.bpf.h b/samples/bpf/xdp_sample.bpf.h new file mode 100644 index 000000000000..fecc41c5df04 --- /dev/null +++ b/samples/bpf/xdp_sample.bpf.h @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _XDP_SAMPLE_BPF_H +#define _XDP_SAMPLE_BPF_H + +#include "vmlinux.h" +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +#include "net_shared.h" +#include "xdp_sample_shared.h" + +#define EINVAL 22 +#define ENETDOWN 100 +#define EMSGSIZE 90 +#define EOPNOTSUPP 95 +#define ENOSPC 28 + +typedef struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, unsigned int); + __type(value, struct datarec); +} array_map; + +extern array_map rx_cnt; +extern const volatile int nr_cpus; + +enum { + XDP_REDIRECT_SUCCESS = 0, + XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +/* + * Note: including linux/compiler.h or linux/kernel.h for the macros below + * conflicts with vmlinux.h include in BPF files, so we define them here. + * + * Following functions are taken from kernel sources and + * break aliasing rules in their original form. + * + * While kernel is compiled with -fno-strict-aliasing, + * perf uses -Wstrict-aliasing=3 which makes build fail + * under gcc 4.4. + * + * Using extra __may_alias__ type to allow aliasing + * in this case. + */ +typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; +typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; +typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; +typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; + +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; + case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; + case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; + case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; + default: + asm volatile ("" : : : "memory"); + __builtin_memcpy((void *)res, (const void *)p, size); + asm volatile ("" : : : "memory"); + } +} + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; + case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; + case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; + case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; + default: + asm volatile ("" : : : "memory"); + __builtin_memcpy((void *)p, (const void *)res, size); + asm volatile ("" : : : "memory"); + } +} + +#define READ_ONCE(x) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +/* Add a value using relaxed read and relaxed write. Less expensive than + * fetch_add when there is no write concurrency. + */ +#define NO_TEAR_ADD(x, val) WRITE_ONCE((x), READ_ONCE(x) + (val)) +#define NO_TEAR_INC(x) NO_TEAR_ADD((x), 1) + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#endif diff --git a/samples/bpf/xdp_sample_shared.h b/samples/bpf/xdp_sample_shared.h new file mode 100644 index 000000000000..8a7669a5d563 --- /dev/null +++ b/samples/bpf/xdp_sample_shared.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef _XDP_SAMPLE_SHARED_H +#define _XDP_SAMPLE_SHARED_H + +struct datarec { + size_t processed; + size_t dropped; + size_t issue; + union { + size_t xdp_pass; + size_t info; + }; + size_t xdp_drop; + size_t xdp_redirect; +} __attribute__((aligned(64))); + +#endif diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c new file mode 100644 index 000000000000..158682852162 --- /dev/null +++ b/samples/bpf/xdp_sample_user.c @@ -0,0 +1,1673 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <linux/ethtool.h> +#include <linux/hashtable.h> +#include <linux/if_link.h> +#include <linux/jhash.h> +#include <linux/limits.h> +#include <linux/list.h> +#include <linux/sockios.h> +#include <locale.h> +#include <math.h> +#include <net/if.h> +#include <poll.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/signalfd.h> +#include <sys/sysinfo.h> +#include <sys/timerfd.h> +#include <sys/utsname.h> +#include <time.h> +#include <unistd.h> + +#include "bpf_util.h" +#include "xdp_sample_user.h" + +#define __sample_print(fmt, cond, ...) \ + ({ \ + if (cond) \ + printf(fmt, ##__VA_ARGS__); \ + }) + +#define print_always(fmt, ...) __sample_print(fmt, 1, ##__VA_ARGS__) +#define print_default(fmt, ...) \ + __sample_print(fmt, sample_log_level & LL_DEFAULT, ##__VA_ARGS__) +#define __print_err(err, fmt, ...) \ + ({ \ + __sample_print(fmt, err > 0 || sample_log_level & LL_DEFAULT, \ + ##__VA_ARGS__); \ + sample_err_exp = sample_err_exp ? true : err > 0; \ + }) +#define print_err(err, fmt, ...) __print_err(err, fmt, ##__VA_ARGS__) + +#define __COLUMN(x) "%'10" x " %-13s" +#define FMT_COLUMNf __COLUMN(".0f") +#define FMT_COLUMNd __COLUMN("d") +#define FMT_COLUMNl __COLUMN("llu") +#define RX(rx) rx, "rx/s" +#define PPS(pps) pps, "pkt/s" +#define DROP(drop) drop, "drop/s" +#define ERR(err) err, "error/s" +#define HITS(hits) hits, "hit/s" +#define XMIT(xmit) xmit, "xmit/s" +#define PASS(pass) pass, "pass/s" +#define REDIR(redir) redir, "redir/s" +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ + +#define XDP_UNKNOWN (XDP_REDIRECT + 1) +#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) +#define XDP_REDIRECT_ERR_MAX 7 + +enum map_type { + MAP_RX, + MAP_REDIRECT_ERR, + MAP_CPUMAP_ENQUEUE, + MAP_CPUMAP_KTHREAD, + MAP_EXCEPTION, + MAP_DEVMAP_XMIT, + MAP_DEVMAP_XMIT_MULTI, + NUM_MAP, +}; + +enum log_level { + LL_DEFAULT = 1U << 0, + LL_SIMPLE = 1U << 1, + LL_DEBUG = 1U << 2, +}; + +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; + +struct map_entry { + struct hlist_node node; + __u64 pair; + struct record val; +}; + +struct stats_record { + struct record rx_cnt; + struct record redir_err[XDP_REDIRECT_ERR_MAX]; + struct record kthread; + struct record exception[XDP_ACTION_MAX]; + struct record devmap_xmit; + DECLARE_HASHTABLE(xmit_map, 5); + struct record enq[]; +}; + +struct sample_output { + struct { + __u64 rx; + __u64 redir; + __u64 drop; + __u64 drop_xmit; + __u64 err; + __u64 xmit; + } totals; + struct { + union { + __u64 pps; + __u64 num; + }; + __u64 drop; + __u64 err; + } rx_cnt; + struct { + __u64 suc; + __u64 err; + } redir_cnt; + struct { + __u64 hits; + } except_cnt; + struct { + __u64 pps; + __u64 drop; + __u64 err; + double bavg; + } xmit_cnt; +}; + +struct xdp_desc { + int ifindex; + __u32 prog_id; + int flags; +} sample_xdp_progs[32]; + +struct datarec *sample_mmap[NUM_MAP]; +struct bpf_map *sample_map[NUM_MAP]; +size_t sample_map_count[NUM_MAP]; +enum log_level sample_log_level; +struct sample_output sample_out; +unsigned long sample_interval; +bool sample_err_exp; +int sample_xdp_cnt; +int sample_n_cpus; +int sample_sig_fd; +int sample_mask; + +static const char *xdp_redirect_err_names[XDP_REDIRECT_ERR_MAX] = { + /* Key=1 keeps unknown errors */ + "Success", + "Unknown", + "EINVAL", + "ENETDOWN", + "EMSGSIZE", + "EOPNOTSUPP", + "ENOSPC", +}; + +/* Keyed from Unknown */ +static const char *xdp_redirect_err_help[XDP_REDIRECT_ERR_MAX - 1] = { + "Unknown error", + "Invalid redirection", + "Device being redirected to is down", + "Packet length too large for device", + "Operation not supported", + "No space in ptr_ring of cpumap kthread", +}; + +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", + [XDP_UNKNOWN] = "XDP_UNKNOWN", +}; + +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + return UINT64_MAX; + } + return (__u64)t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} + +static void sample_print_help(int mask) +{ + printf("Output format description\n\n" + "By default, redirect success statistics are disabled, use -s to enable.\n" + "The terse output mode is default, verbose mode can be activated using -v\n" + "Use SIGQUIT (Ctrl + \\) to switch the mode dynamically at runtime\n\n" + "Terse mode displays at most the following fields:\n" + " rx/s Number of packets received per second\n" + " redir/s Number of packets successfully redirected per second\n" + " err,drop/s Aggregated count of errors per second (including dropped packets)\n" + " xmit/s Number of packets transmitted on the output device per second\n\n" + "Output description for verbose mode:\n" + " FIELD DESCRIPTION\n"); + + if (mask & SAMPLE_RX_CNT) { + printf(" receive\t\tDisplays the number of packets received & errors encountered\n" + " \t\t\tWhenever an error or packet drop occurs, details of per CPU error\n" + " \t\t\tand drop statistics will be expanded inline in terse mode.\n" + " \t\t\t\tpkt/s - Packets received per second\n" + " \t\t\t\tdrop/s - Packets dropped per second\n" + " \t\t\t\terror/s - Errors encountered per second\n\n"); + } + if (mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) { + printf(" redirect\t\tDisplays the number of packets successfully redirected\n" + " \t\t\tErrors encountered are expanded under redirect_err field\n" + " \t\t\tNote that passing -s to enable it has a per packet overhead\n" + " \t\t\t\tredir/s - Packets redirected successfully per second\n\n" + " redirect_err\t\tDisplays the number of packets that failed redirection\n" + " \t\t\tThe errno is expanded under this field with per CPU count\n" + " \t\t\tThe recognized errors are:\n"); + + for (int i = 2; i < XDP_REDIRECT_ERR_MAX; i++) + printf("\t\t\t %s: %s\n", xdp_redirect_err_names[i], + xdp_redirect_err_help[i - 1]); + + printf(" \n\t\t\t\terror/s - Packets that failed redirection per second\n\n"); + } + + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) { + printf(" enqueue to cpu N\tDisplays the number of packets enqueued to bulk queue of CPU N\n" + " \t\t\tExpands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N\n" + " \t\t\tReceived packets can be associated with the CPU redirect program is enqueuing \n" + " \t\t\tpackets to.\n" + " \t\t\t\tpkt/s - Packets enqueued per second from other CPU to CPU N\n" + " \t\t\t\tdrop/s - Packets dropped when trying to enqueue to CPU N\n" + " \t\t\t\tbulk-avg - Average number of packets processed for each event\n\n"); + } + + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + printf(" kthread\t\tDisplays the number of packets processed in CPUMAP kthread for each CPU\n" + " \t\t\tPackets consumed from ptr_ring in kthread, and its xdp_stats (after calling \n" + " \t\t\tCPUMAP bpf prog) are expanded below this. xdp_stats are expanded as a total and\n" + " \t\t\tthen per-CPU to associate it to each CPU's pinned CPUMAP kthread.\n" + " \t\t\t\tpkt/s - Packets consumed per second from ptr_ring\n" + " \t\t\t\tdrop/s - Packets dropped per second in kthread\n" + " \t\t\t\tsched - Number of times kthread called schedule()\n\n" + " \t\t\txdp_stats (also expands to per-CPU counts)\n" + " \t\t\t\tpass/s - XDP_PASS count for CPUMAP program execution\n" + " \t\t\t\tdrop/s - XDP_DROP count for CPUMAP program execution\n" + " \t\t\t\tredir/s - XDP_REDIRECT count for CPUMAP program execution\n\n"); + } + + if (mask & SAMPLE_EXCEPTION_CNT) { + printf(" xdp_exception\t\tDisplays xdp_exception tracepoint events\n" + " \t\t\tThis can occur due to internal driver errors, unrecognized\n" + " \t\t\tXDP actions and due to explicit user trigger by use of XDP_ABORTED\n" + " \t\t\tEach action is expanded below this field with its count\n" + " \t\t\t\thit/s - Number of times the tracepoint was hit per second\n\n"); + } + + if (mask & SAMPLE_DEVMAP_XMIT_CNT) { + printf(" devmap_xmit\t\tDisplays devmap_xmit tracepoint events\n" + " \t\t\tThis tracepoint is invoked for successful transmissions on output\n" + " \t\t\tdevice but these statistics are not available for generic XDP mode,\n" + " \t\t\thence they will be omitted from the output when using SKB mode\n" + " \t\t\t\txmit/s - Number of packets that were transmitted per second\n" + " \t\t\t\tdrop/s - Number of packets that failed transmissions per second\n" + " \t\t\t\tdrv_err/s - Number of internal driver errors per second\n" + " \t\t\t\tbulk-avg - Average number of packets processed for each event\n\n"); + } +} + +void sample_usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error) +{ + int i; + + if (!error) + sample_print_help(mask); + + printf("\n%s\nOption for %s:\n", doc, argv[0]); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-15s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value: %d)", + *long_options[i].flag); + else + printf("\t short-option: -%c", long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = libbpf_num_possible_cpus(); + struct datarec *array; + + array = calloc(nr_cpus, sizeof(*array)); + if (!array) { + fprintf(stderr, "Failed to allocate memory (nr_cpus: %u)\n", + nr_cpus); + return NULL; + } + return array; +} + +static int map_entry_init(struct map_entry *e, __u64 pair) +{ + e->pair = pair; + INIT_HLIST_NODE(&e->node); + e->val.timestamp = gettime(); + e->val.cpu = alloc_record_per_cpu(); + if (!e->val.cpu) + return -ENOMEM; + return 0; +} + +static void map_collect_percpu(struct datarec *values, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = libbpf_num_possible_cpus(); + __u64 sum_xdp_redirect = 0; + __u64 sum_processed = 0; + __u64 sum_xdp_pass = 0; + __u64 sum_xdp_drop = 0; + __u64 sum_dropped = 0; + __u64 sum_issue = 0; + int i; + + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = READ_ONCE(values[i].processed); + rec->cpu[i].dropped = READ_ONCE(values[i].dropped); + rec->cpu[i].issue = READ_ONCE(values[i].issue); + rec->cpu[i].xdp_pass = READ_ONCE(values[i].xdp_pass); + rec->cpu[i].xdp_drop = READ_ONCE(values[i].xdp_drop); + rec->cpu[i].xdp_redirect = READ_ONCE(values[i].xdp_redirect); + + sum_processed += rec->cpu[i].processed; + sum_dropped += rec->cpu[i].dropped; + sum_issue += rec->cpu[i].issue; + sum_xdp_pass += rec->cpu[i].xdp_pass; + sum_xdp_drop += rec->cpu[i].xdp_drop; + sum_xdp_redirect += rec->cpu[i].xdp_redirect; + } + + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.issue = sum_issue; + rec->total.xdp_pass = sum_xdp_pass; + rec->total.xdp_drop = sum_xdp_drop; + rec->total.xdp_redirect = sum_xdp_redirect; +} + +static int map_collect_percpu_devmap(int map_fd, struct stats_record *rec) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u32 batch, count = 32; + struct datarec *values; + bool init = false; + __u64 *keys; + int i, ret; + + keys = calloc(count, sizeof(__u64)); + if (!keys) + return -ENOMEM; + values = calloc(count * nr_cpus, sizeof(struct datarec)); + if (!values) { + free(keys); + return -ENOMEM; + } + + for (;;) { + bool exit = false; + + ret = bpf_map_lookup_batch(map_fd, init ? &batch : NULL, &batch, + keys, values, &count, NULL); + if (ret < 0 && errno != ENOENT) + break; + if (errno == ENOENT) + exit = true; + + init = true; + for (i = 0; i < count; i++) { + struct map_entry *e, *x = NULL; + __u64 pair = keys[i]; + struct datarec *arr; + + arr = &values[i * nr_cpus]; + hash_for_each_possible(rec->xmit_map, e, node, pair) { + if (e->pair == pair) { + x = e; + break; + } + } + if (!x) { + x = calloc(1, sizeof(*x)); + if (!x) + goto cleanup; + if (map_entry_init(x, pair) < 0) { + free(x); + goto cleanup; + } + hash_add(rec->xmit_map, &x->node, pair); + } + map_collect_percpu(arr, &x->val); + } + + if (exit) + break; + count = 32; + } + + free(values); + free(keys); + return 0; +cleanup: + free(values); + free(keys); + return -ENOMEM; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int i; + + rec = calloc(1, sizeof(*rec) + sample_n_cpus * sizeof(struct record)); + if (!rec) { + fprintf(stderr, "Failed to allocate memory\n"); + return NULL; + } + + if (sample_mask & SAMPLE_RX_CNT) { + rec->rx_cnt.cpu = alloc_record_per_cpu(); + if (!rec->rx_cnt.cpu) { + fprintf(stderr, + "Failed to allocate rx_cnt per-CPU array\n"); + goto end_rec; + } + } + if (sample_mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) { + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) { + rec->redir_err[i].cpu = alloc_record_per_cpu(); + if (!rec->redir_err[i].cpu) { + fprintf(stderr, + "Failed to allocate redir_err per-CPU array for " + "\"%s\" case\n", + xdp_redirect_err_names[i]); + while (i--) + free(rec->redir_err[i].cpu); + goto end_rx_cnt; + } + } + } + if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + rec->kthread.cpu = alloc_record_per_cpu(); + if (!rec->kthread.cpu) { + fprintf(stderr, + "Failed to allocate kthread per-CPU array\n"); + goto end_redir; + } + } + if (sample_mask & SAMPLE_EXCEPTION_CNT) { + for (i = 0; i < XDP_ACTION_MAX; i++) { + rec->exception[i].cpu = alloc_record_per_cpu(); + if (!rec->exception[i].cpu) { + fprintf(stderr, + "Failed to allocate exception per-CPU array for " + "\"%s\" case\n", + action2str(i)); + while (i--) + free(rec->exception[i].cpu); + goto end_kthread; + } + } + } + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) { + rec->devmap_xmit.cpu = alloc_record_per_cpu(); + if (!rec->devmap_xmit.cpu) { + fprintf(stderr, + "Failed to allocate devmap_xmit per-CPU array\n"); + goto end_exception; + } + } + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + hash_init(rec->xmit_map); + if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) { + for (i = 0; i < sample_n_cpus; i++) { + rec->enq[i].cpu = alloc_record_per_cpu(); + if (!rec->enq[i].cpu) { + fprintf(stderr, + "Failed to allocate enqueue per-CPU array for " + "CPU %d\n", + i); + while (i--) + free(rec->enq[i].cpu); + goto end_devmap_xmit; + } + } + } + + return rec; + +end_devmap_xmit: + free(rec->devmap_xmit.cpu); +end_exception: + for (i = 0; i < XDP_ACTION_MAX; i++) + free(rec->exception[i].cpu); +end_kthread: + free(rec->kthread.cpu); +end_redir: + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) + free(rec->redir_err[i].cpu); +end_rx_cnt: + free(rec->rx_cnt.cpu); +end_rec: + free(rec); + return NULL; +} + +static void free_stats_record(struct stats_record *r) +{ + struct hlist_node *tmp; + struct map_entry *e; + int i; + + for (i = 0; i < sample_n_cpus; i++) + free(r->enq[i].cpu); + hash_for_each_safe(r->xmit_map, i, tmp, e, node) { + hash_del(&e->node); + free(e->val.cpu); + free(e); + } + free(r->devmap_xmit.cpu); + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->exception[i].cpu); + free(r->kthread.cpu); + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) + free(r->redir_err[i].cpu); + free(r->rx_cnt.cpu); + free(r); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double)period / NANOSEC_PER_SEC); + + return period_; +} + +static double sample_round(double val) +{ + if (val - floor(val) < 0.5) + return floor(val); + return ceil(val); +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->dropped - p->dropped; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_info_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->info - p->info; + pps = sample_round(packets / period_); + } + return pps; +} + +static void calc_xdp_pps(struct datarec *r, struct datarec *p, double *xdp_pass, + double *xdp_drop, double *xdp_redirect, double period_) +{ + *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0; + if (period_ > 0) { + *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_; + *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_; + *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_; + } +} + +static void stats_get_rx_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, struct sample_output *out) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i; + + rec = &stats_rec->rx_cnt; + prev = &stats_prev->rx_cnt; + t = calc_period(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PPS(pps), DROP(drop), ERR(err)); + } + + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + out->rx_cnt.pps = pps; + out->rx_cnt.drop = drop; + out->rx_cnt.err = err; + out->totals.rx += pps; + out->totals.drop += drop; + out->totals.err += err; + } +} + +static void stats_get_cpumap_enqueue(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i, to_cpu; + + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < sample_n_cpus; to_cpu++) { + rec = &stats_rec->enq[to_cpu]; + prev = &stats_prev->enq[to_cpu]; + t = calc_period(rec, prev); + + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + if (pps > 0 || drop > 0) { + char str[64]; + + snprintf(str, sizeof(str), "enqueue to cpu %d", to_cpu); + + if (err > 0) + err = pps / err; /* calc average bulk size */ + + print_err(drop, + " %-20s " FMT_COLUMNf FMT_COLUMNf __COLUMN( + ".2f") "\n", + str, PPS(pps), DROP(drop), err, "bulk-avg"); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d->%d", i, to_cpu); + if (err > 0) + err = pps / err; /* calc average bulk size */ + print_default( + " %-18s " FMT_COLUMNf FMT_COLUMNf __COLUMN( + ".2f") "\n", + str, PPS(pps), DROP(drop), err, "bulk-avg"); + } + } +} + +static void stats_get_cpumap_remote(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + double xdp_pass, xdp_drop, xdp_redirect; + struct record *rec, *prev; + double t; + int i; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = calc_period(rec, prev); + + calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop, + &xdp_redirect, t); + if (xdp_pass || xdp_drop || xdp_redirect) { + print_err(xdp_drop, + " %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", + "xdp_stats", PASS(xdp_pass), DROP(xdp_drop), + REDIR(xdp_redirect)); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + calc_xdp_pps(r, p, &xdp_pass, &xdp_drop, &xdp_redirect, t); + if (!xdp_pass && !xdp_drop && !xdp_redirect) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PASS(xdp_pass), DROP(xdp_drop), + REDIR(xdp_redirect)); + } +} + +static void stats_get_cpumap_kthread(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = calc_period(rec, prev); + + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + print_err(drop, " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", + pps ? "kthread total" : "kthread", PPS(pps), DROP(drop), err, + "sched"); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PPS(pps), DROP(drop), err, "sched"); + } +} + +static void stats_get_redirect_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + struct record *rec, *prev; + double t, pps; + int i; + + rec = &stats_rec->redir_err[0]; + prev = &stats_prev->redir_err[0]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + if (!pps) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf "\n", str, REDIR(pps)); + } + + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + out->redir_cnt.suc = pps; + out->totals.redir += pps; + } +} + +static void stats_get_redirect_err_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + struct record *rec, *prev; + double t, drop, sum = 0; + int rec_i, i; + + for (rec_i = 1; rec_i < XDP_REDIRECT_ERR_MAX; rec_i++) { + char str[64]; + + rec = &stats_rec->redir_err[rec_i]; + prev = &stats_prev->redir_err[rec_i]; + t = calc_period(rec, prev); + + drop = calc_drop_pps(&rec->total, &prev->total, t); + if (drop > 0 && !out) { + snprintf(str, sizeof(str), + sample_log_level & LL_DEFAULT ? "%s total" : + "%s", + xdp_redirect_err_names[rec_i]); + print_err(drop, " %-18s " FMT_COLUMNf "\n", str, + ERR(drop)); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + double drop; + + drop = calc_drop_pps(r, p, t); + if (!drop) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s" FMT_COLUMNf "\n", str, + ERR(drop)); + } + + sum += drop; + } + + if (out) { + out->redir_cnt.err = sum; + out->totals.err += sum; + } +} + +static void stats_get_exception_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + double t, drop, sum = 0; + struct record *rec, *prev; + int rec_i, i; + + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + rec = &stats_rec->exception[rec_i]; + prev = &stats_prev->exception[rec_i]; + t = calc_period(rec, prev); + + drop = calc_drop_pps(&rec->total, &prev->total, t); + /* Fold out errors after heading */ + sum += drop; + + if (drop > 0 && !out) { + print_always(" %-18s " FMT_COLUMNf "\n", + action2str(rec_i), ERR(drop)); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + double drop; + + drop = calc_drop_pps(r, p, t); + if (!drop) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s" FMT_COLUMNf "\n", + str, ERR(drop)); + } + } + } + + if (out) { + out->except_cnt.hits = sum; + out->totals.err += sum; + } +} + +static void stats_get_devmap_xmit(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + double pps, drop, info, err; + struct record *rec, *prev; + double t; + int i; + + rec = &stats_rec->devmap_xmit; + prev = &stats_prev->devmap_xmit; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + info = calc_info_pps(r, p, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", + str, XMIT(pps), DROP(drop), err, "drv_err/s", + info, "bulk-avg"); + } + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + info = calc_info_pps(&rec->total, &prev->total, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + err = calc_errs_pps(&rec->total, &prev->total, t); + + out->xmit_cnt.pps = pps; + out->xmit_cnt.drop = drop; + out->xmit_cnt.bavg = info; + out->xmit_cnt.err = err; + out->totals.xmit += pps; + out->totals.drop_xmit += drop; + out->totals.err += err; + } +} + +static void stats_get_devmap_xmit_multi(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out, + bool xmit_total) +{ + double pps, drop, info, err; + struct map_entry *entry; + struct record *r, *p; + double t; + int bkt; + + hash_for_each(stats_rec->xmit_map, bkt, entry, node) { + struct map_entry *e, *x = NULL; + char ifname_from[IFNAMSIZ]; + char ifname_to[IFNAMSIZ]; + const char *fstr, *tstr; + unsigned long prev_time; + struct record beg = {}; + __u32 from_idx, to_idx; + char str[128]; + __u64 pair; + int i; + + prev_time = sample_interval * NANOSEC_PER_SEC; + + pair = entry->pair; + from_idx = pair >> 32; + to_idx = pair & 0xFFFFFFFF; + + r = &entry->val; + beg.timestamp = r->timestamp - prev_time; + + /* Find matching entry from stats_prev map */ + hash_for_each_possible(stats_prev->xmit_map, e, node, pair) { + if (e->pair == pair) { + x = e; + break; + } + } + if (x) + p = &x->val; + else + p = &beg; + t = calc_period(r, p); + pps = calc_pps(&r->total, &p->total, t); + drop = calc_drop_pps(&r->total, &p->total, t); + info = calc_info_pps(&r->total, &p->total, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + err = calc_errs_pps(&r->total, &p->total, t); + + if (out) { + /* We are responsible for filling out totals */ + out->totals.xmit += pps; + out->totals.drop_xmit += drop; + out->totals.err += err; + continue; + } + + fstr = tstr = NULL; + if (if_indextoname(from_idx, ifname_from)) + fstr = ifname_from; + if (if_indextoname(to_idx, ifname_to)) + tstr = ifname_to; + + snprintf(str, sizeof(str), "xmit %s->%s", fstr ?: "?", + tstr ?: "?"); + /* Skip idle streams of redirection */ + if (pps || drop || err) { + print_err(drop, + " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", str, XMIT(pps), DROP(drop), + err, "drv_err/s", info, "bulk-avg"); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *rc = &r->cpu[i]; + struct datarec *pc, p_beg = {}; + char str[64]; + + pc = p == &beg ? &p_beg : &p->cpu[i]; + + pps = calc_pps(rc, pc, t); + drop = calc_drop_pps(rc, pc, t); + err = calc_errs_pps(rc, pc, t); + + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + info = calc_info_pps(rc, pc, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + + print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", str, XMIT(pps), + DROP(drop), err, "drv_err/s", info, "bulk-avg"); + } + } +} + +static void stats_print(const char *prefix, int mask, struct stats_record *r, + struct stats_record *p, struct sample_output *out) +{ + int nr_cpus = libbpf_num_possible_cpus(); + const char *str; + + print_always("%-23s", prefix ?: "Summary"); + if (mask & SAMPLE_RX_CNT) + print_always(FMT_COLUMNl, RX(out->totals.rx)); + if (mask & SAMPLE_REDIRECT_CNT) + print_always(FMT_COLUMNl, REDIR(out->totals.redir)); + printf(FMT_COLUMNl, + out->totals.err + out->totals.drop + out->totals.drop_xmit, + "err,drop/s"); + if (mask & SAMPLE_DEVMAP_XMIT_CNT || + mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + printf(FMT_COLUMNl, XMIT(out->totals.xmit)); + printf("\n"); + + if (mask & SAMPLE_RX_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->rx_cnt.pps ? + "receive total" : + "receive"; + print_err((out->rx_cnt.err || out->rx_cnt.drop), + " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl "\n", + str, PPS(out->rx_cnt.pps), DROP(out->rx_cnt.drop), + ERR(out->rx_cnt.err)); + + stats_get_rx_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) + stats_get_cpumap_enqueue(r, p, nr_cpus); + + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + stats_get_cpumap_kthread(r, p, nr_cpus); + stats_get_cpumap_remote(r, p, nr_cpus); + } + + if (mask & SAMPLE_REDIRECT_CNT) { + str = out->redir_cnt.suc ? "redirect total" : "redirect"; + print_default(" %-20s " FMT_COLUMNl "\n", str, + REDIR(out->redir_cnt.suc)); + + stats_get_redirect_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_REDIRECT_ERR_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->redir_cnt.err ? + "redirect_err total" : + "redirect_err"; + print_err(out->redir_cnt.err, " %-20s " FMT_COLUMNl "\n", str, + ERR(out->redir_cnt.err)); + + stats_get_redirect_err_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_EXCEPTION_CNT) { + str = out->except_cnt.hits ? "xdp_exception total" : + "xdp_exception"; + + print_err(out->except_cnt.hits, " %-20s " FMT_COLUMNl "\n", str, + HITS(out->except_cnt.hits)); + + stats_get_exception_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_DEVMAP_XMIT_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->xmit_cnt.pps ? + "devmap_xmit total" : + "devmap_xmit"; + + print_err(out->xmit_cnt.err || out->xmit_cnt.drop, + " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl + __COLUMN(".2f") "\n", + str, XMIT(out->xmit_cnt.pps), + DROP(out->xmit_cnt.drop), out->xmit_cnt.err, + "drv_err/s", out->xmit_cnt.bavg, "bulk-avg"); + + stats_get_devmap_xmit(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + stats_get_devmap_xmit_multi(r, p, nr_cpus, NULL, + mask & SAMPLE_DEVMAP_XMIT_CNT); + + if (sample_log_level & LL_DEFAULT || + ((sample_log_level & LL_SIMPLE) && sample_err_exp)) { + sample_err_exp = false; + printf("\n"); + } +} + +int sample_setup_maps(struct bpf_map **maps) +{ + sample_n_cpus = libbpf_num_possible_cpus(); + + for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) { + sample_map[i] = maps[i]; + + switch (i) { + case MAP_RX: + case MAP_CPUMAP_KTHREAD: + case MAP_DEVMAP_XMIT: + sample_map_count[i] = sample_n_cpus; + break; + case MAP_REDIRECT_ERR: + sample_map_count[i] = + XDP_REDIRECT_ERR_MAX * sample_n_cpus; + break; + case MAP_EXCEPTION: + sample_map_count[i] = XDP_ACTION_MAX * sample_n_cpus; + case MAP_CPUMAP_ENQUEUE: + sample_map_count[i] = sample_n_cpus * sample_n_cpus; + break; + default: + return -EINVAL; + } + if (bpf_map__set_max_entries(sample_map[i], sample_map_count[i]) < 0) + return -errno; + } + sample_map[MAP_DEVMAP_XMIT_MULTI] = maps[MAP_DEVMAP_XMIT_MULTI]; + return 0; +} + +static int sample_setup_maps_mappings(void) +{ + for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) { + size_t size = sample_map_count[i] * sizeof(struct datarec); + + sample_mmap[i] = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, bpf_map__fd(sample_map[i]), 0); + if (sample_mmap[i] == MAP_FAILED) + return -errno; + } + return 0; +} + +int __sample_init(int mask) +{ + sigset_t st; + + sigemptyset(&st); + sigaddset(&st, SIGQUIT); + sigaddset(&st, SIGINT); + sigaddset(&st, SIGTERM); + + if (sigprocmask(SIG_BLOCK, &st, NULL) < 0) + return -errno; + + sample_sig_fd = signalfd(-1, &st, SFD_CLOEXEC | SFD_NONBLOCK); + if (sample_sig_fd < 0) + return -errno; + + sample_mask = mask; + + return sample_setup_maps_mappings(); +} + +static int __sample_remove_xdp(int ifindex, __u32 prog_id, int xdp_flags) +{ + __u32 cur_prog_id = 0; + int ret; + + if (prog_id) { + ret = bpf_xdp_query_id(ifindex, xdp_flags, &cur_prog_id); + if (ret < 0) + return -errno; + + if (prog_id != cur_prog_id) { + print_always( + "Program on ifindex %d does not match installed " + "program, skipping unload\n", + ifindex); + return -ENOENT; + } + } + + return bpf_xdp_detach(ifindex, xdp_flags, NULL); +} + +int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic, + bool force) +{ + int ret, xdp_flags = 0; + __u32 prog_id = 0; + + if (sample_xdp_cnt == 32) { + fprintf(stderr, + "Total limit for installed XDP programs in a sample reached\n"); + return -ENOTSUP; + } + + xdp_flags |= !force ? XDP_FLAGS_UPDATE_IF_NOEXIST : 0; + xdp_flags |= generic ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; + ret = bpf_xdp_attach(ifindex, bpf_program__fd(xdp_prog), xdp_flags, NULL); + if (ret < 0) { + ret = -errno; + fprintf(stderr, + "Failed to install program \"%s\" on ifindex %d, mode = %s, " + "force = %s: %s\n", + bpf_program__name(xdp_prog), ifindex, + generic ? "skb" : "native", force ? "true" : "false", + strerror(-ret)); + return ret; + } + + ret = bpf_xdp_query_id(ifindex, xdp_flags, &prog_id); + if (ret < 0) { + ret = -errno; + fprintf(stderr, + "Failed to get XDP program id for ifindex %d, removing program: %s\n", + ifindex, strerror(errno)); + __sample_remove_xdp(ifindex, 0, xdp_flags); + return ret; + } + sample_xdp_progs[sample_xdp_cnt++] = + (struct xdp_desc){ ifindex, prog_id, xdp_flags }; + + return 0; +} + +static void sample_summary_print(void) +{ + double num = sample_out.rx_cnt.num; + + if (sample_out.totals.rx) { + double pkts = sample_out.totals.rx; + + print_always(" Packets received : %'-10llu\n", + sample_out.totals.rx); + print_always(" Average packets/s : %'-10.0f\n", + sample_round(pkts / num)); + } + if (sample_out.totals.redir) { + double pkts = sample_out.totals.redir; + + print_always(" Packets redirected : %'-10llu\n", + sample_out.totals.redir); + print_always(" Average redir/s : %'-10.0f\n", + sample_round(pkts / num)); + } + if (sample_out.totals.drop) + print_always(" Rx dropped : %'-10llu\n", + sample_out.totals.drop); + if (sample_out.totals.drop_xmit) + print_always(" Tx dropped : %'-10llu\n", + sample_out.totals.drop_xmit); + if (sample_out.totals.err) + print_always(" Errors recorded : %'-10llu\n", + sample_out.totals.err); + if (sample_out.totals.xmit) { + double pkts = sample_out.totals.xmit; + + print_always(" Packets transmitted : %'-10llu\n", + sample_out.totals.xmit); + print_always(" Average transmit/s : %'-10.0f\n", + sample_round(pkts / num)); + } +} + +void sample_exit(int status) +{ + size_t size; + + for (int i = 0; i < NUM_MAP; i++) { + size = sample_map_count[i] * sizeof(**sample_mmap); + munmap(sample_mmap[i], size); + } + while (sample_xdp_cnt--) { + int i = sample_xdp_cnt, ifindex, xdp_flags; + __u32 prog_id; + + prog_id = sample_xdp_progs[i].prog_id; + ifindex = sample_xdp_progs[i].ifindex; + xdp_flags = sample_xdp_progs[i].flags; + + __sample_remove_xdp(ifindex, prog_id, xdp_flags); + } + sample_summary_print(); + close(sample_sig_fd); + exit(status); +} + +static int sample_stats_collect(struct stats_record *rec) +{ + int i; + + if (sample_mask & SAMPLE_RX_CNT) + map_collect_percpu(sample_mmap[MAP_RX], &rec->rx_cnt); + + if (sample_mask & SAMPLE_REDIRECT_CNT) + map_collect_percpu(sample_mmap[MAP_REDIRECT_ERR], &rec->redir_err[0]); + + if (sample_mask & SAMPLE_REDIRECT_ERR_CNT) { + for (i = 1; i < XDP_REDIRECT_ERR_MAX; i++) + map_collect_percpu(&sample_mmap[MAP_REDIRECT_ERR][i * sample_n_cpus], + &rec->redir_err[i]); + } + + if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) + for (i = 0; i < sample_n_cpus; i++) + map_collect_percpu(&sample_mmap[MAP_CPUMAP_ENQUEUE][i * sample_n_cpus], + &rec->enq[i]); + + if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) + map_collect_percpu(sample_mmap[MAP_CPUMAP_KTHREAD], + &rec->kthread); + + if (sample_mask & SAMPLE_EXCEPTION_CNT) + for (i = 0; i < XDP_ACTION_MAX; i++) + map_collect_percpu(&sample_mmap[MAP_EXCEPTION][i * sample_n_cpus], + &rec->exception[i]); + + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) + map_collect_percpu(sample_mmap[MAP_DEVMAP_XMIT], &rec->devmap_xmit); + + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) { + if (map_collect_percpu_devmap(bpf_map__fd(sample_map[MAP_DEVMAP_XMIT_MULTI]), rec) < 0) + return -EINVAL; + } + return 0; +} + +static void sample_summary_update(struct sample_output *out) +{ + sample_out.totals.rx += out->totals.rx; + sample_out.totals.redir += out->totals.redir; + sample_out.totals.drop += out->totals.drop; + sample_out.totals.drop_xmit += out->totals.drop_xmit; + sample_out.totals.err += out->totals.err; + sample_out.totals.xmit += out->totals.xmit; + sample_out.rx_cnt.num++; +} + +static void sample_stats_print(int mask, struct stats_record *cur, + struct stats_record *prev, char *prog_name) +{ + struct sample_output out = {}; + + if (mask & SAMPLE_RX_CNT) + stats_get_rx_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_REDIRECT_CNT) + stats_get_redirect_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_REDIRECT_ERR_CNT) + stats_get_redirect_err_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_EXCEPTION_CNT) + stats_get_exception_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_DEVMAP_XMIT_CNT) + stats_get_devmap_xmit(cur, prev, 0, &out); + else if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + stats_get_devmap_xmit_multi(cur, prev, 0, &out, + mask & SAMPLE_DEVMAP_XMIT_CNT); + sample_summary_update(&out); + + stats_print(prog_name, mask, cur, prev, &out); +} + +void sample_switch_mode(void) +{ + sample_log_level ^= LL_DEBUG - 1; +} + +static int sample_signal_cb(void) +{ + struct signalfd_siginfo si; + int r; + + r = read(sample_sig_fd, &si, sizeof(si)); + if (r < 0) + return -errno; + + switch (si.ssi_signo) { + case SIGQUIT: + sample_switch_mode(); + printf("\n"); + break; + default: + printf("\n"); + return 1; + } + + return 0; +} + +/* Pointer swap trick */ +static void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static int sample_timer_cb(int timerfd, struct stats_record **rec, + struct stats_record **prev) +{ + char line[64] = "Summary"; + int ret; + __u64 t; + + ret = read(timerfd, &t, sizeof(t)); + if (ret < 0) + return -errno; + + swap(prev, rec); + ret = sample_stats_collect(*rec); + if (ret < 0) + return ret; + + if (sample_xdp_cnt == 2 && !(sample_mask & SAMPLE_SKIP_HEADING)) { + char fi[IFNAMSIZ]; + char to[IFNAMSIZ]; + const char *f, *t; + + f = t = NULL; + if (if_indextoname(sample_xdp_progs[0].ifindex, fi)) + f = fi; + if (if_indextoname(sample_xdp_progs[1].ifindex, to)) + t = to; + + snprintf(line, sizeof(line), "%s->%s", f ?: "?", t ?: "?"); + } + + sample_stats_print(sample_mask, *rec, *prev, line); + return 0; +} + +int sample_run(int interval, void (*post_cb)(void *), void *ctx) +{ + struct timespec ts = { interval, 0 }; + struct itimerspec its = { ts, ts }; + struct stats_record *rec, *prev; + struct pollfd pfd[2] = {}; + int timerfd, ret; + + if (!interval) { + fprintf(stderr, "Incorrect interval 0\n"); + return -EINVAL; + } + sample_interval = interval; + /* Pretty print numbers */ + setlocale(LC_NUMERIC, "en_US.UTF-8"); + + timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); + if (timerfd < 0) + return -errno; + timerfd_settime(timerfd, 0, &its, NULL); + + pfd[0].fd = sample_sig_fd; + pfd[0].events = POLLIN; + + pfd[1].fd = timerfd; + pfd[1].events = POLLIN; + + ret = -ENOMEM; + rec = alloc_stats_record(); + if (!rec) + goto end; + prev = alloc_stats_record(); + if (!prev) + goto end_rec; + + ret = sample_stats_collect(rec); + if (ret < 0) + goto end_rec_prev; + + for (;;) { + ret = poll(pfd, 2, -1); + if (ret < 0) { + if (errno == EINTR) + continue; + else + break; + } + + if (pfd[0].revents & POLLIN) + ret = sample_signal_cb(); + else if (pfd[1].revents & POLLIN) + ret = sample_timer_cb(timerfd, &rec, &prev); + + if (ret) + break; + + if (post_cb) + post_cb(ctx); + } + +end_rec_prev: + free_stats_record(prev); +end_rec: + free_stats_record(rec); +end: + close(timerfd); + + return ret; +} + +const char *get_driver_name(int ifindex) +{ + struct ethtool_drvinfo drv = {}; + char ifname[IF_NAMESIZE]; + static char drvname[32]; + struct ifreq ifr = {}; + int fd, r = 0; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return "[error]"; + + if (!if_indextoname(ifindex, ifname)) + goto end; + + drv.cmd = ETHTOOL_GDRVINFO; + safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + ifr.ifr_data = (void *)&drv; + + r = ioctl(fd, SIOCETHTOOL, &ifr); + if (r) + goto end; + + safe_strncpy(drvname, drv.driver, sizeof(drvname)); + + close(fd); + return drvname; + +end: + r = errno; + close(fd); + return r == EOPNOTSUPP ? "loopback" : "[error]"; +} + +int get_mac_addr(int ifindex, void *mac_addr) +{ + char ifname[IF_NAMESIZE]; + struct ifreq ifr = {}; + int fd, r; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return -errno; + + if (!if_indextoname(ifindex, ifname)) { + r = -errno; + goto end; + } + + safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + r = ioctl(fd, SIOCGIFHWADDR, &ifr); + if (r) { + r = -errno; + goto end; + } + + memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char)); + +end: + close(fd); + return r; +} + +__attribute__((constructor)) static void sample_ctor(void) +{ + if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) { + fprintf(stderr, "Failed to set libbpf strict mode: %s\n", + strerror(errno)); + /* Just exit, nothing to cleanup right now */ + exit(EXIT_FAIL_BPF); + } +} diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h new file mode 100644 index 000000000000..f45051679977 --- /dev/null +++ b/samples/bpf/xdp_sample_user.h @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef XDP_SAMPLE_USER_H +#define XDP_SAMPLE_USER_H + +#include <bpf/libbpf.h> +#include <linux/compiler.h> + +#include "xdp_sample_shared.h" + +enum stats_mask { + _SAMPLE_REDIRECT_MAP = 1U << 0, + SAMPLE_RX_CNT = 1U << 1, + SAMPLE_REDIRECT_ERR_CNT = 1U << 2, + SAMPLE_CPUMAP_ENQUEUE_CNT = 1U << 3, + SAMPLE_CPUMAP_KTHREAD_CNT = 1U << 4, + SAMPLE_EXCEPTION_CNT = 1U << 5, + SAMPLE_DEVMAP_XMIT_CNT = 1U << 6, + SAMPLE_REDIRECT_CNT = 1U << 7, + SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP, + SAMPLE_REDIRECT_ERR_MAP_CNT = SAMPLE_REDIRECT_ERR_CNT | _SAMPLE_REDIRECT_MAP, + SAMPLE_DEVMAP_XMIT_CNT_MULTI = 1U << 8, + SAMPLE_SKIP_HEADING = 1U << 9, +}; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +int sample_setup_maps(struct bpf_map **maps); +int __sample_init(int mask); +void sample_exit(int status); +int sample_run(int interval, void (*post_cb)(void *), void *ctx); + +void sample_switch_mode(void); +int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic, + bool force); +void sample_usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error); + +const char *get_driver_name(int ifindex); +int get_mac_addr(int ifindex, void *mac_addr); + +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wstringop-truncation" +#endif +__attribute__((unused)) +static inline char *safe_strncpy(char *dst, const char *src, size_t size) +{ + if (!size) + return dst; + strncpy(dst, src, size - 1); + dst[size - 1] = '\0'; + return dst; +} +#pragma GCC diagnostic pop + +#define __attach_tp(name) \ + ({ \ + if (bpf_program__type(skel->progs.name) != BPF_PROG_TYPE_TRACING)\ + return -EINVAL; \ + skel->links.name = bpf_program__attach(skel->progs.name); \ + if (!skel->links.name) \ + return -errno; \ + }) + +#define sample_init_pre_load(skel) \ + ({ \ + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); \ + sample_setup_maps((struct bpf_map *[]){ \ + skel->maps.rx_cnt, skel->maps.redir_err_cnt, \ + skel->maps.cpumap_enqueue_cnt, \ + skel->maps.cpumap_kthread_cnt, \ + skel->maps.exception_cnt, skel->maps.devmap_xmit_cnt, \ + skel->maps.devmap_xmit_cnt_multi }); \ + }) + +#define DEFINE_SAMPLE_INIT(name) \ + static int sample_init(struct name *skel, int mask) \ + { \ + int ret; \ + ret = __sample_init(mask); \ + if (ret < 0) \ + return ret; \ + if (mask & SAMPLE_REDIRECT_MAP_CNT) \ + __attach_tp(tp_xdp_redirect_map); \ + if (mask & SAMPLE_REDIRECT_CNT) \ + __attach_tp(tp_xdp_redirect); \ + if (mask & SAMPLE_REDIRECT_ERR_MAP_CNT) \ + __attach_tp(tp_xdp_redirect_map_err); \ + if (mask & SAMPLE_REDIRECT_ERR_CNT) \ + __attach_tp(tp_xdp_redirect_err); \ + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) \ + __attach_tp(tp_xdp_cpumap_enqueue); \ + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) \ + __attach_tp(tp_xdp_cpumap_kthread); \ + if (mask & SAMPLE_EXCEPTION_CNT) \ + __attach_tp(tp_xdp_exception); \ + if (mask & SAMPLE_DEVMAP_XMIT_CNT) \ + __attach_tp(tp_xdp_devmap_xmit); \ + if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) \ + __attach_tp(tp_xdp_devmap_xmit_multi); \ + return 0; \ + } + +#endif diff --git a/samples/bpf/xdp_tx_iptunnel_common.h b/samples/bpf/xdp_tx_iptunnel_common.h index dd12cc35110f..be839892caff 100644 --- a/samples/bpf/xdp_tx_iptunnel_common.h +++ b/samples/bpf/xdp_tx_iptunnel_common.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H #define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c index 0f4f6e8c8611..0e2bca3a3fff 100644 --- a/samples/bpf/xdp_tx_iptunnel_kern.c +++ b/samples/bpf/xdp_tx_iptunnel_kern.c @@ -16,22 +16,22 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include "bpf_helpers.h" +#include <bpf/bpf_helpers.h> #include "xdp_tx_iptunnel_common.h" -struct bpf_map_def SEC("maps") rxcnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(__u32), - .value_size = sizeof(__u64), - .max_entries = 256, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 256); +} rxcnt SEC(".maps"); -struct bpf_map_def SEC("maps") vip2tnl = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct vip), - .value_size = sizeof(struct iptnl_info), - .max_entries = MAX_IPTNL_ENTRIES, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct vip); + __type(value, struct iptnl_info); + __uint(max_entries, MAX_IPTNL_ENTRIES); +} vip2tnl SEC(".maps"); static __always_inline void count_tx(u32 protocol) { @@ -212,7 +212,7 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp) return XDP_TX; } -SEC("xdp_tx_iptunnel") +SEC("xdp.frags") int _xdp_tx_iptunnel(struct xdp_md *xdp) { void *data_end = (void *)(long)xdp->data_end; diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index 715cd12eaca5..7e4b2f7108a6 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <linux/bpf.h> #include <linux/if_link.h> @@ -12,25 +9,39 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <sys/resource.h> +#include <net/if.h> #include <arpa/inet.h> #include <netinet/ether.h> #include <unistd.h> #include <time.h> -#include "bpf_load.h" -#include "libbpf.h" +#include <bpf/libbpf.h> +#include <bpf/bpf.h> #include "bpf_util.h" #include "xdp_tx_iptunnel_common.h" #define STATS_INTERVAL_S 2U static int ifindex = -1; -static __u32 xdp_flags = 0; +static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static int rxcnt_map_fd; +static __u32 prog_id; static void int_exit(int sig) { - if (ifindex > -1) - set_link_xdp_fd(ifindex, -1, xdp_flags); + __u32 curr_prog_id = 0; + + if (ifindex > -1) { + if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) { + printf("bpf_xdp_query_id failed\n"); + exit(1); + } + if (prog_id == curr_prog_id) + bpf_xdp_detach(ifindex, xdp_flags, NULL); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given iface\n"); + else + printf("program on interface changed, not removing\n"); + } exit(0); } @@ -53,7 +64,8 @@ static void poll_stats(unsigned int kill_after_s) for (proto = 0; proto < nr_protos; proto++) { __u64 sum = 0; - assert(bpf_map_lookup_elem(map_fd[0], &proto, values) == 0); + assert(bpf_map_lookup_elem(rxcnt_map_fd, &proto, + values) == 0); for (i = 0; i < nr_cpus; i++) sum += (values[i] - prev[proto][i]); @@ -71,7 +83,7 @@ static void usage(const char *cmd) "in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n" "is used to select packets to encapsulate\n\n"); printf("Usage: %s [...]\n", cmd); - printf(" -i <ifindex> Interface Index\n"); + printf(" -i <ifname|ifindex> Interface\n"); printf(" -a <vip-service-address> IPv4 or IPv6\n"); printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n"); printf(" -s <source-ip> Used in the IPTunnel header\n"); @@ -81,6 +93,7 @@ static void usage(const char *cmd) printf(" -P <IP-Protocol> Default is TCP\n"); printf(" -S use skb-mode\n"); printf(" -N enforce native mode\n"); + printf(" -F Force loading the XDP prog\n"); printf(" -h Display this help\n"); } @@ -138,16 +151,19 @@ static int parse_ports(const char *port_str, int *min_port, int *max_port) int main(int argc, char **argv) { + int min_port = 0, max_port = 0, vip2tnl_map_fd; + const char *optstr = "i:a:p:s:d:m:T:P:FSNh"; unsigned char opt_flags[256] = {}; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); unsigned int kill_after_s = 0; - const char *optstr = "i:a:p:s:d:m:T:P:SNh"; - int min_port = 0, max_port = 0; struct iptnl_info tnl = {}; - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; + struct bpf_object *obj; struct vip vip = {}; char filename[256]; - int opt; - int i; + int opt, prog_fd; + int i, err; tnl.family = AF_UNSPEC; vip.protocol = IPPROTO_TCP; @@ -162,7 +178,9 @@ int main(int argc, char **argv) switch (opt) { case 'i': - ifindex = atoi(optarg); + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = atoi(optarg); break; case 'a': vip.family = parse_ipstr(optarg, vip.daddr.v6); @@ -209,7 +227,10 @@ int main(int argc, char **argv) xdp_flags |= XDP_FLAGS_SKB_MODE; break; case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; + /* default, set below */ + break; + case 'F': + xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; break; default: usage(argv[0]); @@ -218,6 +239,9 @@ int main(int argc, char **argv) opt_flags[opt] = 0; } + if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) + xdp_flags |= XDP_FLAGS_DRV_MODE; + for (i = 0; i < strlen(optstr); i++) { if (opt_flags[(unsigned int)optstr[i]]) { fprintf(stderr, "Missing argument -%c\n", optstr[i]); @@ -226,20 +250,31 @@ int main(int argc, char **argv) } } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); + if (!ifindex) { + fprintf(stderr, "Invalid ifname\n"); return 1; } snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) + return 1; + + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + + err = bpf_object__load(obj); + if (err) { + printf("bpf_object__load(): %s\n", strerror(errno)); return 1; } + prog_fd = bpf_program__fd(prog); - if (!prog_fd[0]) { - printf("load_bpf_file: %s\n", strerror(errno)); + rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); + vip2tnl_map_fd = bpf_object__find_map_fd_by_name(obj, "vip2tnl"); + if (vip2tnl_map_fd < 0 || rxcnt_map_fd < 0) { + printf("bpf_object__find_map_fd_by_name failed\n"); return 1; } @@ -248,20 +283,28 @@ int main(int argc, char **argv) while (min_port <= max_port) { vip.dport = htons(min_port++); - if (bpf_map_update_elem(map_fd[1], &vip, &tnl, BPF_NOEXIST)) { + if (bpf_map_update_elem(vip2tnl_map_fd, &vip, &tnl, + BPF_NOEXIST)) { perror("bpf_map_update_elem(&vip2tnl)"); return 1; } } - if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) { printf("link set xdp fd failed\n"); return 1; } + err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len); + if (err) { + printf("can't get prog info - %s\n", strerror(errno)); + return err; + } + prog_id = info.id; + poll_stats(kill_after_s); - set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_xdp_detach(ifindex, xdp_flags, NULL); return 0; } diff --git a/samples/cgroup/.gitignore b/samples/cgroup/.gitignore new file mode 100644 index 000000000000..3a0161194cce --- /dev/null +++ b/samples/cgroup/.gitignore @@ -0,0 +1,3 @@ +/cgroup_event_listener +/memcg_event_listener + diff --git a/samples/cgroup/Makefile b/samples/cgroup/Makefile new file mode 100644 index 000000000000..526c8569707c --- /dev/null +++ b/samples/cgroup/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +userprogs-always-y += cgroup_event_listener memcg_event_listener + +userccflags += -I usr/include diff --git a/samples/cgroup/cgroup_event_listener.c b/samples/cgroup/cgroup_event_listener.c new file mode 100644 index 000000000000..3d70dc831a76 --- /dev/null +++ b/samples/cgroup/cgroup_event_listener.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cgroup_event_listener.c - Simple listener of cgroup events + * + * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name> + */ + +#include <assert.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <limits.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <sys/eventfd.h> + +#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>" + +int main(int argc, char **argv) +{ + int efd = -1; + int cfd = -1; + int event_control = -1; + char event_control_path[PATH_MAX]; + char line[LINE_MAX]; + int ret; + + if (argc != 3) + errx(1, "%s", USAGE_STR); + + cfd = open(argv[1], O_RDONLY); + if (cfd == -1) + err(1, "Cannot open %s", argv[1]); + + ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control", + dirname(argv[1])); + if (ret >= PATH_MAX) + errx(1, "Path to cgroup.event_control is too long"); + + event_control = open(event_control_path, O_WRONLY); + if (event_control == -1) + err(1, "Cannot open %s", event_control_path); + + efd = eventfd(0, 0); + if (efd == -1) + err(1, "eventfd() failed"); + + ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]); + if (ret >= LINE_MAX) + errx(1, "Arguments string is too long"); + + ret = write(event_control, line, strlen(line) + 1); + if (ret == -1) + err(1, "Cannot write to cgroup.event_control"); + + while (1) { + uint64_t result; + + ret = read(efd, &result, sizeof(result)); + if (ret == -1) { + if (errno == EINTR) + continue; + err(1, "Cannot read from eventfd"); + } + assert(ret == sizeof(result)); + + ret = access(event_control_path, W_OK); + if ((ret == -1) && (errno == ENOENT)) { + puts("The cgroup seems to have removed."); + break; + } + + if (ret == -1) + err(1, "cgroup.event_control is not accessible any more"); + + printf("%s %s: crossed\n", argv[1], argv[2]); + } + + return 0; +} diff --git a/samples/cgroup/memcg_event_listener.c b/samples/cgroup/memcg_event_listener.c new file mode 100644 index 000000000000..41425edbd88a --- /dev/null +++ b/samples/cgroup/memcg_event_listener.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * memcg_event_listener.c - Simple listener of memcg memory.events + * + * Copyright (c) 2023, SaluteDevices. All Rights Reserved. + * + * Author: Dmitry Rokosov <ddrokosov@salutedevices.com> + */ + +#include <err.h> +#include <errno.h> +#include <limits.h> +#include <poll.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/inotify.h> +#include <unistd.h> + +/* Size of buffer to use when reading inotify events */ +#define INOTIFY_BUFFER_SIZE 8192 + +#define INOTIFY_EVENT_NEXT(event, length) ({ \ + (length) -= sizeof(*(event)) + (event)->len; \ + (event)++; \ +}) + +#define INOTIFY_EVENT_OK(event, length) ((length) >= (ssize_t)sizeof(*(event))) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) + +struct memcg_counters { + long low; + long high; + long max; + long oom; + long oom_kill; + long oom_group_kill; +}; + +struct memcg_events { + struct memcg_counters counters; + char path[PATH_MAX]; + int inotify_fd; + int inotify_wd; +}; + +static void print_memcg_counters(const struct memcg_counters *counters) +{ + printf("MEMCG events:\n"); + printf("\tlow: %ld\n", counters->low); + printf("\thigh: %ld\n", counters->high); + printf("\tmax: %ld\n", counters->max); + printf("\toom: %ld\n", counters->oom); + printf("\toom_kill: %ld\n", counters->oom_kill); + printf("\toom_group_kill: %ld\n", counters->oom_group_kill); +} + +static int get_memcg_counter(char *line, const char *name, long *counter) +{ + size_t len = strlen(name); + char *endptr; + long tmp; + + if (memcmp(line, name, len)) { + warnx("Counter line %s has wrong name, %s is expected", + line, name); + return -EINVAL; + } + + /* skip the whitespace delimiter */ + len += 1; + + errno = 0; + tmp = strtol(&line[len], &endptr, 10); + if (((tmp == LONG_MAX || tmp == LONG_MIN) && errno == ERANGE) || + (errno && !tmp)) { + warnx("Failed to parse: %s", &line[len]); + return -ERANGE; + } + + if (endptr == &line[len]) { + warnx("Not digits were found in line %s", &line[len]); + return -EINVAL; + } + + if (!(*endptr == '\0' || (*endptr == '\n' && *++endptr == '\0'))) { + warnx("Further characters after number: %s", endptr); + return -EINVAL; + } + + *counter = tmp; + + return 0; +} + +static int read_memcg_events(struct memcg_events *events, bool show_diff) +{ + FILE *fp = fopen(events->path, "re"); + size_t i; + int ret = 0; + bool any_new_events = false; + char *line = NULL; + size_t len = 0; + struct memcg_counters new_counters; + struct memcg_counters *counters = &events->counters; + struct { + const char *name; + long *new; + long *old; + } map[] = { + { + .name = "low", + .new = &new_counters.low, + .old = &counters->low, + }, + { + .name = "high", + .new = &new_counters.high, + .old = &counters->high, + }, + { + .name = "max", + .new = &new_counters.max, + .old = &counters->max, + }, + { + .name = "oom", + .new = &new_counters.oom, + .old = &counters->oom, + }, + { + .name = "oom_kill", + .new = &new_counters.oom_kill, + .old = &counters->oom_kill, + }, + { + .name = "oom_group_kill", + .new = &new_counters.oom_group_kill, + .old = &counters->oom_group_kill, + }, + }; + + if (!fp) { + warn("Failed to open memcg events file %s", events->path); + return -EBADF; + } + + /* Read new values for memcg counters */ + for (i = 0; i < ARRAY_SIZE(map); ++i) { + ssize_t nread; + + errno = 0; + nread = getline(&line, &len, fp); + if (nread == -1) { + if (errno) { + warn("Failed to read line for counter %s", + map[i].name); + ret = -EIO; + goto exit; + } + + break; + } + + ret = get_memcg_counter(line, map[i].name, map[i].new); + if (ret) { + warnx("Failed to get counter value from line %s", line); + goto exit; + } + } + + for (i = 0; i < ARRAY_SIZE(map); ++i) { + long diff; + + if (*map[i].new > *map[i].old) { + diff = *map[i].new - *map[i].old; + + if (show_diff) + printf("*** %ld MEMCG %s event%s, " + "change counter %ld => %ld\n", + diff, map[i].name, + (diff == 1) ? "" : "s", + *map[i].old, *map[i].new); + + *map[i].old += diff; + any_new_events = true; + } + } + + if (show_diff && !any_new_events) + printf("*** No new untracked memcg events available\n"); + +exit: + free(line); + fclose(fp); + + return ret; +} + +static void process_memcg_events(struct memcg_events *events, + struct inotify_event *event) +{ + int ret; + + if (events->inotify_wd != event->wd) { + warnx("Unknown inotify event %d, should be %d", event->wd, + events->inotify_wd); + return; + } + + printf("Received event in %s:\n", events->path); + + if (!(event->mask & IN_MODIFY)) { + warnx("No IN_MODIFY event, skip it"); + return; + } + + ret = read_memcg_events(events, /* show_diff = */true); + if (ret) + warnx("Can't read memcg events"); +} + +static void monitor_events(struct memcg_events *events) +{ + struct pollfd fds[1]; + int ret; + + printf("Started monitoring memory events from '%s'...\n", events->path); + + fds[0].fd = events->inotify_fd; + fds[0].events = POLLIN; + + for (;;) { + ret = poll(fds, ARRAY_SIZE(fds), -1); + if (ret < 0 && errno != EAGAIN) + err(EXIT_FAILURE, "Can't poll memcg events (%d)", ret); + + if (fds[0].revents & POLLERR) + err(EXIT_FAILURE, "Got POLLERR during monitor events"); + + if (fds[0].revents & POLLIN) { + struct inotify_event *event; + char buffer[INOTIFY_BUFFER_SIZE]; + ssize_t length; + + length = read(fds[0].fd, buffer, INOTIFY_BUFFER_SIZE); + if (length <= 0) + continue; + + event = (struct inotify_event *)buffer; + while (INOTIFY_EVENT_OK(event, length)) { + process_memcg_events(events, event); + event = INOTIFY_EVENT_NEXT(event, length); + } + } + } +} + +static int initialize_memcg_events(struct memcg_events *events, + const char *cgroup) +{ + int ret; + + memset(events, 0, sizeof(struct memcg_events)); + + ret = snprintf(events->path, PATH_MAX, + "/sys/fs/cgroup/%s/memory.events", cgroup); + if (ret >= PATH_MAX) { + warnx("Path to cgroup memory.events is too long"); + return -EMSGSIZE; + } else if (ret < 0) { + warn("Can't generate cgroup event full name"); + return ret; + } + + ret = read_memcg_events(events, /* show_diff = */false); + if (ret) { + warnx("Failed to read initial memcg events state (%d)", ret); + return ret; + } + + events->inotify_fd = inotify_init(); + if (events->inotify_fd < 0) { + warn("Failed to setup new inotify device"); + return -EMFILE; + } + + events->inotify_wd = inotify_add_watch(events->inotify_fd, + events->path, IN_MODIFY); + if (events->inotify_wd < 0) { + warn("Couldn't add monitor in dir %s", events->path); + return -EIO; + } + + printf("Initialized MEMCG events with counters:\n"); + print_memcg_counters(&events->counters); + + return 0; +} + +static void cleanup_memcg_events(struct memcg_events *events) +{ + inotify_rm_watch(events->inotify_fd, events->inotify_wd); + close(events->inotify_fd); +} + +int main(int argc, const char **argv) +{ + struct memcg_events events; + ssize_t ret; + + if (argc != 2) + errx(EXIT_FAILURE, "Usage: %s <cgroup>", argv[0]); + + ret = initialize_memcg_events(&events, argv[1]); + if (ret) + errx(EXIT_FAILURE, "Can't initialize memcg events (%zd)", ret); + + monitor_events(&events); + + cleanup_memcg_events(&events); + + printf("Exiting memcg event listener...\n"); + + return EXIT_SUCCESS; +} diff --git a/samples/check-exec/.gitignore b/samples/check-exec/.gitignore new file mode 100644 index 000000000000..cd759a19dacd --- /dev/null +++ b/samples/check-exec/.gitignore @@ -0,0 +1,2 @@ +/inc +/set-exec diff --git a/samples/check-exec/Makefile b/samples/check-exec/Makefile new file mode 100644 index 000000000000..c4f08ad0f8e3 --- /dev/null +++ b/samples/check-exec/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: BSD-3-Clause + +userprogs-always-y := \ + inc \ + set-exec + +userccflags += -I usr/include + +.PHONY: all clean + +all: + $(MAKE) -C ../.. samples/check-exec/ + +clean: + $(MAKE) -C ../.. M=samples/check-exec/ clean diff --git a/samples/check-exec/inc.c b/samples/check-exec/inc.c new file mode 100644 index 000000000000..7f6ef06a2f06 --- /dev/null +++ b/samples/check-exec/inc.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Very simple script interpreter that can evaluate two different commands (one + * per line): + * - "?" to initialize a counter from user's input; + * - "+" to increment the counter (which is set to 0 by default). + * + * See tools/testing/selftests/exec/check-exec-tests.sh and + * Documentation/userspace-api/check_exec.rst + * + * Copyright © 2024 Microsoft Corporation + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <linux/fcntl.h> +#include <linux/prctl.h> +#include <linux/securebits.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> +#include <sys/syscall.h> +#include <unistd.h> + +static int sys_execveat(int dirfd, const char *pathname, char *const argv[], + char *const envp[], int flags) +{ + return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags); +} + +/* Returns 1 on error, 0 otherwise. */ +static int interpret_buffer(char *buffer, size_t buffer_size) +{ + char *line, *saveptr = NULL; + long long number = 0; + + /* Each command is the first character of a line. */ + saveptr = NULL; + line = strtok_r(buffer, "\n", &saveptr); + while (line) { + if (*line != '#' && strlen(line) != 1) { + fprintf(stderr, "# ERROR: Unknown string\n"); + return 1; + } + switch (*line) { + case '#': + /* Skips shebang and comments. */ + break; + case '+': + /* Increments and prints the number. */ + number++; + printf("%lld\n", number); + break; + case '?': + /* Reads integer from stdin. */ + fprintf(stderr, "> Enter new number: \n"); + if (scanf("%lld", &number) != 1) { + fprintf(stderr, + "# WARNING: Failed to read number from stdin\n"); + } + break; + default: + fprintf(stderr, "# ERROR: Unknown character '%c'\n", + *line); + return 1; + } + line = strtok_r(NULL, "\n", &saveptr); + } + return 0; +} + +/* Returns 1 on error, 0 otherwise. */ +static int interpret_stream(FILE *script, char *const script_name, + char *const *const envp, const bool restrict_stream) +{ + int err; + char *const script_argv[] = { script_name, NULL }; + char buf[128] = {}; + size_t buf_size = sizeof(buf); + + /* + * We pass a valid argv and envp to the kernel to emulate a native + * script execution. We must use the script file descriptor instead of + * the script path name to avoid race conditions. + */ + err = sys_execveat(fileno(script), "", script_argv, envp, + AT_EMPTY_PATH | AT_EXECVE_CHECK); + if (err && restrict_stream) { + perror("ERROR: Script execution check"); + return 1; + } + + /* Reads script. */ + buf_size = fread(buf, 1, buf_size - 1, script); + return interpret_buffer(buf, buf_size); +} + +static void print_usage(const char *argv0) +{ + fprintf(stderr, "usage: %s <script.inc> | -i | -c <command>\n\n", + argv0); + fprintf(stderr, "Example:\n"); + fprintf(stderr, " ./set-exec -fi -- ./inc -i < script-exec.inc\n"); +} + +int main(const int argc, char *const argv[], char *const *const envp) +{ + int opt; + char *cmd = NULL; + char *script_name = NULL; + bool interpret_stdin = false; + FILE *script_file = NULL; + int secbits; + bool deny_interactive, restrict_file; + size_t arg_nb; + + secbits = prctl(PR_GET_SECUREBITS); + if (secbits == -1) { + /* + * This should never happen, except with a buggy seccomp + * filter. + */ + perror("ERROR: Failed to get securebits"); + return 1; + } + + deny_interactive = !!(secbits & SECBIT_EXEC_DENY_INTERACTIVE); + restrict_file = !!(secbits & SECBIT_EXEC_RESTRICT_FILE); + + while ((opt = getopt(argc, argv, "c:i")) != -1) { + switch (opt) { + case 'c': + if (cmd) { + fprintf(stderr, "ERROR: Command already set"); + return 1; + } + cmd = optarg; + break; + case 'i': + interpret_stdin = true; + break; + default: + print_usage(argv[0]); + return 1; + } + } + + /* Checks that only one argument is used, or read stdin. */ + arg_nb = !!cmd + !!interpret_stdin; + if (arg_nb == 0 && argc == 2) { + script_name = argv[1]; + } else if (arg_nb != 1) { + print_usage(argv[0]); + return 1; + } + + if (cmd) { + /* + * Other kind of interactive interpretations should be denied + * as well (e.g. CLI arguments passing script snippets, + * environment variables interpreted as script). However, any + * way to pass script files should only be restricted according + * to restrict_file. + */ + if (deny_interactive) { + fprintf(stderr, + "ERROR: Interactive interpretation denied.\n"); + return 1; + } + + return interpret_buffer(cmd, strlen(cmd)); + } + + if (interpret_stdin && !script_name) { + script_file = stdin; + /* + * As for any execve(2) call, this path may be logged by the + * kernel. + */ + script_name = "/proc/self/fd/0"; + /* + * When stdin is used, it can point to a regular file or a + * pipe. Restrict stdin execution according to + * SECBIT_EXEC_DENY_INTERACTIVE but always allow executable + * files (which are not considered as interactive inputs). + */ + return interpret_stream(script_file, script_name, envp, + deny_interactive); + } else if (script_name && !interpret_stdin) { + /* + * In this sample, we don't pass any argument to scripts, but + * otherwise we would have to forge an argv with such + * arguments. + */ + script_file = fopen(script_name, "r"); + if (!script_file) { + perror("ERROR: Failed to open script"); + return 1; + } + /* + * Restricts file execution according to + * SECBIT_EXEC_RESTRICT_FILE. + */ + return interpret_stream(script_file, script_name, envp, + restrict_file); + } + + print_usage(argv[0]); + return 1; +} diff --git a/samples/check-exec/run-script-ask.sh b/samples/check-exec/run-script-ask.sh new file mode 100755 index 000000000000..8ef0fdc37266 --- /dev/null +++ b/samples/check-exec/run-script-ask.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env sh +# SPDX-License-Identifier: BSD-3-Clause + +DIR="$(dirname -- "$0")" + +PATH="${PATH}:${DIR}" + +set -x +"${DIR}/script-ask.inc" diff --git a/samples/check-exec/script-ask.inc b/samples/check-exec/script-ask.inc new file mode 100755 index 000000000000..720a8e649225 --- /dev/null +++ b/samples/check-exec/script-ask.inc @@ -0,0 +1,5 @@ +#!/usr/bin/env inc +# SPDX-License-Identifier: BSD-3-Clause + +? ++ diff --git a/samples/check-exec/script-exec.inc b/samples/check-exec/script-exec.inc new file mode 100755 index 000000000000..3245cb9d8dd1 --- /dev/null +++ b/samples/check-exec/script-exec.inc @@ -0,0 +1,4 @@ +#!/usr/bin/env inc +# SPDX-License-Identifier: BSD-3-Clause + ++ diff --git a/samples/check-exec/script-noexec.inc b/samples/check-exec/script-noexec.inc new file mode 100644 index 000000000000..3245cb9d8dd1 --- /dev/null +++ b/samples/check-exec/script-noexec.inc @@ -0,0 +1,4 @@ +#!/usr/bin/env inc +# SPDX-License-Identifier: BSD-3-Clause + ++ diff --git a/samples/check-exec/set-exec.c b/samples/check-exec/set-exec.c new file mode 100644 index 000000000000..ba86a60a20dd --- /dev/null +++ b/samples/check-exec/set-exec.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Simple tool to set SECBIT_EXEC_RESTRICT_FILE, SECBIT_EXEC_DENY_INTERACTIVE, + * before executing a command. + * + * Copyright © 2024 Microsoft Corporation + */ + +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include <errno.h> +#include <linux/prctl.h> +#include <linux/securebits.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> +#include <unistd.h> + +static void print_usage(const char *argv0) +{ + fprintf(stderr, "usage: %s -f|-i -- <cmd> [args]...\n\n", argv0); + fprintf(stderr, "Execute a command with\n"); + fprintf(stderr, "- SECBIT_EXEC_RESTRICT_FILE set: -f\n"); + fprintf(stderr, "- SECBIT_EXEC_DENY_INTERACTIVE set: -i\n"); +} + +int main(const int argc, char *const argv[], char *const *const envp) +{ + const char *cmd_path; + char *const *cmd_argv; + int opt, secbits_cur, secbits_new; + bool has_policy = false; + + secbits_cur = prctl(PR_GET_SECUREBITS); + if (secbits_cur == -1) { + /* + * This should never happen, except with a buggy seccomp + * filter. + */ + perror("ERROR: Failed to get securebits"); + return 1; + } + + secbits_new = secbits_cur; + while ((opt = getopt(argc, argv, "fi")) != -1) { + switch (opt) { + case 'f': + secbits_new |= SECBIT_EXEC_RESTRICT_FILE | + SECBIT_EXEC_RESTRICT_FILE_LOCKED; + has_policy = true; + break; + case 'i': + secbits_new |= SECBIT_EXEC_DENY_INTERACTIVE | + SECBIT_EXEC_DENY_INTERACTIVE_LOCKED; + has_policy = true; + break; + default: + print_usage(argv[0]); + return 1; + } + } + + if (!argv[optind] || !has_policy) { + print_usage(argv[0]); + return 1; + } + + if (secbits_cur != secbits_new && + prctl(PR_SET_SECUREBITS, secbits_new)) { + perror("Failed to set secure bit(s)."); + fprintf(stderr, + "Hint: The running kernel may not support this feature.\n"); + return 1; + } + + cmd_path = argv[optind]; + cmd_argv = argv + optind; + fprintf(stderr, "Executing command...\n"); + execvpe(cmd_path, cmd_argv, envp); + fprintf(stderr, "Failed to execute \"%s\": %s\n", cmd_path, + strerror(errno)); + return 1; +} diff --git a/samples/configfs/Makefile b/samples/configfs/Makefile index a9afd99630fc..92d661fcba6d 100644 --- a/samples/configfs/Makefile +++ b/samples/configfs/Makefile @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs_sample.o diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c index 1ea33119e532..fd5d163828c5 100644 --- a/samples/configfs/configfs_sample.c +++ b/samples/configfs/configfs_sample.c @@ -1,39 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * vim: noexpandtab ts=8 sts=0 sw=8: - * * configfs_example_macros.c - This file is a demonstration module * containing a number of configfs subsystems. It uses the helper * macros defined by configfs.h * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * * Based on sysfs: - * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. */ #include <linux/init.h> +#include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> - #include <linux/configfs.h> - - /* * 01-childless * @@ -54,8 +36,8 @@ struct childless { static inline struct childless *to_childless(struct config_item *item) { - return item ? container_of(to_configfs_subsystem(to_config_group(item)), - struct childless, subsys) : NULL; + return container_of(to_configfs_subsystem(to_config_group(item)), + struct childless, subsys); } static ssize_t childless_showme_show(struct config_item *item, char *page) @@ -78,17 +60,11 @@ static ssize_t childless_storeme_store(struct config_item *item, const char *page, size_t count) { struct childless *childless = to_childless(item); - unsigned long tmp; - char *p = (char *) page; - - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; - - if (tmp > INT_MAX) - return -ERANGE; + int ret; - childless->storeme = tmp; + ret = kstrtoint(page, 10, &childless->storeme); + if (ret) + return ret; return count; } @@ -115,7 +91,7 @@ static struct configfs_attribute *childless_attrs[] = { NULL, }; -static struct config_item_type childless_type = { +static const struct config_item_type childless_type = { .ct_attrs = childless_attrs, .ct_owner = THIS_MODULE, }; @@ -131,7 +107,6 @@ static struct childless childless_subsys = { }, }; - /* ----------------------------------------------------------------- */ /* @@ -150,7 +125,7 @@ struct simple_child { static inline struct simple_child *to_simple_child(struct config_item *item) { - return item ? container_of(item, struct simple_child, item) : NULL; + return container_of(item, struct simple_child, item); } static ssize_t simple_child_storeme_show(struct config_item *item, char *page) @@ -162,17 +137,11 @@ static ssize_t simple_child_storeme_store(struct config_item *item, const char *page, size_t count) { struct simple_child *simple_child = to_simple_child(item); - unsigned long tmp; - char *p = (char *) page; - - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; - - if (tmp > INT_MAX) - return -ERANGE; + int ret; - simple_child->storeme = tmp; + ret = kstrtoint(page, 10, &simple_child->storeme); + if (ret) + return ret; return count; } @@ -190,24 +159,23 @@ static void simple_child_release(struct config_item *item) } static struct configfs_item_operations simple_child_item_ops = { - .release = simple_child_release, + .release = simple_child_release, }; -static struct config_item_type simple_child_type = { +static const struct config_item_type simple_child_type = { .ct_item_ops = &simple_child_item_ops, .ct_attrs = simple_child_attrs, .ct_owner = THIS_MODULE, }; - struct simple_children { struct config_group group; }; static inline struct simple_children *to_simple_children(struct config_item *item) { - return item ? container_of(to_config_group(item), - struct simple_children, group) : NULL; + return container_of(to_config_group(item), + struct simple_children, group); } static struct config_item *simple_children_make_item(struct config_group *group, @@ -222,8 +190,6 @@ static struct config_item *simple_children_make_item(struct config_group *group, config_item_init_type_name(&simple_child->item, name, &simple_child_type); - simple_child->storeme = 0; - return &simple_child->item; } @@ -261,7 +227,7 @@ static struct configfs_group_operations simple_children_group_ops = { .make_item = simple_children_make_item, }; -static struct config_item_type simple_children_type = { +static const struct config_item_type simple_children_type = { .ct_item_ops = &simple_children_item_ops, .ct_group_ops = &simple_children_group_ops, .ct_attrs = simple_children_attrs, @@ -277,7 +243,6 @@ static struct configfs_subsystem simple_children_subsys = { }, }; - /* ----------------------------------------------------------------- */ /* @@ -331,7 +296,7 @@ static struct configfs_group_operations group_children_group_ops = { .make_group = group_children_make_group, }; -static struct config_item_type group_children_type = { +static const struct config_item_type group_children_type = { .ct_group_ops = &group_children_group_ops, .ct_attrs = group_children_attrs, .ct_owner = THIS_MODULE, @@ -364,9 +329,8 @@ static struct configfs_subsystem *example_subsys[] = { static int __init configfs_example_init(void) { - int ret; - int i; struct configfs_subsystem *subsys; + int ret, i; for (i = 0; example_subsys[i]; i++) { subsys = example_subsys[i]; @@ -375,9 +339,8 @@ static int __init configfs_example_init(void) mutex_init(&subsys->su_mutex); ret = configfs_register_subsystem(subsys); if (ret) { - printk(KERN_ERR "Error %d while registering subsystem %s\n", - ret, - subsys->su_group.cg_item.ci_namebuf); + pr_err("Error %d while registering subsystem %s\n", + ret, subsys->su_group.cg_item.ci_namebuf); goto out_unregister; } } @@ -401,4 +364,5 @@ static void __exit configfs_example_exit(void) module_init(configfs_example_init); module_exit(configfs_example_exit); +MODULE_DESCRIPTION("Sample configfs module"); MODULE_LICENSE("GPL"); diff --git a/samples/connector/.gitignore b/samples/connector/.gitignore index d2b9c32accd4..0e26039f39b5 100644 --- a/samples/connector/.gitignore +++ b/samples/connector/.gitignore @@ -1 +1,2 @@ -ucon +# SPDX-License-Identifier: GPL-2.0-only +/ucon diff --git a/samples/connector/Makefile b/samples/connector/Makefile index 91762d946a53..d98a9e047c11 100644 --- a/samples/connector/Makefile +++ b/samples/connector/Makefile @@ -1,16 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o -# List of programs to build -ifdef CONFIG_SAMPLE_CONNECTOR -hostprogs-y := ucon -endif +userprogs-always-$(CONFIG_CC_CAN_LINK) += ucon -# Tell kbuild to always build the programs -always := $(hostprogs-y) - -HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include - -all: modules - -modules clean: - $(MAKE) -C ../.. SUBDIRS=$(CURDIR) $@ +userccflags += -I usr/include diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c index d12cc944b696..73d50b4aebb6 100644 --- a/samples/connector/cn_test.c +++ b/samples/connector/cn_test.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * cn_test.c * * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net> * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) "cn_test: " fmt @@ -125,12 +112,12 @@ static int cn_test_want_notify(void) #endif static u32 cn_test_timer_counter; -static void cn_test_timer_func(unsigned long __data) +static void cn_test_timer_func(struct timer_list *unused) { struct cn_msg *m; char data[32]; - pr_debug("%s: timer fired with data %lu\n", __func__, __data); + pr_debug("%s: timer fired\n", __func__); m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC); if (m) { @@ -168,7 +155,7 @@ static int cn_test_init(void) goto err_out; } - setup_timer(&cn_test_timer, cn_test_timer_func, 0); + timer_setup(&cn_test_timer, cn_test_timer_func, 0); mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); pr_info("initialized with id={%u.%u}\n", @@ -185,7 +172,7 @@ static int cn_test_init(void) static void cn_test_fini(void) { - del_timer_sync(&cn_test_timer); + timer_delete_sync(&cn_test_timer); cn_del_callback(&cn_test_id); cn_test_id.val--; cn_del_callback(&cn_test_id); diff --git a/samples/connector/ucon.c b/samples/connector/ucon.c index 8a4da64e02a8..fa17f864200e 100644 --- a/samples/connector/ucon.c +++ b/samples/connector/ucon.c @@ -1,22 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * ucon.c * * Copyright (c) 2004+ Evgeniy Polyakov <zbr@ioremap.net> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <asm/types.h> diff --git a/samples/coresight/Makefile b/samples/coresight/Makefile new file mode 100644 index 000000000000..b3fce4af2347 --- /dev/null +++ b/samples/coresight/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight-cfg-sample.o +ccflags-y += -I$(srctree)/drivers/hwtracing/coresight diff --git a/samples/coresight/coresight-cfg-sample.c b/samples/coresight/coresight-cfg-sample.c new file mode 100644 index 000000000000..25485c80b5e3 --- /dev/null +++ b/samples/coresight/coresight-cfg-sample.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(C) 2020 Linaro Limited. All rights reserved. + * Author: Mike Leach <mike.leach@linaro.org> + */ + +#include "coresight-config.h" +#include "coresight-syscfg.h" + +/* create an alternate autofdo configuration */ + +/* we will provide 4 sets of preset parameter values */ +#define AFDO2_NR_PRESETS 4 +/* the total number of parameters in used features - strobing has 2 */ +#define AFDO2_NR_PARAM_SUM 2 + +static const char *afdo2_ref_names[] = { + "strobing", +}; + +/* + * set of presets leaves strobing window constant while varying period to allow + * experimentation with mark / space ratios for various workloads + */ +static u64 afdo2_presets[AFDO2_NR_PRESETS][AFDO2_NR_PARAM_SUM] = { + { 1000, 100 }, + { 1000, 1000 }, + { 1000, 5000 }, + { 1000, 10000 }, +}; + +struct cscfg_config_desc afdo2 = { + .name = "autofdo2", + .description = "Setup ETMs with strobing for autofdo\n" + "Supplied presets allow experimentation with mark-space ratio for various loads\n", + .nr_feat_refs = ARRAY_SIZE(afdo2_ref_names), + .feat_ref_names = afdo2_ref_names, + .nr_presets = AFDO2_NR_PRESETS, + .nr_total_params = AFDO2_NR_PARAM_SUM, + .presets = &afdo2_presets[0][0], +}; + +static struct cscfg_feature_desc *sample_feats[] = { + NULL +}; + +static struct cscfg_config_desc *sample_cfgs[] = { + &afdo2, + NULL +}; + +static struct cscfg_load_owner_info mod_owner = { + .type = CSCFG_OWNER_MODULE, + .owner_handle = THIS_MODULE, +}; + +/* module init and exit - just load and unload configs */ +static int __init cscfg_sample_init(void) +{ + return cscfg_load_config_sets(sample_cfgs, sample_feats, &mod_owner); +} + +static void __exit cscfg_sample_exit(void) +{ + cscfg_unload_config_sets(&mod_owner); +} + +module_init(cscfg_sample_init); +module_exit(cscfg_sample_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Mike Leach <mike.leach@linaro.org>"); +MODULE_DESCRIPTION("CoreSight Syscfg Example"); diff --git a/samples/damon/Kconfig b/samples/damon/Kconfig new file mode 100644 index 000000000000..cbf96fd8a8bf --- /dev/null +++ b/samples/damon/Kconfig @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "DAMON Samples" + +config SAMPLE_DAMON_WSSE + bool "DAMON sample module for working set size estimation" + depends on DAMON && DAMON_VADDR + help + This builds DAMON sample module for working set size estimation. + + The module receives a pid, monitor access to the virtual address + space of the process, estimate working set size of the process, and + repeatedly prints the size on the kernel log. + + If unsure, say N. + +config SAMPLE_DAMON_PRCL + bool "DAMON sample module for access-aware proactive reclamation" + depends on DAMON && DAMON_VADDR + help + This builds DAMON sample module for access-aware proactive + reclamation. + + The module receives a pid, monitor access to the virtual address + space of the process, find memory regions that not accessed, and + proactively reclaim the regions. + + If unsure, say N. + +config SAMPLE_DAMON_MTIER + bool "DAMON sample module for memory tiering" + depends on DAMON && DAMON_PADDR + help + Thps builds DAMON sample module for memory tierign. + + The module assumes the system is constructed with two NUMA nodes, + which seems as local and remote nodes to all CPUs. For example, + node0 is for DDR5 DRAMs connected via DIMM, while node1 is for DDR4 + DRAMs connected via CXL. + + If unsure, say N. + +endmenu diff --git a/samples/damon/Makefile b/samples/damon/Makefile new file mode 100644 index 000000000000..72f68cbf422a --- /dev/null +++ b/samples/damon/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_SAMPLE_DAMON_WSSE) += wsse.o +obj-$(CONFIG_SAMPLE_DAMON_PRCL) += prcl.o +obj-$(CONFIG_SAMPLE_DAMON_MTIER) += mtier.o diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c new file mode 100644 index 000000000000..775838a23d93 --- /dev/null +++ b/samples/damon/mtier.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * memory tiering: migrate cold pages in node 0 and hot pages in node 1 to node + * 1 and node 0, respectively. Adjust the hotness/coldness threshold aiming + * resulting 99.6 % node 0 utilization ratio. + */ + +#define pr_fmt(fmt) "damon_sample_mtier: " fmt + +#include <linux/damon.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_sample_mtier." + +static unsigned long node0_start_addr __read_mostly; +module_param(node0_start_addr, ulong, 0600); + +static unsigned long node0_end_addr __read_mostly; +module_param(node0_end_addr, ulong, 0600); + +static unsigned long node1_start_addr __read_mostly; +module_param(node1_start_addr, ulong, 0600); + +static unsigned long node1_end_addr __read_mostly; +module_param(node1_end_addr, ulong, 0600); + +static unsigned long node0_mem_used_bp __read_mostly = 9970; +module_param(node0_mem_used_bp, ulong, 0600); + +static unsigned long node0_mem_free_bp __read_mostly = 50; +module_param(node0_mem_free_bp, ulong, 0600); + +static int damon_sample_mtier_enable_store( + const char *val, const struct kernel_param *kp); + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_sample_mtier_enable_store, + .get = param_get_bool, +}; + +static bool enabled __read_mostly; +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, "Enable or disable DAMON_SAMPLE_MTIER"); + +static bool detect_node_addresses __read_mostly; +module_param(detect_node_addresses, bool, 0600); + +static struct damon_ctx *ctxs[2]; + +struct region_range { + phys_addr_t start; + phys_addr_t end; +}; + +static int nid_to_phys(int target_node, struct region_range *range) +{ + if (!node_online(target_node)) { + pr_err("NUMA node %d is not online\n", target_node); + return -EINVAL; + } + + range->start = PFN_PHYS(node_start_pfn(target_node)); + range->end = PFN_PHYS(node_end_pfn(target_node)); + + return 0; +} + +static struct damon_ctx *damon_sample_mtier_build_ctx(bool promote) +{ + struct damon_ctx *ctx; + struct damon_attrs attrs; + struct damon_target *target; + struct damon_region *region; + struct damos *scheme; + struct damos_quota_goal *quota_goal; + struct damos_filter *filter; + struct region_range addr; + int ret; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + attrs = (struct damon_attrs) { + .sample_interval = 5 * USEC_PER_MSEC, + .aggr_interval = 100 * USEC_PER_MSEC, + .ops_update_interval = 60 * USEC_PER_MSEC * MSEC_PER_SEC, + .min_nr_regions = 10, + .max_nr_regions = 1000, + }; + + /* + * auto-tune sampling and aggregation interval aiming 4% DAMON-observed + * accesses ratio, keeping sampling interval in [5ms, 10s] range. + */ + attrs.intervals_goal = (struct damon_intervals_goal) { + .access_bp = 400, .aggrs = 3, + .min_sample_us = 5000, .max_sample_us = 10000000, + }; + if (damon_set_attrs(ctx, &attrs)) + goto free_out; + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + goto free_out; + + target = damon_new_target(); + if (!target) + goto free_out; + damon_add_target(ctx, target); + + if (detect_node_addresses) { + ret = promote ? nid_to_phys(1, &addr) : nid_to_phys(0, &addr); + if (ret) + goto free_out; + } else { + addr.start = promote ? node1_start_addr : node0_start_addr; + addr.end = promote ? node1_end_addr : node0_end_addr; + } + + region = damon_new_region(addr.start, addr.end); + if (!region) + goto free_out; + damon_add_region(region, target); + + scheme = damon_new_scheme( + /* access pattern */ + &(struct damos_access_pattern) { + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + .min_nr_accesses = promote ? 1 : 0, + .max_nr_accesses = promote ? UINT_MAX : 0, + .min_age_region = 0, + .max_age_region = UINT_MAX}, + /* action */ + promote ? DAMOS_MIGRATE_HOT : DAMOS_MIGRATE_COLD, + 1000000, /* apply interval (1s) */ + &(struct damos_quota){ + /* 200 MiB per sec by most */ + .reset_interval = 1000, + .sz = 200 * 1024 * 1024, + /* ignore size of region when prioritizing */ + .weight_sz = 0, + .weight_nr_accesses = 100, + .weight_age = 100, + }, + &(struct damos_watermarks){}, + promote ? 0 : 1); /* migrate target node id */ + if (!scheme) + goto free_out; + damon_set_schemes(ctx, &scheme, 1); + quota_goal = damos_new_quota_goal( + promote ? DAMOS_QUOTA_NODE_MEM_USED_BP : + DAMOS_QUOTA_NODE_MEM_FREE_BP, + promote ? node0_mem_used_bp : node0_mem_free_bp); + if (!quota_goal) + goto free_out; + quota_goal->nid = 0; + damos_add_quota_goal(&scheme->quota, quota_goal); + filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true, promote); + if (!filter) + goto free_out; + damos_add_filter(scheme, filter); + return ctx; +free_out: + damon_destroy_ctx(ctx); + return NULL; +} + +static int damon_sample_mtier_start(void) +{ + struct damon_ctx *ctx; + + ctx = damon_sample_mtier_build_ctx(true); + if (!ctx) + return -ENOMEM; + ctxs[0] = ctx; + ctx = damon_sample_mtier_build_ctx(false); + if (!ctx) { + damon_destroy_ctx(ctxs[0]); + return -ENOMEM; + } + ctxs[1] = ctx; + return damon_start(ctxs, 2, true); +} + +static void damon_sample_mtier_stop(void) +{ + damon_stop(ctxs, 2); + damon_destroy_ctx(ctxs[0]); + damon_destroy_ctx(ctxs[1]); +} + +static int damon_sample_mtier_enable_store( + const char *val, const struct kernel_param *kp) +{ + bool is_enabled = enabled; + int err; + + err = kstrtobool(val, &enabled); + if (err) + return err; + + if (enabled == is_enabled) + return 0; + + if (!damon_initialized()) + return 0; + + if (enabled) { + err = damon_sample_mtier_start(); + if (err) + enabled = false; + return err; + } + damon_sample_mtier_stop(); + return 0; +} + +static int __init damon_sample_mtier_init(void) +{ + int err = 0; + + if (!damon_initialized()) { + if (enabled) + enabled = false; + return -ENOMEM; + } + + if (enabled) { + err = damon_sample_mtier_start(); + if (err) + enabled = false; + } + return 0; +} + +module_init(damon_sample_mtier_init); diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c new file mode 100644 index 000000000000..b7c50f2656ce --- /dev/null +++ b/samples/damon/prcl.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * proactive reclamation: monitor access pattern of a given process, find + * regions that seems not accessed, and proactively page out the regions. + */ + +#define pr_fmt(fmt) "damon_sample_prcl: " fmt + +#include <linux/damon.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_sample_prcl." + +static int target_pid __read_mostly; +module_param(target_pid, int, 0600); + +static int damon_sample_prcl_enable_store( + const char *val, const struct kernel_param *kp); + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_sample_prcl_enable_store, + .get = param_get_bool, +}; + +static bool enabled __read_mostly; +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, "Enable or disable DAMON_SAMPLE_PRCL"); + +static struct damon_ctx *ctx; +static struct pid *target_pidp; + +static int damon_sample_prcl_repeat_call_fn(void *data) +{ + struct damon_ctx *c = data; + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + unsigned long wss = 0; + + damon_for_each_region(r, t) { + if (r->nr_accesses > 0) + wss += r->ar.end - r->ar.start; + } + pr_info("wss: %lu\n", wss); + } + return 0; +} + +static struct damon_call_control repeat_call_control = { + .fn = damon_sample_prcl_repeat_call_fn, + .repeat = true, +}; + +static int damon_sample_prcl_start(void) +{ + struct damon_target *target; + struct damos *scheme; + int err; + + pr_info("start\n"); + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + if (damon_select_ops(ctx, DAMON_OPS_VADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + target_pidp = find_get_pid(target_pid); + if (!target_pidp) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + target->pid = target_pidp; + + scheme = damon_new_scheme( + &(struct damos_access_pattern) { + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + .min_nr_accesses = 0, + .max_nr_accesses = 0, + .min_age_region = 50, + .max_age_region = UINT_MAX}, + DAMOS_PAGEOUT, + 0, + &(struct damos_quota){}, + &(struct damos_watermarks){}, + NUMA_NO_NODE); + if (!scheme) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_set_schemes(ctx, &scheme, 1); + + err = damon_start(&ctx, 1, true); + if (err) + return err; + + repeat_call_control.data = ctx; + return damon_call(ctx, &repeat_call_control); +} + +static void damon_sample_prcl_stop(void) +{ + pr_info("stop\n"); + if (ctx) { + damon_stop(&ctx, 1); + damon_destroy_ctx(ctx); + } +} + +static int damon_sample_prcl_enable_store( + const char *val, const struct kernel_param *kp) +{ + bool is_enabled = enabled; + int err; + + err = kstrtobool(val, &enabled); + if (err) + return err; + + if (enabled == is_enabled) + return 0; + + if (!damon_initialized()) + return 0; + + if (enabled) { + err = damon_sample_prcl_start(); + if (err) + enabled = false; + return err; + } + damon_sample_prcl_stop(); + return 0; +} + +static int __init damon_sample_prcl_init(void) +{ + int err = 0; + + if (!damon_initialized()) { + if (enabled) + enabled = false; + return -ENOMEM; + } + + if (enabled) { + err = damon_sample_prcl_start(); + if (err) + enabled = false; + } + return 0; +} + +module_init(damon_sample_prcl_init); diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c new file mode 100644 index 000000000000..799ad4443943 --- /dev/null +++ b/samples/damon/wsse.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * working set size estimation: monitor access pattern of given process and + * print estimated working set size (total size of regions that showing some + * access). + */ + +#define pr_fmt(fmt) "damon_sample_wsse: " fmt + +#include <linux/damon.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_sample_wsse." + +static int target_pid __read_mostly; +module_param(target_pid, int, 0600); + +static int damon_sample_wsse_enable_store( + const char *val, const struct kernel_param *kp); + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_sample_wsse_enable_store, + .get = param_get_bool, +}; + +static bool enabled __read_mostly; +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, "Enable or disable DAMON_SAMPLE_WSSE"); + +static struct damon_ctx *ctx; +static struct pid *target_pidp; + +static int damon_sample_wsse_repeat_call_fn(void *data) +{ + struct damon_ctx *c = data; + struct damon_target *t; + + damon_for_each_target(t, c) { + struct damon_region *r; + unsigned long wss = 0; + + damon_for_each_region(r, t) { + if (r->nr_accesses > 0) + wss += r->ar.end - r->ar.start; + } + pr_info("wss: %lu\n", wss); + } + return 0; +} + +static struct damon_call_control repeat_call_control = { + .fn = damon_sample_wsse_repeat_call_fn, + .repeat = true, +}; + +static int damon_sample_wsse_start(void) +{ + struct damon_target *target; + int err; + + pr_info("start\n"); + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + if (damon_select_ops(ctx, DAMON_OPS_VADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + target_pidp = find_get_pid(target_pid); + if (!target_pidp) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + target->pid = target_pidp; + + err = damon_start(&ctx, 1, true); + if (err) + return err; + repeat_call_control.data = ctx; + return damon_call(ctx, &repeat_call_control); +} + +static void damon_sample_wsse_stop(void) +{ + pr_info("stop\n"); + if (ctx) { + damon_stop(&ctx, 1); + damon_destroy_ctx(ctx); + } +} + +static int damon_sample_wsse_enable_store( + const char *val, const struct kernel_param *kp) +{ + bool is_enabled = enabled; + int err; + + err = kstrtobool(val, &enabled); + if (err) + return err; + + if (enabled == is_enabled) + return 0; + + if (!damon_initialized()) + return 0; + + if (enabled) { + err = damon_sample_wsse_start(); + if (err) + enabled = false; + return err; + } + damon_sample_wsse_stop(); + return 0; +} + +static int __init damon_sample_wsse_init(void) +{ + int err = 0; + + if (!damon_initialized()) { + err = -ENOMEM; + if (enabled) + enabled = false; + } + + if (enabled) { + err = damon_sample_wsse_start(); + if (err) + enabled = false; + } + return err; +} + +module_init(damon_sample_wsse_init); diff --git a/samples/fanotify/.gitignore b/samples/fanotify/.gitignore new file mode 100644 index 000000000000..d74593e8b2de --- /dev/null +++ b/samples/fanotify/.gitignore @@ -0,0 +1 @@ +fs-monitor diff --git a/samples/fanotify/Makefile b/samples/fanotify/Makefile new file mode 100644 index 000000000000..e20db1bdde3b --- /dev/null +++ b/samples/fanotify/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += fs-monitor + +userccflags += -I usr/include -Wall + diff --git a/samples/fanotify/fs-monitor.c b/samples/fanotify/fs-monitor.c new file mode 100644 index 000000000000..28c0a652ffeb --- /dev/null +++ b/samples/fanotify/fs-monitor.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021, Collabora Ltd. + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <err.h> +#include <stdlib.h> +#include <stdio.h> +#include <fcntl.h> +#include <sys/fanotify.h> +#include <sys/types.h> +#include <unistd.h> +#ifndef __GLIBC__ +#include <asm-generic/int-ll64.h> +#endif + +#ifndef FAN_FS_ERROR +#define FAN_FS_ERROR 0x00008000 +#define FAN_EVENT_INFO_TYPE_ERROR 5 + +struct fanotify_event_info_error { + struct fanotify_event_info_header hdr; + __s32 error; + __u32 error_count; +}; +#endif + +#ifndef FILEID_INO32_GEN +#define FILEID_INO32_GEN 1 +#endif + +#ifndef FILEID_INVALID +#define FILEID_INVALID 0xff +#endif + +static void print_fh(struct file_handle *fh) +{ + int i; + uint32_t *h = (uint32_t *) fh->f_handle; + + printf("\tfh: "); + for (i = 0; i < fh->handle_bytes; i++) + printf("%hhx", fh->f_handle[i]); + printf("\n"); + + printf("\tdecoded fh: "); + if (fh->handle_type == FILEID_INO32_GEN) + printf("inode=%u gen=%u\n", h[0], h[1]); + else if (fh->handle_type == FILEID_INVALID && !fh->handle_bytes) + printf("Type %d (Superblock error)\n", fh->handle_type); + else + printf("Type %d (Unknown)\n", fh->handle_type); + +} + +static void handle_notifications(char *buffer, int len) +{ + struct fanotify_event_metadata *event = + (struct fanotify_event_metadata *) buffer; + struct fanotify_event_info_header *info; + struct fanotify_event_info_error *err; + struct fanotify_event_info_fid *fid; + int off; + + for (; FAN_EVENT_OK(event, len); event = FAN_EVENT_NEXT(event, len)) { + + if (event->mask != FAN_FS_ERROR) { + printf("unexpected FAN MARK: %llx\n", + (unsigned long long)event->mask); + goto next_event; + } + + if (event->fd != FAN_NOFD) { + printf("Unexpected fd (!= FAN_NOFD)\n"); + goto next_event; + } + + printf("FAN_FS_ERROR (len=%d)\n", event->event_len); + + for (off = sizeof(*event) ; off < event->event_len; + off += info->len) { + info = (struct fanotify_event_info_header *) + ((char *) event + off); + + switch (info->info_type) { + case FAN_EVENT_INFO_TYPE_ERROR: + err = (struct fanotify_event_info_error *) info; + + printf("\tGeneric Error Record: len=%d\n", + err->hdr.len); + printf("\terror: %d\n", err->error); + printf("\terror_count: %d\n", err->error_count); + break; + + case FAN_EVENT_INFO_TYPE_FID: + fid = (struct fanotify_event_info_fid *) info; + + printf("\tfsid: %x%x\n", +#if defined(__GLIBC__) + fid->fsid.val[0], fid->fsid.val[1]); +#else + fid->fsid.__val[0], fid->fsid.__val[1]); +#endif + print_fh((struct file_handle *) &fid->handle); + break; + + default: + printf("\tUnknown info type=%d len=%d:\n", + info->info_type, info->len); + } + } +next_event: + printf("---\n\n"); + } +} + +int main(int argc, char **argv) +{ + int fd; + + char buffer[BUFSIZ]; + + if (argc < 2) { + printf("Missing path argument\n"); + return 1; + } + + fd = fanotify_init(FAN_CLASS_NOTIF|FAN_REPORT_FID, O_RDONLY); + if (fd < 0) + errx(1, "fanotify_init"); + + if (fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, + FAN_FS_ERROR, AT_FDCWD, argv[1])) { + errx(1, "fanotify_mark"); + } + + while (1) { + int n = read(fd, buffer, BUFSIZ); + + if (n < 0) + errx(1, "read"); + + handle_notifications(buffer, n); + } + + return 0; +} diff --git a/samples/fprobe/Makefile b/samples/fprobe/Makefile new file mode 100644 index 000000000000..ecccbfa6e99b --- /dev/null +++ b/samples/fprobe/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_SAMPLE_FPROBE) += fprobe_example.o diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c new file mode 100644 index 000000000000..bfe98ce826f3 --- /dev/null +++ b/samples/fprobe/fprobe_example.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Here's a sample kernel module showing the use of fprobe to dump a + * stack trace and selected registers when kernel_clone() is called. + * + * For more information on theory of operation of kprobes, see + * Documentation/trace/kprobes.rst + * + * You will see the trace data in /var/log/messages and on the console + * whenever kernel_clone() is invoked to create a new process. + */ + +#define pr_fmt(fmt) "%s: " fmt, __func__ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/fprobe.h> +#include <linux/sched/debug.h> +#include <linux/slab.h> + +#define BACKTRACE_DEPTH 16 +#define MAX_SYMBOL_LEN 4096 +static struct fprobe sample_probe; +static unsigned long nhit; + +static char symbol[MAX_SYMBOL_LEN] = "kernel_clone"; +module_param_string(symbol, symbol, sizeof(symbol), 0644); +MODULE_PARM_DESC(symbol, "Probed symbol(s), given by comma separated symbols or a wildcard pattern."); + +static char nosymbol[MAX_SYMBOL_LEN] = ""; +module_param_string(nosymbol, nosymbol, sizeof(nosymbol), 0644); +MODULE_PARM_DESC(nosymbol, "Not-probed symbols, given by a wildcard pattern."); + +static bool stackdump = true; +module_param(stackdump, bool, 0644); +MODULE_PARM_DESC(stackdump, "Enable stackdump."); + +static bool use_trace = false; +module_param(use_trace, bool, 0644); +MODULE_PARM_DESC(use_trace, "Use trace_printk instead of printk. This is only for debugging."); + +static void show_backtrace(void) +{ + unsigned long stacks[BACKTRACE_DEPTH]; + unsigned int len; + + len = stack_trace_save(stacks, BACKTRACE_DEPTH, 2); + stack_trace_print(stacks, len, 24); +} + +static int sample_entry_handler(struct fprobe *fp, unsigned long ip, + unsigned long ret_ip, + struct ftrace_regs *fregs, void *data) +{ + if (use_trace) + /* + * This is just an example, no kernel code should call + * trace_printk() except when actively debugging. + */ + trace_printk("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + else + pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip); + nhit++; + if (stackdump) + show_backtrace(); + return 0; +} + +static void sample_exit_handler(struct fprobe *fp, unsigned long ip, + unsigned long ret_ip, struct ftrace_regs *regs, + void *data) +{ + unsigned long rip = ret_ip; + + if (use_trace) + /* + * This is just an example, no kernel code should call + * trace_printk() except when actively debugging. + */ + trace_printk("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", + (void *)ip, (void *)ip, (void *)rip, (void *)rip); + else + pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n", + (void *)ip, (void *)ip, (void *)rip, (void *)rip); + nhit++; + if (stackdump) + show_backtrace(); +} + +static int __init fprobe_init(void) +{ + char *p, *symbuf = NULL; + const char **syms; + int ret, count, i; + + sample_probe.entry_handler = sample_entry_handler; + sample_probe.exit_handler = sample_exit_handler; + + if (strchr(symbol, '*')) { + /* filter based fprobe */ + ret = register_fprobe(&sample_probe, symbol, + nosymbol[0] == '\0' ? NULL : nosymbol); + goto out; + } else if (!strchr(symbol, ',')) { + symbuf = symbol; + ret = register_fprobe_syms(&sample_probe, (const char **)&symbuf, 1); + goto out; + } + + /* Comma separated symbols */ + symbuf = kstrdup(symbol, GFP_KERNEL); + if (!symbuf) + return -ENOMEM; + p = symbuf; + count = 1; + while ((p = strchr(++p, ',')) != NULL) + count++; + + pr_info("%d symbols found\n", count); + + syms = kcalloc(count, sizeof(char *), GFP_KERNEL); + if (!syms) { + kfree(symbuf); + return -ENOMEM; + } + + p = symbuf; + for (i = 0; i < count; i++) + syms[i] = strsep(&p, ","); + + ret = register_fprobe_syms(&sample_probe, syms, count); + kfree(syms); + kfree(symbuf); +out: + if (ret < 0) + pr_err("register_fprobe failed, returned %d\n", ret); + else + pr_info("Planted fprobe at %s\n", symbol); + + return ret; +} + +static void __exit fprobe_exit(void) +{ + unregister_fprobe(&sample_probe); + + pr_info("fprobe at %s unregistered. %ld times hit, %ld times missed\n", + symbol, nhit, sample_probe.nmissed); +} + +module_init(fprobe_init) +module_exit(fprobe_exit) +MODULE_DESCRIPTION("sample kernel module showing the use of fprobe"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile new file mode 100644 index 000000000000..589baf2ec4e3 --- /dev/null +++ b/samples/ftrace/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi-modify.o +obj-$(CONFIG_SAMPLE_FTRACE_OPS) += ftrace-ops.o + +CFLAGS_sample-trace-array.o := -I$(src) +obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c new file mode 100644 index 000000000000..da3a9f2091f5 --- /dev/null +++ b/samples/ftrace/ftrace-direct-modify.c @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/ftrace.h> +#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32) +#include <asm/asm-offsets.h> +#endif + +extern void my_direct_func1(void); +extern void my_direct_func2(void); + +void my_direct_func1(void) +{ + trace_printk("my direct func1\n"); +} + +void my_direct_func2(void) +{ + trace_printk("my direct func2\n"); +} + +extern void my_tramp1(void *); +extern void my_tramp2(void *); + +static unsigned long my_ip = (unsigned long)schedule; + +#ifdef CONFIG_RISCV +#include <asm/asm.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi sp,sp,-2*"SZREG"\n" +" "REG_S" t0,0*"SZREG"(sp)\n" +" "REG_S" ra,1*"SZREG"(sp)\n" +" call my_direct_func1\n" +" "REG_L" t0,0*"SZREG"(sp)\n" +" "REG_L" ra,1*"SZREG"(sp)\n" +" addi sp,sp,2*"SZREG"\n" +" jr t0\n" +" .size my_tramp1, .-my_tramp1\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" + +" my_tramp2:\n" +" addi sp,sp,-2*"SZREG"\n" +" "REG_S" t0,0*"SZREG"(sp)\n" +" "REG_S" ra,1*"SZREG"(sp)\n" +" call my_direct_func2\n" +" "REG_L" t0,0*"SZREG"(sp)\n" +" "REG_L" ra,1*"SZREG"(sp)\n" +" addi sp,sp,2*"SZREG"\n" +" jr t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> +#include <asm/nospec-branch.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT +" call my_direct_func1\n" +" leave\n" + ASM_RET +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT +" call my_direct_func2\n" +" leave\n" + ASM_RET +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func1\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp1, .-my_tramp1\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func2\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + +#ifdef CONFIG_ARM64 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" hint 34\n" // bti c +" sub sp, sp, #16\n" +" stp x9, x30, [sp]\n" +" bl my_direct_func1\n" +" ldp x30, x9, [sp]\n" +" add sp, sp, #16\n" +" ret x9\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" hint 34\n" // bti c +" sub sp, sp, #16\n" +" stp x9, x30, [sp]\n" +" bl my_direct_func2\n" +" ldp x30, x9, [sp]\n" +" add sp, sp, #16\n" +" ret x9\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_ARM64 */ + +#ifdef CONFIG_LOONGARCH + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi.d $sp, $sp, -16\n" +" st.d $t0, $sp, 0\n" +" st.d $ra, $sp, 8\n" +" bl my_direct_func1\n" +" ld.d $t0, $sp, 0\n" +" ld.d $ra, $sp, 8\n" +" addi.d $sp, $sp, 16\n" +" jr $t0\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" +" addi.d $sp, $sp, -16\n" +" st.d $t0, $sp, 0\n" +" st.d $ra, $sp, 8\n" +" bl my_direct_func2\n" +" ld.d $t0, $sp, 0\n" +" ld.d $ra, $sp, 8\n" +" addi.d $sp, $sp, 16\n" +" jr $t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + +#ifdef CONFIG_PPC +#include <asm/ppc_asm.h> + +#ifdef CONFIG_PPC64 +#define STACK_FRAME_SIZE 48 +#else +#define STACK_FRAME_SIZE 24 +#endif + +#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL) +#define PPC64_TOC_SAVE_AND_UPDATE \ +" std 2, 24(1)\n" \ +" bcl 20, 31, 1f\n" \ +" 1: mflr 12\n" \ +" ld 2, (99f - 1b)(12)\n" +#define PPC64_TOC_RESTORE \ +" ld 2, 24(1)\n" +#define PPC64_TOC \ +" 99: .quad .TOC.@tocbase\n" +#else +#define PPC64_TOC_SAVE_AND_UPDATE "" +#define PPC64_TOC_RESTORE "" +#define PPC64_TOC "" +#endif + +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtlr 0\n" +#define PPC_FTRACE_RET \ +" blr\n" +#else +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtctr 0\n" +#define PPC_FTRACE_RET \ +" mtlr 0\n" \ +" bctr\n" +#endif + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr 0\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n" + PPC64_TOC_SAVE_AND_UPDATE +" bl my_direct_func1\n" + PPC64_TOC_RESTORE +" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n" + PPC_FTRACE_RESTORE_LR +" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n" + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_FTRACE_RET +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr 0\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n" + PPC64_TOC_SAVE_AND_UPDATE +" bl my_direct_func2\n" + PPC64_TOC_RESTORE +" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n" + PPC_FTRACE_RESTORE_LR +" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n" + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_FTRACE_RET + PPC64_TOC +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_PPC */ + +static struct ftrace_ops direct; + +static unsigned long my_tramp = (unsigned long)my_tramp1; +static unsigned long tramps[2] = { + (unsigned long)my_tramp1, + (unsigned long)my_tramp2, +}; + +static int simple_thread(void *arg) +{ + static int t; + int ret = 0; + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2 * HZ); + + if (ret) + continue; + t ^= 1; + ret = modify_ftrace_direct(&direct, tramps[t]); + if (!ret) + my_tramp = tramps[t]; + WARN_ON_ONCE(ret); + } + + return 0; +} + +static struct task_struct *simple_tsk; + +static int __init ftrace_direct_init(void) +{ + int ret; + + ftrace_set_filter_ip(&direct, (unsigned long) my_ip, 0, 0); + ret = register_ftrace_direct(&direct, my_tramp); + + if (!ret) + simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn"); + return ret; +} + +static void __exit ftrace_direct_exit(void) +{ + kthread_stop(simple_tsk); + unregister_ftrace_direct(&direct, my_tramp, true); +} + +module_init(ftrace_direct_init); +module_exit(ftrace_direct_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c new file mode 100644 index 000000000000..8f7986d698d8 --- /dev/null +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/ftrace.h> +#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32) +#include <asm/asm-offsets.h> +#endif + +extern void my_direct_func1(unsigned long ip); +extern void my_direct_func2(unsigned long ip); + +void my_direct_func1(unsigned long ip) +{ + trace_printk("my direct func1 ip %lx\n", ip); +} + +void my_direct_func2(unsigned long ip) +{ + trace_printk("my direct func2 ip %lx\n", ip); +} + +extern void my_tramp1(void *); +extern void my_tramp2(void *); + +#ifdef CONFIG_RISCV +#include <asm/asm.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" t0,1*"SZREG"(sp)\n" +" "REG_S" ra,2*"SZREG"(sp)\n" +" mv a0,t0\n" +" call my_direct_func1\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" t0,1*"SZREG"(sp)\n" +" "REG_L" ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" t0,1*"SZREG"(sp)\n" +" "REG_S" ra,2*"SZREG"(sp)\n" +" mv a0,t0\n" +" call my_direct_func2\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" t0,1*"SZREG"(sp)\n" +" "REG_L" ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> +#include <asm/nospec-branch.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT +" pushq %rdi\n" +" movq 8(%rbp), %rdi\n" +" call my_direct_func1\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT +" pushq %rdi\n" +" movq 8(%rbp), %rdi\n" +" call my_direct_func2\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" lgr %r2,%r0\n" +" brasl %r14,my_direct_func1\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp1, .-my_tramp1\n" +"\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" lgr %r2,%r0\n" +" brasl %r14,my_direct_func2\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + +#ifdef CONFIG_ARM64 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" hint 34\n" // bti c +" sub sp, sp, #32\n" +" stp x9, x30, [sp]\n" +" str x0, [sp, #16]\n" +" mov x0, x30\n" +" bl my_direct_func1\n" +" ldp x30, x9, [sp]\n" +" ldr x0, [sp, #16]\n" +" add sp, sp, #32\n" +" ret x9\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" hint 34\n" // bti c +" sub sp, sp, #32\n" +" stp x9, x30, [sp]\n" +" str x0, [sp, #16]\n" +" mov x0, x30\n" +" bl my_direct_func2\n" +" ldp x30, x9, [sp]\n" +" ldr x0, [sp, #16]\n" +" add sp, sp, #32\n" +" ret x9\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_ARM64 */ + +#ifdef CONFIG_LOONGARCH +#include <asm/asm.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi.d $sp, $sp, -32\n" +" st.d $a0, $sp, 0\n" +" st.d $t0, $sp, 8\n" +" st.d $ra, $sp, 16\n" +" move $a0, $t0\n" +" bl my_direct_func1\n" +" ld.d $a0, $sp, 0\n" +" ld.d $t0, $sp, 8\n" +" ld.d $ra, $sp, 16\n" +" addi.d $sp, $sp, 32\n" +" jr $t0\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" +" addi.d $sp, $sp, -32\n" +" st.d $a0, $sp, 0\n" +" st.d $t0, $sp, 8\n" +" st.d $ra, $sp, 16\n" +" move $a0, $t0\n" +" bl my_direct_func2\n" +" ld.d $a0, $sp, 0\n" +" ld.d $t0, $sp, 8\n" +" ld.d $ra, $sp, 16\n" +" addi.d $sp, $sp, 32\n" +" jr $t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + +#ifdef CONFIG_PPC +#include <asm/ppc_asm.h> + +#ifdef CONFIG_PPC64 +#define STACK_FRAME_SIZE 48 +#else +#define STACK_FRAME_SIZE 24 +#endif + +#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL) +#define PPC64_TOC_SAVE_AND_UPDATE \ +" std 2, 24(1)\n" \ +" bcl 20, 31, 1f\n" \ +" 1: mflr 12\n" \ +" ld 2, (99f - 1b)(12)\n" +#define PPC64_TOC_RESTORE \ +" ld 2, 24(1)\n" +#define PPC64_TOC \ +" 99: .quad .TOC.@tocbase\n" +#else +#define PPC64_TOC_SAVE_AND_UPDATE "" +#define PPC64_TOC_RESTORE "" +#define PPC64_TOC "" +#endif + +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtlr 0\n" +#define PPC_FTRACE_RET \ +" blr\n" +#define PPC_FTRACE_RECOVER_IP \ +" lwz 8, 4(3)\n" \ +" li 9, 6\n" \ +" slw 8, 8, 9\n" \ +" sraw 8, 8, 9\n" \ +" add 3, 3, 8\n" \ +" addi 3, 3, 4\n" +#else +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtctr 0\n" +#define PPC_FTRACE_RET \ +" mtlr 0\n" \ +" bctr\n" +#define PPC_FTRACE_RECOVER_IP "" +#endif + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr 0\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n" + PPC64_TOC_SAVE_AND_UPDATE + PPC_STL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mr 3, 0\n" + PPC_FTRACE_RECOVER_IP +" bl my_direct_func1\n" + PPC_LL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" + PPC64_TOC_RESTORE +" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n" + PPC_FTRACE_RESTORE_LR +" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n" + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_FTRACE_RET +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr 0\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n" + PPC64_TOC_SAVE_AND_UPDATE + PPC_STL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mr 3, 0\n" + PPC_FTRACE_RECOVER_IP +" bl my_direct_func2\n" + PPC_LL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" + PPC64_TOC_RESTORE +" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n" + PPC_FTRACE_RESTORE_LR +" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n" + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_FTRACE_RET + PPC64_TOC + " .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_PPC */ + +static unsigned long my_tramp = (unsigned long)my_tramp1; +static unsigned long tramps[2] = { + (unsigned long)my_tramp1, + (unsigned long)my_tramp2, +}; + +static struct ftrace_ops direct; + +static int simple_thread(void *arg) +{ + static int t; + int ret = 0; + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2 * HZ); + + if (ret) + continue; + t ^= 1; + ret = modify_ftrace_direct(&direct, tramps[t]); + if (!ret) + my_tramp = tramps[t]; + WARN_ON_ONCE(ret); + } + + return 0; +} + +static struct task_struct *simple_tsk; + +static int __init ftrace_direct_multi_init(void) +{ + int ret; + + ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0); + ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0); + + ret = register_ftrace_direct(&direct, my_tramp); + + if (!ret) + simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn"); + return ret; +} + +static void __exit ftrace_direct_multi_exit(void) +{ + kthread_stop(simple_tsk); + unregister_ftrace_direct(&direct, my_tramp, true); +} + +module_init(ftrace_direct_multi_init); +module_exit(ftrace_direct_multi_exit); + +MODULE_AUTHOR("Jiri Olsa"); +MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c new file mode 100644 index 000000000000..db326c81a27d --- /dev/null +++ b/samples/ftrace/ftrace-direct-multi.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> + +#include <linux/mm.h> /* for handle_mm_fault() */ +#include <linux/ftrace.h> +#include <linux/sched/stat.h> +#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32) +#include <asm/asm-offsets.h> +#endif + +extern void my_direct_func(unsigned long ip); + +void my_direct_func(unsigned long ip) +{ + trace_printk("ip %lx\n", ip); +} + +extern void my_tramp(void *); + +#ifdef CONFIG_RISCV +#include <asm/asm.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" t0,1*"SZREG"(sp)\n" +" "REG_S" ra,2*"SZREG"(sp)\n" +" mv a0,t0\n" +" call my_direct_func\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" t0,1*"SZREG"(sp)\n" +" "REG_L" ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> +#include <asm/nospec-branch.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT +" pushq %rdi\n" +" movq 8(%rbp), %rdi\n" +" call my_direct_func\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" lgr %r2,%r0\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + +#ifdef CONFIG_ARM64 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" hint 34\n" // bti c +" sub sp, sp, #32\n" +" stp x9, x30, [sp]\n" +" str x0, [sp, #16]\n" +" mov x0, x30\n" +" bl my_direct_func\n" +" ldp x30, x9, [sp]\n" +" ldr x0, [sp, #16]\n" +" add sp, sp, #32\n" +" ret x9\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_ARM64 */ + +#ifdef CONFIG_LOONGARCH + +#include <asm/asm.h> +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi.d $sp, $sp, -32\n" +" st.d $a0, $sp, 0\n" +" st.d $t0, $sp, 8\n" +" st.d $ra, $sp, 16\n" +" move $a0, $t0\n" +" bl my_direct_func\n" +" ld.d $a0, $sp, 0\n" +" ld.d $t0, $sp, 8\n" +" ld.d $ra, $sp, 16\n" +" addi.d $sp, $sp, 32\n" +" jr $t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + +#ifdef CONFIG_PPC +#include <asm/ppc_asm.h> + +#ifdef CONFIG_PPC64 +#define STACK_FRAME_SIZE 48 +#else +#define STACK_FRAME_SIZE 24 +#endif + +#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL) +#define PPC64_TOC_SAVE_AND_UPDATE \ +" std 2, 24(1)\n" \ +" bcl 20, 31, 1f\n" \ +" 1: mflr 12\n" \ +" ld 2, (99f - 1b)(12)\n" +#define PPC64_TOC_RESTORE \ +" ld 2, 24(1)\n" +#define PPC64_TOC \ +" 99: .quad .TOC.@tocbase\n" +#else +#define PPC64_TOC_SAVE_AND_UPDATE "" +#define PPC64_TOC_RESTORE "" +#define PPC64_TOC "" +#endif + +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtlr 0\n" +#define PPC_FTRACE_RET \ +" blr\n" +#define PPC_FTRACE_RECOVER_IP \ +" lwz 8, 4(3)\n" \ +" li 9, 6\n" \ +" slw 8, 8, 9\n" \ +" sraw 8, 8, 9\n" \ +" add 3, 3, 8\n" \ +" addi 3, 3, 4\n" +#else +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtctr 0\n" +#define PPC_FTRACE_RET \ +" mtlr 0\n" \ +" bctr\n" +#define PPC_FTRACE_RECOVER_IP "" +#endif + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr 0\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n" + PPC64_TOC_SAVE_AND_UPDATE + PPC_STL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mr 3, 0\n" + PPC_FTRACE_RECOVER_IP +" bl my_direct_func\n" + PPC_LL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" + PPC64_TOC_RESTORE +" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n" + PPC_FTRACE_RESTORE_LR +" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n" + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_FTRACE_RET + PPC64_TOC +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_PPC */ + +static struct ftrace_ops direct; + +static int __init ftrace_direct_multi_init(void) +{ + ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0); + ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0); + + return register_ftrace_direct(&direct, (unsigned long) my_tramp); +} + +static void __exit ftrace_direct_multi_exit(void) +{ + unregister_ftrace_direct(&direct, (unsigned long) my_tramp, true); +} + +module_init(ftrace_direct_multi_init); +module_exit(ftrace_direct_multi_exit); + +MODULE_AUTHOR("Jiri Olsa"); +MODULE_DESCRIPTION("Example use case of using register_ftrace_direct_multi()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c new file mode 100644 index 000000000000..3d0fa260332d --- /dev/null +++ b/samples/ftrace/ftrace-direct-too.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> + +#include <linux/mm.h> /* for handle_mm_fault() */ +#include <linux/ftrace.h> +#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32) +#include <asm/asm-offsets.h> +#endif + +extern void my_direct_func(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs); + +void my_direct_func(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs) +{ + trace_printk("handle mm fault vma=%p address=%lx flags=%x regs=%p\n", + vma, address, flags, regs); +} + +extern void my_tramp(void *); + +#ifdef CONFIG_RISCV +#include <asm/asm.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi sp,sp,-5*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" a1,1*"SZREG"(sp)\n" +" "REG_S" a2,2*"SZREG"(sp)\n" +" "REG_S" t0,3*"SZREG"(sp)\n" +" "REG_S" ra,4*"SZREG"(sp)\n" +" call my_direct_func\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" a1,1*"SZREG"(sp)\n" +" "REG_L" a2,2*"SZREG"(sp)\n" +" "REG_L" t0,3*"SZREG"(sp)\n" +" "REG_L" ra,4*"SZREG"(sp)\n" +" addi sp,sp,5*"SZREG"\n" +" jr t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> +#include <asm/nospec-branch.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT +" pushq %rdi\n" +" pushq %rsi\n" +" pushq %rdx\n" +" pushq %rcx\n" +" call my_direct_func\n" +" popq %rcx\n" +" popq %rdx\n" +" popq %rsi\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + +#ifdef CONFIG_ARM64 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" hint 34\n" // bti c +" sub sp, sp, #48\n" +" stp x9, x30, [sp]\n" +" stp x0, x1, [sp, #16]\n" +" stp x2, x3, [sp, #32]\n" +" bl my_direct_func\n" +" ldp x30, x9, [sp]\n" +" ldp x0, x1, [sp, #16]\n" +" ldp x2, x3, [sp, #32]\n" +" add sp, sp, #48\n" +" ret x9\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_ARM64 */ + +#ifdef CONFIG_LOONGARCH + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi.d $sp, $sp, -48\n" +" st.d $a0, $sp, 0\n" +" st.d $a1, $sp, 8\n" +" st.d $a2, $sp, 16\n" +" st.d $t0, $sp, 24\n" +" st.d $ra, $sp, 32\n" +" bl my_direct_func\n" +" ld.d $a0, $sp, 0\n" +" ld.d $a1, $sp, 8\n" +" ld.d $a2, $sp, 16\n" +" ld.d $t0, $sp, 24\n" +" ld.d $ra, $sp, 32\n" +" addi.d $sp, $sp, 48\n" +" jr $t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + +#ifdef CONFIG_PPC +#include <asm/ppc_asm.h> + +#ifdef CONFIG_PPC64 +#define STACK_FRAME_SIZE 64 +#define STACK_FRAME_ARG1 32 +#define STACK_FRAME_ARG2 40 +#define STACK_FRAME_ARG3 48 +#define STACK_FRAME_ARG4 56 +#else +#define STACK_FRAME_SIZE 32 +#define STACK_FRAME_ARG1 16 +#define STACK_FRAME_ARG2 20 +#define STACK_FRAME_ARG3 24 +#define STACK_FRAME_ARG4 28 +#endif + +#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL) +#define PPC64_TOC_SAVE_AND_UPDATE \ +" std 2, 24(1)\n" \ +" bcl 20, 31, 1f\n" \ +" 1: mflr 12\n" \ +" ld 2, (99f - 1b)(12)\n" +#define PPC64_TOC_RESTORE \ +" ld 2, 24(1)\n" +#define PPC64_TOC \ +" 99: .quad .TOC.@tocbase\n" +#else +#define PPC64_TOC_SAVE_AND_UPDATE "" +#define PPC64_TOC_RESTORE "" +#define PPC64_TOC "" +#endif + +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtlr 0\n" +#define PPC_FTRACE_RET \ +" blr\n" +#else +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtctr 0\n" +#define PPC_FTRACE_RET \ +" mtlr 0\n" \ +" bctr\n" +#endif + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr 0\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n" + PPC64_TOC_SAVE_AND_UPDATE + PPC_STL" 3, "__stringify(STACK_FRAME_ARG1)"(1)\n" + PPC_STL" 4, "__stringify(STACK_FRAME_ARG2)"(1)\n" + PPC_STL" 5, "__stringify(STACK_FRAME_ARG3)"(1)\n" + PPC_STL" 6, "__stringify(STACK_FRAME_ARG4)"(1)\n" +" bl my_direct_func\n" + PPC_LL" 6, "__stringify(STACK_FRAME_ARG4)"(1)\n" + PPC_LL" 5, "__stringify(STACK_FRAME_ARG3)"(1)\n" + PPC_LL" 4, "__stringify(STACK_FRAME_ARG2)"(1)\n" + PPC_LL" 3, "__stringify(STACK_FRAME_ARG1)"(1)\n" + PPC64_TOC_RESTORE +" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n" + PPC_FTRACE_RESTORE_LR +" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n" + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_FTRACE_RET + PPC64_TOC +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_PPC */ + +static struct ftrace_ops direct; + +static int __init ftrace_direct_init(void) +{ + ftrace_set_filter_ip(&direct, (unsigned long) handle_mm_fault, 0, 0); + + return register_ftrace_direct(&direct, (unsigned long) my_tramp); +} + +static void __exit ftrace_direct_exit(void) +{ + unregister_ftrace_direct(&direct, (unsigned long)my_tramp, true); +} + +module_init(ftrace_direct_init); +module_exit(ftrace_direct_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Another example use case of using register_ftrace_direct()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c new file mode 100644 index 000000000000..956834b0d19a --- /dev/null +++ b/samples/ftrace/ftrace-direct.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> + +#include <linux/sched.h> /* for wake_up_process() */ +#include <linux/ftrace.h> +#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32) +#include <asm/asm-offsets.h> +#endif + +extern void my_direct_func(struct task_struct *p); + +void my_direct_func(struct task_struct *p) +{ + trace_printk("waking up %s-%d\n", p->comm, p->pid); +} + +extern void my_tramp(void *); + +#ifdef CONFIG_RISCV +#include <asm/asm.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" t0,1*"SZREG"(sp)\n" +" "REG_S" ra,2*"SZREG"(sp)\n" +" call my_direct_func\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" t0,1*"SZREG"(sp)\n" +" "REG_L" ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + +#ifdef CONFIG_X86_64 + +#include <asm/ibt.h> +#include <asm/nospec-branch.h> + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" + ASM_ENDBR +" pushq %rbp\n" +" movq %rsp, %rbp\n" + CALL_DEPTH_ACCOUNT +" pushq %rdi\n" +" call my_direct_func\n" +" popq %rdi\n" +" leave\n" + ASM_RET +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + +#ifdef CONFIG_ARM64 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" hint 34\n" // bti c +" sub sp, sp, #32\n" +" stp x9, x30, [sp]\n" +" str x0, [sp, #16]\n" +" bl my_direct_func\n" +" ldp x30, x9, [sp]\n" +" ldr x0, [sp, #16]\n" +" add sp, sp, #32\n" +" ret x9\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_ARM64 */ + +#ifdef CONFIG_LOONGARCH + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi.d $sp, $sp, -32\n" +" st.d $a0, $sp, 0\n" +" st.d $t0, $sp, 8\n" +" st.d $ra, $sp, 16\n" +" bl my_direct_func\n" +" ld.d $a0, $sp, 0\n" +" ld.d $t0, $sp, 8\n" +" ld.d $ra, $sp, 16\n" +" addi.d $sp, $sp, 32\n" +" jr $t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_LOONGARCH */ + +#ifdef CONFIG_PPC +#include <asm/ppc_asm.h> + +#ifdef CONFIG_PPC64 +#define STACK_FRAME_SIZE 48 +#else +#define STACK_FRAME_SIZE 24 +#endif + +#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL) +#define PPC64_TOC_SAVE_AND_UPDATE \ +" std 2, 24(1)\n" \ +" bcl 20, 31, 1f\n" \ +" 1: mflr 12\n" \ +" ld 2, (99f - 1b)(12)\n" +#define PPC64_TOC_RESTORE \ +" ld 2, 24(1)\n" +#define PPC64_TOC \ +" 99: .quad .TOC.@tocbase\n" +#else +#define PPC64_TOC_SAVE_AND_UPDATE "" +#define PPC64_TOC_RESTORE "" +#define PPC64_TOC "" +#endif + +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtlr 0\n" +#define PPC_FTRACE_RET \ +" blr\n" +#else +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtctr 0\n" +#define PPC_FTRACE_RET \ +" mtlr 0\n" \ +" bctr\n" +#endif + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr 0\n" + PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n" + PPC64_TOC_SAVE_AND_UPDATE + PPC_STL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" bl my_direct_func\n" + PPC_LL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" + PPC64_TOC_RESTORE +" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n" + PPC_FTRACE_RESTORE_LR +" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n" + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_FTRACE_RET + PPC64_TOC +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_PPC */ + +static struct ftrace_ops direct; + +static int __init ftrace_direct_init(void) +{ + ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0); + + return register_ftrace_direct(&direct, (unsigned long) my_tramp); +} + +static void __exit ftrace_direct_exit(void) +{ + unregister_ftrace_direct(&direct, (unsigned long)my_tramp, true); +} + +module_init(ftrace_direct_init); +module_exit(ftrace_direct_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Example use case of using register_ftrace_direct()"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/ftrace-ops.c b/samples/ftrace/ftrace-ops.c new file mode 100644 index 000000000000..68d6685c80bd --- /dev/null +++ b/samples/ftrace/ftrace-ops.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/ftrace.h> +#include <linux/ktime.h> +#include <linux/module.h> + +#include <asm/barrier.h> + +/* + * Arbitrary large value chosen to be sufficiently large to minimize noise but + * sufficiently small to complete quickly. + */ +static unsigned int nr_function_calls = 100000; +module_param(nr_function_calls, uint, 0); +MODULE_PARM_DESC(nr_function_calls, "How many times to call the relevant tracee"); + +/* + * The number of ops associated with a call site affects whether a tracer can + * be called directly or whether it's necessary to go via the list func, which + * can be significantly more expensive. + */ +static unsigned int nr_ops_relevant = 1; +module_param(nr_ops_relevant, uint, 0); +MODULE_PARM_DESC(nr_ops_relevant, "How many ftrace_ops to associate with the relevant tracee"); + +/* + * On architectures where all call sites share the same trampoline, having + * tracers enabled for distinct functions can force the use of the list func + * and incur overhead for all call sites. + */ +static unsigned int nr_ops_irrelevant; +module_param(nr_ops_irrelevant, uint, 0); +MODULE_PARM_DESC(nr_ops_irrelevant, "How many ftrace_ops to associate with the irrelevant tracee"); + +/* + * On architectures with DYNAMIC_FTRACE_WITH_REGS, saving the full pt_regs can + * be more expensive than only saving the minimal necessary regs. + */ +static bool save_regs; +module_param(save_regs, bool, 0); +MODULE_PARM_DESC(save_regs, "Register ops with FTRACE_OPS_FL_SAVE_REGS (save all registers in the trampoline)"); + +static bool assist_recursion; +module_param(assist_recursion, bool, 0); +MODULE_PARM_DESC(assist_reursion, "Register ops with FTRACE_OPS_FL_RECURSION"); + +static bool assist_rcu; +module_param(assist_rcu, bool, 0); +MODULE_PARM_DESC(assist_reursion, "Register ops with FTRACE_OPS_FL_RCU"); + +/* + * By default, a trivial tracer is used which immediately returns to mimimize + * overhead. Sometimes a consistency check using a more expensive tracer is + * desireable. + */ +static bool check_count; +module_param(check_count, bool, 0); +MODULE_PARM_DESC(check_count, "Check that tracers are called the expected number of times\n"); + +/* + * Usually it's not interesting to leave the ops registered after the test + * runs, but sometimes it can be useful to leave them registered so that they + * can be inspected through the tracefs 'enabled_functions' file. + */ +static bool persist; +module_param(persist, bool, 0); +MODULE_PARM_DESC(persist, "Successfully load module and leave ftrace ops registered after test completes\n"); + +/* + * Marked as noinline to ensure that an out-of-line traceable copy is + * generated by the compiler. + * + * The barrier() ensures the compiler won't elide calls by determining there + * are no side-effects. + */ +static noinline void tracee_relevant(void) +{ + barrier(); +} + +/* + * Marked as noinline to ensure that an out-of-line traceable copy is + * generated by the compiler. + * + * The barrier() ensures the compiler won't elide calls by determining there + * are no side-effects. + */ +static noinline void tracee_irrelevant(void) +{ + barrier(); +} + +struct sample_ops { + struct ftrace_ops ops; + unsigned int count; +}; + +static void ops_func_nop(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + /* do nothing */ +} + +static void ops_func_count(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct sample_ops *self; + + self = container_of(op, struct sample_ops, ops); + self->count++; +} + +static struct sample_ops *ops_relevant; +static struct sample_ops *ops_irrelevant; + +static struct sample_ops *ops_alloc_init(void *tracee, ftrace_func_t func, + unsigned long flags, int nr) +{ + struct sample_ops *ops; + + ops = kcalloc(nr, sizeof(*ops), GFP_KERNEL); + if (WARN_ON_ONCE(!ops)) + return NULL; + + for (unsigned int i = 0; i < nr; i++) { + ops[i].ops.func = func; + ops[i].ops.flags = flags; + WARN_ON_ONCE(ftrace_set_filter_ip(&ops[i].ops, (unsigned long)tracee, 0, 0)); + WARN_ON_ONCE(register_ftrace_function(&ops[i].ops)); + } + + return ops; +} + +static void ops_destroy(struct sample_ops *ops, int nr) +{ + if (!ops) + return; + + for (unsigned int i = 0; i < nr; i++) { + WARN_ON_ONCE(unregister_ftrace_function(&ops[i].ops)); + ftrace_free_filter(&ops[i].ops); + } + + kfree(ops); +} + +static void ops_check(struct sample_ops *ops, int nr, + unsigned int expected_count) +{ + if (!ops || !check_count) + return; + + for (unsigned int i = 0; i < nr; i++) { + if (ops->count == expected_count) + continue; + pr_warn("Counter called %u times (expected %u)\n", + ops->count, expected_count); + } +} + +static ftrace_func_t tracer_relevant = ops_func_nop; +static ftrace_func_t tracer_irrelevant = ops_func_nop; + +static int __init ftrace_ops_sample_init(void) +{ + unsigned long flags = 0; + ktime_t start, end; + u64 period; + + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && save_regs) { + pr_info("this kernel does not support saving registers\n"); + save_regs = false; + } else if (save_regs) { + flags |= FTRACE_OPS_FL_SAVE_REGS; + } + + if (assist_recursion) + flags |= FTRACE_OPS_FL_RECURSION; + + if (assist_rcu) + flags |= FTRACE_OPS_FL_RCU; + + if (check_count) { + tracer_relevant = ops_func_count; + tracer_irrelevant = ops_func_count; + } + + pr_info("registering:\n" + " relevant ops: %u\n" + " tracee: %ps\n" + " tracer: %ps\n" + " irrelevant ops: %u\n" + " tracee: %ps\n" + " tracer: %ps\n" + " saving registers: %s\n" + " assist recursion: %s\n" + " assist RCU: %s\n", + nr_ops_relevant, tracee_relevant, tracer_relevant, + nr_ops_irrelevant, tracee_irrelevant, tracer_irrelevant, + save_regs ? "YES" : "NO", + assist_recursion ? "YES" : "NO", + assist_rcu ? "YES" : "NO"); + + ops_relevant = ops_alloc_init(tracee_relevant, tracer_relevant, + flags, nr_ops_relevant); + ops_irrelevant = ops_alloc_init(tracee_irrelevant, tracer_irrelevant, + flags, nr_ops_irrelevant); + + start = ktime_get(); + for (unsigned int i = 0; i < nr_function_calls; i++) + tracee_relevant(); + end = ktime_get(); + + ops_check(ops_relevant, nr_ops_relevant, nr_function_calls); + ops_check(ops_irrelevant, nr_ops_irrelevant, 0); + + period = ktime_to_ns(ktime_sub(end, start)); + + pr_info("Attempted %u calls to %ps in %lluns (%lluns / call)\n", + nr_function_calls, tracee_relevant, + period, div_u64(period, nr_function_calls)); + + if (persist) + return 0; + + ops_destroy(ops_relevant, nr_ops_relevant); + ops_destroy(ops_irrelevant, nr_ops_irrelevant); + + /* + * The benchmark completed sucessfully, but there's no reason to keep + * the module around. Return an error do the user doesn't have to + * manually unload the module. + */ + return -EINVAL; +} +module_init(ftrace_ops_sample_init); + +static void __exit ftrace_ops_sample_exit(void) +{ + ops_destroy(ops_relevant, nr_ops_relevant); + ops_destroy(ops_irrelevant, nr_ops_irrelevant); +} +module_exit(ftrace_ops_sample_exit); + +MODULE_AUTHOR("Mark Rutland"); +MODULE_DESCRIPTION("Example of using custom ftrace_ops"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c new file mode 100644 index 000000000000..4147616102f9 --- /dev/null +++ b/samples/ftrace/sample-trace-array.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace.h> +#include <linux/trace_events.h> +#include <linux/timer.h> +#include <linux/err.h> +#include <linux/jiffies.h> +#include <linux/workqueue.h> + +/* + * Any file that uses trace points, must include the header. + * But only one file, must include the header by defining + * CREATE_TRACE_POINTS first. This will make the C code that + * creates the handles for the trace points. + */ +#define CREATE_TRACE_POINTS +#include "sample-trace-array.h" + +struct trace_array *tr; +static void mytimer_handler(struct timer_list *unused); +static struct task_struct *simple_tsk; + +static void trace_work_fn(struct work_struct *work) +{ + /* + * Disable tracing for event "sample_event". + */ + trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", + false); +} +static DECLARE_WORK(trace_work, trace_work_fn); + +/* + * mytimer: Timer setup to disable tracing for event "sample_event". This + * timer is only for the purposes of the sample module to demonstrate access of + * Ftrace instances from within kernel. + */ +static DEFINE_TIMER(mytimer, mytimer_handler); + +static void mytimer_handler(struct timer_list *unused) +{ + schedule_work(&trace_work); +} + +static void simple_thread_func(int count) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + + /* + * Printing count value using trace_array_printk() - trace_printk() + * equivalent for the instance buffers. + */ + trace_array_printk(tr, _THIS_IP_, "trace_array_printk: count=%d\n", + count); + /* + * Tracepoint for event "sample_event". This will print the + * current value of count and current jiffies. + */ + trace_sample_event(count, jiffies); +} + +static int simple_thread(void *arg) +{ + int count = 0; + unsigned long delay = msecs_to_jiffies(5000); + + /* + * Enable tracing for "sample_event". + */ + trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", true); + + /* + * Adding timer - mytimer. This timer will disable tracing after + * delay seconds. + * + */ + add_timer(&mytimer); + mod_timer(&mytimer, jiffies+delay); + + while (!kthread_should_stop()) + simple_thread_func(count++); + + timer_delete(&mytimer); + cancel_work_sync(&trace_work); + + /* + * trace_array_put() decrements the reference counter associated with + * the trace array - "tr". We are done using the trace array, hence + * decrement the reference counter so that it can be destroyed using + * trace_array_destroy(). + */ + trace_array_put(tr); + + return 0; +} + +static int __init sample_trace_array_init(void) +{ + /* + * Return a pointer to the trace array with name "sample-instance" if it + * exists, else create a new trace array. + * + * NOTE: This function increments the reference counter + * associated with the trace array - "tr". + */ + tr = trace_array_get_by_name("sample-instance", "sched,timer,kprobes"); + + if (!tr) + return -1; + /* + * If context specific per-cpu buffers havent already been allocated. + */ + trace_array_init_printk(tr); + + simple_tsk = kthread_run(simple_thread, NULL, "sample-instance"); + if (IS_ERR(simple_tsk)) { + trace_array_put(tr); + trace_array_destroy(tr); + return -1; + } + + return 0; +} + +static void __exit sample_trace_array_exit(void) +{ + kthread_stop(simple_tsk); + + /* + * We are unloading our module and no longer require the trace array. + * Remove/destroy "tr" using trace_array_destroy() + */ + trace_array_destroy(tr); +} + +module_init(sample_trace_array_init); +module_exit(sample_trace_array_exit); + +MODULE_AUTHOR("Divya Indi"); +MODULE_DESCRIPTION("Sample module for kernel access to Ftrace instances"); +MODULE_LICENSE("GPL"); diff --git a/samples/ftrace/sample-trace-array.h b/samples/ftrace/sample-trace-array.h new file mode 100644 index 000000000000..6f8962428158 --- /dev/null +++ b/samples/ftrace/sample-trace-array.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * If TRACE_SYSTEM is defined, that will be the directory created + * in the ftrace directory under /sys/kernel/tracing/events/<system> + * + * The define_trace.h below will also look for a file name of + * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. + * In this case, it would look for sample-trace.h + * + * If the header name will be different than the system name + * (as in this case), then you can override the header name that + * define_trace.h will look up by defining TRACE_INCLUDE_FILE + * + * This file is called sample-trace-array.h but we want the system + * to be called "sample-subsystem". Therefore we must define the name of this + * file: + * + * #define TRACE_INCLUDE_FILE sample-trace-array + * + * As we do in the bottom of this file. + * + * Notice that TRACE_SYSTEM should be defined outside of #if + * protection, just like TRACE_INCLUDE_FILE. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sample-subsystem + +/* + * TRACE_SYSTEM is expected to be a C valid variable (alpha-numeric + * and underscore), although it may start with numbers. If for some + * reason it is not, you need to add the following lines: + */ +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR sample_subsystem + +/* + * But the above is only needed if TRACE_SYSTEM is not alpha-numeric + * and underscored. By default, TRACE_SYSTEM_VAR will be equal to + * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if + * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with + * only alpha-numeric and underscores. + * + * The TRACE_SYSTEM_VAR is only used internally and not visible to + * user space. + */ + +/* + * Notice that this file is not protected like a normal header. + * We also must allow for rereading of this file. The + * + * || defined(TRACE_HEADER_MULTI_READ) + * + * serves this purpose. + */ +#if !defined(_SAMPLE_TRACE_ARRAY_H) || defined(TRACE_HEADER_MULTI_READ) +#define _SAMPLE_TRACE_ARRAY_H + +#include <linux/tracepoint.h> +TRACE_EVENT(sample_event, + + TP_PROTO(int count, unsigned long time), + + TP_ARGS(count, time), + + TP_STRUCT__entry( + __field(int, count) + __field(unsigned long, time) + ), + + TP_fast_assign( + __entry->count = count; + __entry->time = time; + ), + + TP_printk("count value=%d at jiffies=%lu", __entry->count, + __entry->time) + ); +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE sample-trace-array +#include <trace/define_trace.h> diff --git a/samples/hid/.gitignore b/samples/hid/.gitignore new file mode 100644 index 000000000000..3ea0fed3bbad --- /dev/null +++ b/samples/hid/.gitignore @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +hid_mouse +hid_surface_dial +*.out +*.skel.h +/vmlinux.h +/bpftool/ +/libbpf/ diff --git a/samples/hid/Makefile b/samples/hid/Makefile new file mode 100644 index 000000000000..db5a077c77fc --- /dev/null +++ b/samples/hid/Makefile @@ -0,0 +1,250 @@ +# SPDX-License-Identifier: GPL-2.0 + +HID_SAMPLES_PATH ?= $(abspath $(src)) +TOOLS_PATH := $(HID_SAMPLES_PATH)/../../tools + +pound := \# + +# List of programs to build +tprogs-y += hid_mouse +tprogs-y += hid_surface_dial + +# Libbpf dependencies +LIBBPF_SRC = $(TOOLS_PATH)/lib/bpf +LIBBPF_OUTPUT = $(abspath $(HID_SAMPLES_PATH))/libbpf +LIBBPF_DESTDIR = $(LIBBPF_OUTPUT) +LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include +LIBBPF = $(LIBBPF_OUTPUT)/libbpf.a + +EXTRA_BPF_HEADERS := hid_bpf_helpers.h + +hid_mouse-objs := hid_mouse.o +hid_surface_dial-objs := hid_surface_dial.o + +# Tell kbuild to always build the programs +always-y := $(tprogs-y) + +ifeq ($(ARCH), arm) +# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux +# headers when arm instruction set identification is requested. +ARM_ARCH_SELECTOR := $(filter -D__LINUX_ARM_ARCH__%, $(KBUILD_CFLAGS)) +BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR) +TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR) +endif + +ifeq ($(ARCH), mips) +TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__ +ifdef CONFIG_MACH_LOONGSON64 +BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-loongson64 +BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic +endif +endif + +COMMON_CFLAGS += -Wall -O2 +COMMON_CFLAGS += -Wmissing-prototypes +COMMON_CFLAGS += -Wstrict-prototypes + +TPROGS_CFLAGS += $(COMMON_CFLAGS) +TPROGS_CFLAGS += -I$(objtree)/usr/include +TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) +TPROGS_CFLAGS += -I$(srctree)/tools/include + +ifdef SYSROOT +COMMON_CFLAGS += --sysroot=$(SYSROOT) +TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib +endif + +TPROGS_LDLIBS += $(LIBBPF) -lelf -lz + +# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: +# make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang +LLC ?= llc +CLANG ?= clang +OPT ?= opt +LLVM_DIS ?= llvm-dis +LLVM_OBJCOPY ?= llvm-objcopy +LLVM_READELF ?= llvm-readelf +BTF_PAHOLE ?= pahole + +# Detect that we're cross compiling and use the cross compiler +ifdef CROSS_COMPILE +CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif + +# Don't evaluate probes and warnings if we need to run make recursively +ifneq ($(src),) +HDR_PROBE := $(shell printf "$(pound)include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \ + $(CC) $(TPROGS_CFLAGS) $(TPROGS_LDFLAGS) -x c - \ + -o /dev/null 2>/dev/null && echo okay) + +ifeq ($(HDR_PROBE),) +$(warning WARNING: Detected possible issues with include path.) +$(warning WARNING: Please install kernel headers locally (make headers_install).) +endif + +BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) +BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) +BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') +BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \ + $(CLANG) --target=bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \ + $(LLVM_READELF) -S ./llvm_btf_verify.o | grep BTF; \ + /bin/rm -f ./llvm_btf_verify.o) + +BPF_EXTRA_CFLAGS += -fno-stack-protector +ifneq ($(BTF_LLVM_PROBE),) + BPF_EXTRA_CFLAGS += -g +else +ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),) + BPF_EXTRA_CFLAGS += -g + LLC_FLAGS += -mattr=dwarfris + DWARF2BTF = y +endif +endif +endif + +# Trick to allow make to be run from this directory +all: + $(MAKE) -C ../../ M=$(CURDIR) HID_SAMPLES_PATH=$(CURDIR) + +clean: + $(MAKE) -C ../../ M=$(CURDIR) clean + @find $(CURDIR) -type f -name '*~' -delete + @$(RM) -r $(CURDIR)/libbpf $(CURDIR)/bpftool + +$(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) +# Fix up variables inherited from Kbuild that tools/ build system won't like + $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \ + LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(HID_SAMPLES_PATH)/../../ \ + O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ + $@ install_headers + +BPFTOOLDIR := $(TOOLS_PATH)/bpf/bpftool +BPFTOOL_OUTPUT := $(abspath $(HID_SAMPLES_PATH))/bpftool +BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool +$(BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) | $(BPFTOOL_OUTPUT) + $(MAKE) -C $(BPFTOOLDIR) srctree=$(HID_SAMPLES_PATH)/../../ \ + OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap + +$(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +FORCE: + + +# Verify LLVM compiler tools are available and bpf target is supported by llc +.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC) + +verify_cmds: $(CLANG) $(LLC) + @for TOOL in $^ ; do \ + if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \ + echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\ + exit 1; \ + else true; fi; \ + done + +verify_target_bpf: verify_cmds + @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \ + echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\ + echo " NOTICE: LLVM version >= 3.7.1 required" ;\ + exit 2; \ + else true; fi + +$(HID_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) +$(src)/*.c: verify_target_bpf $(LIBBPF) + +libbpf_hdrs: $(LIBBPF) + +.PHONY: libbpf_hdrs + +$(obj)/hid_mouse.o: $(obj)/hid_mouse.skel.h +$(obj)/hid_surface_dial.o: $(obj)/hid_surface_dial.skel.h + +-include $(HID_SAMPLES_PATH)/Makefile.target + +VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ + $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ + $(abspath $(objtree)/vmlinux) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +$(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) +ifeq ($(VMLINUX_H),) +ifeq ($(VMLINUX_BTF),) + $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)",\ + build the kernel or set VMLINUX_BTF or VMLINUX_H variable) +endif + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif + +clean-files += vmlinux.h + +# Get Clang's default includes on this system, as opposed to those seen by +# '--target=bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +define get_sys_includes +$(shell $(1) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/') +endef + +CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) + +EXTRA_BPF_HEADERS_SRC := $(addprefix $(src)/,$(EXTRA_BPF_HEADERS)) + +$(obj)/%.bpf.o: $(src)/%.bpf.c $(EXTRA_BPF_HEADERS_SRC) $(obj)/vmlinux.h + @echo " CLANG-BPF " $@ + $(Q)$(CLANG) -g -O2 --target=bpf -D__TARGET_ARCH_$(SRCARCH) \ + -Wno-compare-distinct-pointer-types -I$(srctree)/include \ + -I$(srctree)/samples/bpf -I$(srctree)/tools/include \ + -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ + -c $(filter %.bpf.c,$^) -o $@ + +LINKED_SKELS := hid_mouse.skel.h hid_surface_dial.skel.h +clean-files += $(LINKED_SKELS) + +hid_mouse.skel.h-deps := hid_mouse.bpf.o +hid_surface_dial.skel.h-deps := hid_surface_dial.bpf.o + +LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) + +BPF_SRCS_LINKED := $(notdir $(wildcard $(src)/*.bpf.c)) +BPF_OBJS_LINKED := $(patsubst %.bpf.c,$(obj)/%.bpf.o, $(BPF_SRCS_LINKED)) +BPF_SKELS_LINKED := $(addprefix $(obj)/,$(LINKED_SKELS)) + +$(BPF_SKELS_LINKED): $(BPF_OBJS_LINKED) $(BPFTOOL) + @echo " BPF GEN-OBJ " $(@:.skel.h=) + $(Q)$(BPFTOOL) gen object $(@:.skel.h=.lbpf.o) $(addprefix $(obj)/,$($(@F)-deps)) + @echo " BPF GEN-SKEL" $(@:.skel.h=) + $(Q)$(BPFTOOL) gen skeleton $(@:.skel.h=.lbpf.o) name $(notdir $(@:.skel.h=)) > $@ + +# asm/sysreg.h - inline assembly used by it is incompatible with llvm. +# But, there is no easy way to fix it, so just exclude it since it is +# useless for BPF samples. +# below we use long chain of commands, clang | opt | llvm-dis | llc, +# to generate final object file. 'clang' compiles the source into IR +# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin +# processing (llvm12) and IR optimizations. 'llvm-dis' converts +# 'opt' output to IR, and finally 'llc' generates bpf byte code. +$(obj)/%.o: $(src)/%.c + @echo " CLANG-bpf " $@ + $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ + -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \ + -I$(LIBBPF_INCLUDE) \ + -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ + -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \ + -Wno-gnu-variable-sized-type-not-at-end \ + -Wno-address-of-packed-member -Wno-tautological-compare \ + -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ + -fno-asynchronous-unwind-tables \ + -I$(srctree)/samples/hid/ \ + -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ + $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ +ifeq ($(DWARF2BTF),y) + $(BTF_PAHOLE) -J $@ +endif diff --git a/samples/hid/Makefile.target b/samples/hid/Makefile.target new file mode 100644 index 000000000000..7621f55e2947 --- /dev/null +++ b/samples/hid/Makefile.target @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0 +# ========================================================================== +# Building binaries on the host system +# Binaries are not used during the compilation of the kernel, and intended +# to be build for target board, target board can be host of course. Added to +# build binaries to run not on host system. +# +# Sample syntax +# tprogs-y := xsk_example +# Will compile xsk_example.c and create an executable named xsk_example +# +# tprogs-y := xdpsock +# xdpsock-objs := xdpsock_1.o xdpsock_2.o +# Will compile xdpsock_1.c and xdpsock_2.c, and then link the executable +# xdpsock, based on xdpsock_1.o and xdpsock_2.o +# +# Derived from scripts/Makefile.host +# +__tprogs := $(sort $(tprogs-y)) + +# C code +# Executables compiled from a single .c file +tprog-csingle := $(foreach m,$(__tprogs), \ + $(if $($(m)-objs),,$(m))) + +# C executables linked based on several .o files +tprog-cmulti := $(foreach m,$(__tprogs),\ + $(if $($(m)-objs),$(m))) + +# Object (.o) files compiled from .c files +tprog-cobjs := $(sort $(foreach m,$(__tprogs),$($(m)-objs))) + +tprog-csingle := $(addprefix $(obj)/,$(tprog-csingle)) +tprog-cmulti := $(addprefix $(obj)/,$(tprog-cmulti)) +tprog-cobjs := $(addprefix $(obj)/,$(tprog-cobjs)) + +##### +# Handle options to gcc. Support building with separate output directory + +_tprogc_flags = $(TPROGS_CFLAGS) \ + $(TPROGCFLAGS_$(basetarget).o) + +# $(objtree)/$(obj) for including generated headers from checkin source files +ifeq ($(KBUILD_EXTMOD),) +ifdef building_out_of_srctree +_tprogc_flags += -I $(objtree)/$(obj) +endif +endif + +tprogc_flags = -Wp,-MD,$(depfile) $(_tprogc_flags) + +# Create executable from a single .c file +# tprog-csingle -> Executable +quiet_cmd_tprog-csingle = CC $@ + cmd_tprog-csingle = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ $< \ + $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F)) +$(tprog-csingle): $(obj)/%: $(src)/%.c FORCE + $(call if_changed_dep,tprog-csingle) + +# Link an executable based on list of .o files, all plain c +# tprog-cmulti -> executable +quiet_cmd_tprog-cmulti = LD $@ + cmd_tprog-cmulti = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ \ + $(addprefix $(obj)/,$($(@F)-objs)) \ + $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F)) +$(tprog-cmulti): $(tprog-cobjs) FORCE + $(call if_changed,tprog-cmulti) +$(call multi_depend, $(tprog-cmulti), , -objs) + +# Create .o file from a single .c file +# tprog-cobjs -> .o +quiet_cmd_tprog-cobjs = CC $@ + cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $< +$(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE + $(call if_changed_dep,tprog-cobjs) diff --git a/samples/hid/hid_bpf_helpers.h b/samples/hid/hid_bpf_helpers.h new file mode 100644 index 000000000000..4fff31dbe0e7 --- /dev/null +++ b/samples/hid/hid_bpf_helpers.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Benjamin Tissoires + */ + +#ifndef __HID_BPF_HELPERS_H +#define __HID_BPF_HELPERS_H + +/* following are kfuncs exported by HID for HID-BPF */ +extern __u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx, + unsigned int offset, + const size_t __sz) __ksym; +extern int hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, u32 flags) __ksym; +extern struct hid_bpf_ctx *hid_bpf_allocate_context(unsigned int hid_id) __ksym; +extern void hid_bpf_release_context(struct hid_bpf_ctx *ctx) __ksym; +extern int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, + __u8 *data, + size_t buf__sz, + enum hid_report_type type, + enum hid_class_request reqtype) __ksym; + +#endif /* __HID_BPF_HELPERS_H */ diff --git a/samples/hid/hid_mouse.bpf.c b/samples/hid/hid_mouse.bpf.c new file mode 100644 index 000000000000..f7f722dcf56d --- /dev/null +++ b/samples/hid/hid_mouse.bpf.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "hid_bpf_helpers.h" + +static int hid_y_event(struct hid_bpf_ctx *hctx) +{ + s16 y; + __u8 *data = hid_bpf_get_data(hctx, 0 /* offset */, 9 /* size */); + + if (!data) + return 0; /* EPERM check */ + + bpf_printk("event: size: %d", hctx->size); + bpf_printk("incoming event: %02x %02x %02x", + data[0], + data[1], + data[2]); + bpf_printk(" %02x %02x %02x", + data[3], + data[4], + data[5]); + bpf_printk(" %02x %02x %02x", + data[6], + data[7], + data[8]); + + y = data[3] | (data[4] << 8); + + y = -y; + + data[3] = y & 0xFF; + data[4] = (y >> 8) & 0xFF; + + bpf_printk("modified event: %02x %02x %02x", + data[0], + data[1], + data[2]); + bpf_printk(" %02x %02x %02x", + data[3], + data[4], + data[5]); + bpf_printk(" %02x %02x %02x", + data[6], + data[7], + data[8]); + + return 0; +} + +static int hid_x_event(struct hid_bpf_ctx *hctx) +{ + s16 x; + __u8 *data = hid_bpf_get_data(hctx, 0 /* offset */, 9 /* size */); + + if (!data) + return 0; /* EPERM check */ + + x = data[1] | (data[2] << 8); + + x = -x; + + data[1] = x & 0xFF; + data[2] = (x >> 8) & 0xFF; + return 0; +} + +SEC("struct_ops/hid_device_event") +int BPF_PROG(hid_event, struct hid_bpf_ctx *hctx, enum hid_report_type type) +{ + int ret = hid_y_event(hctx); + + if (ret) + return ret; + + return hid_x_event(hctx); +} + + +SEC("struct_ops/hid_rdesc_fixup") +int BPF_PROG(hid_rdesc_fixup, struct hid_bpf_ctx *hctx) +{ + __u8 *data = hid_bpf_get_data(hctx, 0 /* offset */, 4096 /* size */); + + if (!data) + return 0; /* EPERM check */ + + bpf_printk("rdesc: %02x %02x %02x", + data[0], + data[1], + data[2]); + bpf_printk(" %02x %02x %02x", + data[3], + data[4], + data[5]); + bpf_printk(" %02x %02x %02x ...", + data[6], + data[7], + data[8]); + + /* + * The original report descriptor contains: + * + * 0x05, 0x01, // Usage Page (Generic Desktop) 30 + * 0x16, 0x01, 0x80, // Logical Minimum (-32767) 32 + * 0x26, 0xff, 0x7f, // Logical Maximum (32767) 35 + * 0x09, 0x30, // Usage (X) 38 + * 0x09, 0x31, // Usage (Y) 40 + * + * So byte 39 contains Usage X and byte 41 Usage Y. + * + * We simply swap the axes here. + */ + data[39] = 0x31; + data[41] = 0x30; + + return 0; +} + +SEC(".struct_ops.link") +struct hid_bpf_ops mouse_invert = { + .hid_rdesc_fixup = (void *)hid_rdesc_fixup, + .hid_device_event = (void *)hid_event, +}; + +char _license[] SEC("license") = "GPL"; diff --git a/samples/hid/hid_mouse.c b/samples/hid/hid_mouse.c new file mode 100644 index 000000000000..4b80d4e4c154 --- /dev/null +++ b/samples/hid/hid_mouse.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Benjamin Tissoires + * + * This is a pure HID-BPF example, and should be considered as such: + * on the Etekcity Scroll 6E, the X and Y axes will be swapped and + * inverted. On any other device... Not sure what this will do. + * + * This C main file is generic though. To adapt the code and test, users + * must amend only the .bpf.c file, which this program will load any + * eBPF program it finds. + */ + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/resource.h> +#include <unistd.h> + +#include <linux/bpf.h> +#include <linux/errno.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "hid_mouse.skel.h" + +static bool running = true; + +static void int_exit(int sig) +{ + running = false; + exit(0); +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "%s: %s /sys/bus/hid/devices/0BUS:0VID:0PID:00ID\n\n", + __func__, prog); + fprintf(stderr, + "This program will upload and attach a HID-BPF program to the given device.\n" + "On the Etekcity Scroll 6E, the X and Y axis will be inverted, but on any other\n" + "device, chances are high that the device will not be working anymore\n\n" + "consider this as a demo and adapt the eBPF program to your needs\n" + "Hit Ctrl-C to unbind the program and reset the device\n"); +} + +static int get_hid_id(const char *path) +{ + const char *str_id, *dir; + char uevent[1024]; + int fd; + + memset(uevent, 0, sizeof(uevent)); + snprintf(uevent, sizeof(uevent) - 1, "%s/uevent", path); + + fd = open(uevent, O_RDONLY | O_NONBLOCK); + if (fd < 0) + return -ENOENT; + + close(fd); + + dir = basename((char *)path); + + str_id = dir + sizeof("0003:0001:0A37."); + return (int)strtol(str_id, NULL, 16); +} + +int main(int argc, char **argv) +{ + struct hid_mouse *skel; + struct bpf_link *link; + int err; + const char *optstr = ""; + const char *sysfs_path; + int opt, hid_id; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + default: + usage(basename(argv[0])); + return 1; + } + } + + if (optind == argc) { + usage(basename(argv[0])); + return 1; + } + + sysfs_path = argv[optind]; + if (!sysfs_path) { + perror("sysfs"); + return 1; + } + + skel = hid_mouse__open(); + if (!skel) { + fprintf(stderr, "%s %s:%d", __func__, __FILE__, __LINE__); + return -1; + } + + hid_id = get_hid_id(sysfs_path); + + if (hid_id < 0) { + fprintf(stderr, "can not open HID device: %m\n"); + return 1; + } + skel->struct_ops.mouse_invert->hid_id = hid_id; + + err = hid_mouse__load(skel); + if (err < 0) { + fprintf(stderr, "can not load HID-BPF program: %m\n"); + return 1; + } + + link = bpf_map__attach_struct_ops(skel->maps.mouse_invert); + if (!link) { + fprintf(stderr, "can not attach HID-BPF program: %m\n"); + return 1; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + while (running) + sleep(1); + + hid_mouse__destroy(skel); + + return 0; +} diff --git a/samples/hid/hid_surface_dial.bpf.c b/samples/hid/hid_surface_dial.bpf.c new file mode 100644 index 000000000000..527d584812ab --- /dev/null +++ b/samples/hid/hid_surface_dial.bpf.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Benjamin Tissoires + */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "hid_bpf_helpers.h" + +#define HID_UP_BUTTON 0x0009 +#define HID_GD_WHEEL 0x0038 + +SEC("struct_ops/hid_device_event") +int BPF_PROG(hid_event, struct hid_bpf_ctx *hctx) +{ + __u8 *data = hid_bpf_get_data(hctx, 0 /* offset */, 9 /* size */); + + if (!data) + return 0; /* EPERM check */ + + /* Touch */ + data[1] &= 0xfd; + + /* X */ + data[4] = 0; + data[5] = 0; + + /* Y */ + data[6] = 0; + data[7] = 0; + + return 0; +} + +/* 72 == 360 / 5 -> 1 report every 5 degrees */ +int resolution = 72; +int physical = 5; + +struct haptic_syscall_args { + unsigned int hid; + int retval; +}; + +static __u8 haptic_data[8]; + +SEC("syscall") +int set_haptic(struct haptic_syscall_args *args) +{ + struct hid_bpf_ctx *ctx; + const size_t size = sizeof(haptic_data); + u16 *res; + int ret; + + if (size > sizeof(haptic_data)) + return -7; /* -E2BIG */ + + ctx = hid_bpf_allocate_context(args->hid); + if (!ctx) + return -1; /* EPERM check */ + + haptic_data[0] = 1; /* report ID */ + + ret = hid_bpf_hw_request(ctx, haptic_data, size, HID_FEATURE_REPORT, HID_REQ_GET_REPORT); + + bpf_printk("probed/remove event ret value: %d", ret); + bpf_printk("buf: %02x %02x %02x", + haptic_data[0], + haptic_data[1], + haptic_data[2]); + bpf_printk(" %02x %02x %02x", + haptic_data[3], + haptic_data[4], + haptic_data[5]); + bpf_printk(" %02x %02x", + haptic_data[6], + haptic_data[7]); + + /* whenever resolution multiplier is not 3600, we have the fixed report descriptor */ + res = (u16 *)&haptic_data[1]; + if (*res != 3600) { +// haptic_data[1] = 72; /* resolution multiplier */ +// haptic_data[2] = 0; /* resolution multiplier */ +// haptic_data[3] = 0; /* Repeat Count */ + haptic_data[4] = 3; /* haptic Auto Trigger */ +// haptic_data[5] = 5; /* Waveform Cutoff Time */ +// haptic_data[6] = 80; /* Retrigger Period */ +// haptic_data[7] = 0; /* Retrigger Period */ + } else { + haptic_data[4] = 0; + } + + ret = hid_bpf_hw_request(ctx, haptic_data, size, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); + + bpf_printk("set haptic ret value: %d -> %d", ret, haptic_data[4]); + + args->retval = ret; + + hid_bpf_release_context(ctx); + + return 0; +} + +/* Convert REL_DIAL into REL_WHEEL */ +SEC("struct_ops/hid_rdesc_fixup") +int BPF_PROG(hid_rdesc_fixup, struct hid_bpf_ctx *hctx) +{ + __u8 *data = hid_bpf_get_data(hctx, 0 /* offset */, 4096 /* size */); + __u16 *res, *phys; + + if (!data) + return 0; /* EPERM check */ + + /* Convert TOUCH into a button */ + data[31] = HID_UP_BUTTON; + data[33] = 2; + + /* Convert REL_DIAL into REL_WHEEL */ + data[45] = HID_GD_WHEEL; + + /* Change Resolution Multiplier */ + phys = (__u16 *)&data[61]; + *phys = physical; + res = (__u16 *)&data[66]; + *res = resolution; + + /* Convert X,Y from Abs to Rel */ + data[88] = 0x06; + data[98] = 0x06; + + return 0; +} + +SEC(".struct_ops.link") +struct hid_bpf_ops surface_dial = { + .hid_rdesc_fixup = (void *)hid_rdesc_fixup, + .hid_device_event = (void *)hid_event, +}; + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = 1; diff --git a/samples/hid/hid_surface_dial.c b/samples/hid/hid_surface_dial.c new file mode 100644 index 000000000000..9dd363845a85 --- /dev/null +++ b/samples/hid/hid_surface_dial.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Benjamin Tissoires + * + * This program will morph the Microsoft Surface Dial into a mouse, + * and depending on the chosen resolution enable or not the haptic feedback: + * - a resolution (-r) of 3600 will report 3600 "ticks" in one full rotation + * without haptic feedback + * - any other resolution will report N "ticks" in a full rotation with haptic + * feedback + * + * A good default for low resolution haptic scrolling is 72 (1 "tick" every 5 + * degrees), and set to 3600 for smooth scrolling. + */ + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/resource.h> +#include <unistd.h> + +#include <linux/bpf.h> +#include <linux/errno.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "hid_surface_dial.skel.h" + +static bool running = true; + +struct haptic_syscall_args { + unsigned int hid; + int retval; +}; + +static void int_exit(int sig) +{ + running = false; + exit(0); +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "%s: %s [OPTIONS] /sys/bus/hid/devices/0BUS:0VID:0PID:00ID\n\n" + " OPTIONS:\n" + " -r N\t set the given resolution to the device (number of ticks per 360°)\n\n", + __func__, prog); + fprintf(stderr, + "This program will morph the Microsoft Surface Dial into a mouse,\n" + "and depending on the chosen resolution enable or not the haptic feedback:\n" + "- a resolution (-r) of 3600 will report 3600 'ticks' in one full rotation\n" + " without haptic feedback\n" + "- any other resolution will report N 'ticks' in a full rotation with haptic\n" + " feedback\n" + "\n" + "A good default for low resolution haptic scrolling is 72 (1 'tick' every 5\n" + "degrees), and set to 3600 for smooth scrolling.\n"); +} + +static int get_hid_id(const char *path) +{ + const char *str_id, *dir; + char uevent[1024]; + int fd; + + memset(uevent, 0, sizeof(uevent)); + snprintf(uevent, sizeof(uevent) - 1, "%s/uevent", path); + + fd = open(uevent, O_RDONLY | O_NONBLOCK); + if (fd < 0) + return -ENOENT; + + close(fd); + + dir = basename((char *)path); + + str_id = dir + sizeof("0003:0001:0A37."); + return (int)strtol(str_id, NULL, 16); +} + +static int set_haptic(struct hid_surface_dial *skel, int hid_id) +{ + struct haptic_syscall_args args = { + .hid = hid_id, + .retval = -1, + }; + int haptic_fd, err; + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattr, + .ctx_in = &args, + .ctx_size_in = sizeof(args), + ); + + haptic_fd = bpf_program__fd(skel->progs.set_haptic); + if (haptic_fd < 0) { + fprintf(stderr, "can't locate haptic prog: %m\n"); + return 1; + } + + err = bpf_prog_test_run_opts(haptic_fd, &tattr); + if (err) { + fprintf(stderr, "can't set haptic configuration to hid device %d: %m (err: %d)\n", + hid_id, err); + return 1; + } + return 0; +} + +int main(int argc, char **argv) +{ + struct hid_surface_dial *skel; + const char *optstr = "r:"; + struct bpf_link *link; + const char *sysfs_path; + int err, opt, hid_id, resolution = 72; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 'r': + { + char *endp = NULL; + long l = -1; + + if (optarg) { + l = strtol(optarg, &endp, 10); + if (endp && *endp) + l = -1; + } + + if (l < 0) { + fprintf(stderr, + "invalid r option %s - expecting a number\n", + optarg ? optarg : ""); + exit(EXIT_FAILURE); + }; + + resolution = (int) l; + break; + } + default: + usage(basename(argv[0])); + return 1; + } + } + + if (optind == argc) { + usage(basename(argv[0])); + return 1; + } + + sysfs_path = argv[optind]; + if (!sysfs_path) { + perror("sysfs"); + return 1; + } + + skel = hid_surface_dial__open(); + if (!skel) { + fprintf(stderr, "%s %s:%d", __func__, __FILE__, __LINE__); + return -1; + } + + hid_id = get_hid_id(sysfs_path); + if (hid_id < 0) { + fprintf(stderr, "can not open HID device: %m\n"); + return 1; + } + + skel->struct_ops.surface_dial->hid_id = hid_id; + + err = hid_surface_dial__load(skel); + if (err < 0) { + fprintf(stderr, "can not load HID-BPF program: %m\n"); + return 1; + } + + skel->data->resolution = resolution; + skel->data->physical = (int)(resolution / 72); + + link = bpf_map__attach_struct_ops(skel->maps.surface_dial); + if (!link) { + fprintf(stderr, "can not attach HID-BPF program: %m\n"); + return 1; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + set_haptic(skel, hid_id); + + while (running) + sleep(1); + + hid_surface_dial__destroy(skel); + + return 0; +} diff --git a/samples/hidraw/.gitignore b/samples/hidraw/.gitignore index 05e51a685242..5233ab63262e 100644 --- a/samples/hidraw/.gitignore +++ b/samples/hidraw/.gitignore @@ -1 +1,2 @@ -hid-example +# SPDX-License-Identifier: GPL-2.0-only +/hid-example diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile index a9ab96188fbe..594d989e5486 100644 --- a/samples/hidraw/Makefile +++ b/samples/hidraw/Makefile @@ -1,12 +1,4 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += hid-example -# List of programs to build -hostprogs-y := hid-example - -# Tell kbuild to always build the programs -always := $(hostprogs-y) - -HOSTCFLAGS_hid-example.o += -I$(objtree)/usr/include - -all: hid-example +userccflags += -I usr/include diff --git a/samples/hidraw/hid-example.c b/samples/hidraw/hid-example.c index 92e6c1511910..0f73ace3c6c3 100644 --- a/samples/hidraw/hid-example.c +++ b/samples/hidraw/hid-example.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Hidraw Userspace Example * @@ -118,7 +119,7 @@ int main(int argc, char **argv) if (res < 0) perror("HIDIOCSFEATURE"); else - printf("ioctl HIDIOCGFEATURE returned: %d\n", res); + printf("ioctl HIDIOCSFEATURE returned: %d\n", res); /* Get Feature */ buf[0] = 0x9; /* Report Number */ @@ -127,7 +128,7 @@ int main(int argc, char **argv) perror("HIDIOCGFEATURE"); } else { printf("ioctl HIDIOCGFEATURE returned: %d\n", res); - printf("Report data (not containing the report number):\n\t"); + printf("Report data:\n\t"); for (i = 0; i < res; i++) printf("%hhx ", buf[i]); puts("\n"); diff --git a/samples/hung_task/Makefile b/samples/hung_task/Makefile new file mode 100644 index 000000000000..86036f1a204d --- /dev/null +++ b/samples/hung_task/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_HUNG_TASK) += hung_task_tests.o diff --git a/samples/hung_task/hung_task_tests.c b/samples/hung_task/hung_task_tests.c new file mode 100644 index 000000000000..0360ec916890 --- /dev/null +++ b/samples/hung_task/hung_task_tests.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * hung_task_tests.c - Sample code for testing hung tasks with mutex, + * semaphore, etc. + * + * Usage: Load this module and read `<debugfs>/hung_task/mutex`, + * `<debugfs>/hung_task/semaphore`, `<debugfs>/hung_task/rw_semaphore_read`, + * `<debugfs>/hung_task/rw_semaphore_write`, etc., with 2 or more processes. + * + * This is for testing kernel hung_task error messages with various locking + * mechanisms (e.g., mutex, semaphore, rw_semaphore_read, rw_semaphore_write, etc.). + * Note that this may freeze your system or cause a panic. Use only for testing purposes. + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/semaphore.h> +#include <linux/rwsem.h> + +#define HUNG_TASK_DIR "hung_task" +#define HUNG_TASK_MUTEX_FILE "mutex" +#define HUNG_TASK_SEM_FILE "semaphore" +#define HUNG_TASK_RWSEM_READ_FILE "rw_semaphore_read" +#define HUNG_TASK_RWSEM_WRITE_FILE "rw_semaphore_write" +#define SLEEP_SECOND 256 + +static const char dummy_string[] = "This is a dummy string."; +static DEFINE_MUTEX(dummy_mutex); +static DEFINE_SEMAPHORE(dummy_sem, 1); +static DECLARE_RWSEM(dummy_rwsem); +static struct dentry *hung_task_dir; + +/* Mutex-based read function */ +static ssize_t read_dummy_mutex(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Check if data is already read */ + if (*ppos >= sizeof(dummy_string)) + return 0; + + /* Second task waits on mutex, entering uninterruptible sleep */ + guard(mutex)(&dummy_mutex); + + /* First task sleeps here, interruptible */ + msleep_interruptible(SLEEP_SECOND * 1000); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* Semaphore-based read function */ +static ssize_t read_dummy_semaphore(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Check if data is already read */ + if (*ppos >= sizeof(dummy_string)) + return 0; + + /* Second task waits on semaphore, entering uninterruptible sleep */ + down(&dummy_sem); + + /* First task sleeps here, interruptible */ + msleep_interruptible(SLEEP_SECOND * 1000); + + up(&dummy_sem); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* Read-write semaphore read function */ +static ssize_t read_dummy_rwsem_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Check if data is already read */ + if (*ppos >= sizeof(dummy_string)) + return 0; + + /* Acquires read lock, allowing concurrent readers but blocks if write lock is held */ + down_read(&dummy_rwsem); + + /* Sleeps here, potentially triggering hung task detection if lock is held too long */ + msleep_interruptible(SLEEP_SECOND * 1000); + + up_read(&dummy_rwsem); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* Read-write semaphore write function */ +static ssize_t read_dummy_rwsem_write(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Check if data is already read */ + if (*ppos >= sizeof(dummy_string)) + return 0; + + /* Acquires exclusive write lock, blocking all other readers and writers */ + down_write(&dummy_rwsem); + + /* Sleeps here, potentially triggering hung task detection if lock is held too long */ + msleep_interruptible(SLEEP_SECOND * 1000); + + up_write(&dummy_rwsem); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* File operations for mutex */ +static const struct file_operations hung_task_mutex_fops = { + .read = read_dummy_mutex, +}; + +/* File operations for semaphore */ +static const struct file_operations hung_task_sem_fops = { + .read = read_dummy_semaphore, +}; + +/* File operations for rw_semaphore read */ +static const struct file_operations hung_task_rwsem_read_fops = { + .read = read_dummy_rwsem_read, +}; + +/* File operations for rw_semaphore write */ +static const struct file_operations hung_task_rwsem_write_fops = { + .read = read_dummy_rwsem_write, +}; + +static int __init hung_task_tests_init(void) +{ + hung_task_dir = debugfs_create_dir(HUNG_TASK_DIR, NULL); + if (IS_ERR(hung_task_dir)) + return PTR_ERR(hung_task_dir); + + /* Create debugfs files for mutex and semaphore tests */ + debugfs_create_file(HUNG_TASK_MUTEX_FILE, 0400, hung_task_dir, NULL, + &hung_task_mutex_fops); + debugfs_create_file(HUNG_TASK_SEM_FILE, 0400, hung_task_dir, NULL, + &hung_task_sem_fops); + debugfs_create_file(HUNG_TASK_RWSEM_READ_FILE, 0400, hung_task_dir, NULL, + &hung_task_rwsem_read_fops); + debugfs_create_file(HUNG_TASK_RWSEM_WRITE_FILE, 0400, hung_task_dir, NULL, + &hung_task_rwsem_write_fops); + + return 0; +} + +static void __exit hung_task_tests_exit(void) +{ + debugfs_remove_recursive(hung_task_dir); +} + +module_init(hung_task_tests_init); +module_exit(hung_task_tests_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Masami Hiramatsu <mhiramat@kernel.org>"); +MODULE_AUTHOR("Zi Li <amaindex@outlook.com>"); +MODULE_DESCRIPTION("Simple sleep under lock files for testing hung task"); diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile index 0f5c31c2fc47..ef4b6fdd7d9c 100644 --- a/samples/hw_breakpoint/Makefile +++ b/samples/hw_breakpoint/Makefile @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index ef7f32291852..fbb03b66dcbd 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * usage: insmod data_breakpoint.ko ksym=<ksym_name> * * This file is a kernel module that places a breakpoint over ksym_name kernel @@ -34,9 +21,9 @@ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> -struct perf_event * __percpu *sample_hbp; +static struct perf_event * __percpu *sample_hbp; -static char ksym_name[KSYM_NAME_LEN] = "pid_max"; +static char ksym_name[KSYM_NAME_LEN] = "jiffies"; module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" " write operations on the kernel symbol"); @@ -54,15 +41,19 @@ static int __init hw_break_module_init(void) { int ret; struct perf_event_attr attr; + void *addr = __symbol_get(ksym_name); + + if (!addr) + return -ENXIO; hw_breakpoint_init(&attr); - attr.bp_addr = kallsyms_lookup_name(ksym_name); + attr.bp_addr = (unsigned long)addr; attr.bp_len = HW_BREAKPOINT_LEN_4; - attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + attr.bp_type = HW_BREAKPOINT_W; sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL); - if (IS_ERR((void __force *)sample_hbp)) { - ret = PTR_ERR((void __force *)sample_hbp); + if (IS_ERR_PCPU(sample_hbp)) { + ret = PTR_ERR_PCPU(sample_hbp); goto fail; } @@ -79,6 +70,9 @@ fail: static void __exit hw_break_module_exit(void) { unregister_wide_hw_breakpoint(sample_hbp); +#ifdef CONFIG_MODULE_UNLOAD + __symbol_put(ksym_name); +#endif printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name); } diff --git a/samples/kdb/Makefile b/samples/kdb/Makefile index fbedf39d9356..947cb852279c 100644 --- a/samples/kdb/Makefile +++ b/samples/kdb/Makefile @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SAMPLE_KDB) += kdb_hello.o diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c index c1c2fa0f62c2..82736e5a5e32 100644 --- a/samples/kdb/kdb_hello.c +++ b/samples/kdb/kdb_hello.c @@ -28,28 +28,26 @@ static int kdb_hello_cmd(int argc, const char **argv) return 0; } +static kdbtab_t hello_cmd = { + .name = "hello", + .func = kdb_hello_cmd, + .usage = "[string]", + .help = "Say Hello World or Hello [string]", +}; static int __init kdb_hello_cmd_init(void) { /* * Registration of a dynamically added kdb command is done with - * kdb_register() with the arguments being: - * 1: The name of the shell command - * 2: The function that processes the command - * 3: Description of the usage of any arguments - * 4: Descriptive text when you run help - * 5: Number of characters to complete the command - * 0 == type the whole command - * 1 == match both "g" and "go" for example + * kdb_register(). */ - kdb_register("hello", kdb_hello_cmd, "[string]", - "Say Hello World or Hello [string]", 0); + kdb_register(&hello_cmd); return 0; } static void __exit kdb_hello_cmd_exit(void) { - kdb_unregister("hello"); + kdb_unregister(&hello_cmd); } module_init(kdb_hello_cmd_init); diff --git a/samples/kfifo/Makefile b/samples/kfifo/Makefile index bcc9484a15b2..0af5250ad944 100644 --- a/samples/kfifo/Makefile +++ b/samples/kfifo/Makefile @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SAMPLE_KFIFO) += bytestream-example.o dma-example.o inttype-example.o record-example.o diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c index 2fca916d9edf..4ae29a12cc8a 100644 --- a/samples/kfifo/bytestream-example.c +++ b/samples/kfifo/bytestream-example.c @@ -1,10 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Sample kfifo byte stream implementation * * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> - * - * Released under the GPL version 2 only. - * */ #include <linux/init.h> @@ -24,10 +22,10 @@ #define PROC_FIFO "bytestream-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. @@ -118,14 +116,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -134,21 +134,22 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } -static const struct file_operations fifo_fops = { - .owner = THIS_MODULE, - .read = fifo_read, - .write = fifo_write, - .llseek = noop_llseek, +static const struct proc_ops fifo_proc_ops = { + .proc_read = fifo_read, + .proc_write = fifo_write, + .proc_lseek = noop_llseek, }; static int __init example_init(void) @@ -171,7 +172,7 @@ static int __init example_init(void) return -EIO; } - if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) { + if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { #ifdef DYNAMIC kfifo_free(&test); #endif @@ -190,5 +191,6 @@ static void __exit example_exit(void) module_init(example_init); module_exit(example_exit); +MODULE_DESCRIPTION("Sample kfifo byte stream implementation"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c index be0d4a5fdf53..8076ac410161 100644 --- a/samples/kfifo/dma-example.c +++ b/samples/kfifo/dma-example.c @@ -1,15 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Sample fifo dma implementation * * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> - * - * Released under the GPL version 2 only. - * */ #include <linux/init.h> -#include <linux/module.h> #include <linux/kfifo.h> +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <linux/dma-mapping.h> /* * This module shows how to handle fifo dma operations. @@ -139,5 +139,6 @@ static void __exit example_exit(void) module_init(example_init); module_exit(example_exit); +MODULE_DESCRIPTION("Sample fifo dma implementation"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c index 8dc3c2e7105a..e4f93317c5d0 100644 --- a/samples/kfifo/inttype-example.c +++ b/samples/kfifo/inttype-example.c @@ -1,10 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Sample kfifo int type implementation * * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> - * - * Released under the GPL version 2 only. - * */ #include <linux/init.h> @@ -24,10 +22,10 @@ #define PROC_FIFO "int-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. @@ -111,14 +109,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -127,21 +127,22 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } -static const struct file_operations fifo_fops = { - .owner = THIS_MODULE, - .read = fifo_read, - .write = fifo_write, - .llseek = noop_llseek, +static const struct proc_ops fifo_proc_ops = { + .proc_read = fifo_read, + .proc_write = fifo_write, + .proc_lseek = noop_llseek, }; static int __init example_init(void) @@ -162,7 +163,7 @@ static int __init example_init(void) return -EIO; } - if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) { + if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { #ifdef DYNAMIC kfifo_free(&test); #endif @@ -181,5 +182,6 @@ static void __exit example_exit(void) module_init(example_init); module_exit(example_exit); +MODULE_DESCRIPTION("Sample kfifo int type implementation"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c index 2d7529eeb294..e4d1a2d7983c 100644 --- a/samples/kfifo/record-example.c +++ b/samples/kfifo/record-example.c @@ -1,10 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Sample dynamic sized record fifo implementation * * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> - * - * Released under the GPL version 2 only. - * */ #include <linux/init.h> @@ -24,10 +22,10 @@ #define PROC_FIFO "record-fifo" /* lock for procfs read access */ -static DEFINE_MUTEX(read_lock); +static DEFINE_MUTEX(read_access); /* lock for procfs write access */ -static DEFINE_MUTEX(write_lock); +static DEFINE_MUTEX(write_access); /* * define DYNAMIC in this example for a dynamically allocated fifo. @@ -125,14 +123,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&write_lock)) + if (mutex_lock_interruptible(&write_access)) return -ERESTARTSYS; ret = kfifo_from_user(&test, buf, count, &copied); - mutex_unlock(&write_lock); + mutex_unlock(&write_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } static ssize_t fifo_read(struct file *file, char __user *buf, @@ -141,21 +141,22 @@ static ssize_t fifo_read(struct file *file, char __user *buf, int ret; unsigned int copied; - if (mutex_lock_interruptible(&read_lock)) + if (mutex_lock_interruptible(&read_access)) return -ERESTARTSYS; ret = kfifo_to_user(&test, buf, count, &copied); - mutex_unlock(&read_lock); + mutex_unlock(&read_access); + if (ret) + return ret; - return ret ? ret : copied; + return copied; } -static const struct file_operations fifo_fops = { - .owner = THIS_MODULE, - .read = fifo_read, - .write = fifo_write, - .llseek = noop_llseek, +static const struct proc_ops fifo_proc_ops = { + .proc_read = fifo_read, + .proc_write = fifo_write, + .proc_lseek = noop_llseek, }; static int __init example_init(void) @@ -178,7 +179,7 @@ static int __init example_init(void) return -EIO; } - if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) { + if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { #ifdef DYNAMIC kfifo_free(&test); #endif @@ -197,5 +198,6 @@ static void __exit example_exit(void) module_init(example_init); module_exit(example_exit); +MODULE_DESCRIPTION("Sample dynamic sized record fifo implementation"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); diff --git a/samples/kmemleak/Makefile b/samples/kmemleak/Makefile new file mode 100644 index 000000000000..8a999ab43b6d --- /dev/null +++ b/samples/kmemleak/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_SAMPLE_KMEMLEAK) += kmemleak-test.o diff --git a/samples/kmemleak/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c new file mode 100644 index 000000000000..8609812a37eb --- /dev/null +++ b/samples/kmemleak/kmemleak-test.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * samples/kmemleak/kmemleak-test.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas <catalin.marinas@arm.com> + */ + +#define pr_fmt(fmt) "kmemleak: " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include <linux/fdtable.h> + +#include <linux/kmemleak.h> + +struct test_node { + long header[25]; + struct list_head list; + long footer[25]; +}; + +static LIST_HEAD(test_list); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); + +/* + * Some very simple testing. This function needs to be extended for + * proper testing. + */ +static int kmemleak_test_init(void) +{ + struct test_node *elem; + int i; + + pr_info("Kmemleak testing\n"); + + /* make some orphan objects */ + pr_info("kmalloc(32) = 0x%px\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = 0x%px\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = 0x%px\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = 0x%px\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = 0x%px\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = 0x%px\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = 0x%px\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = 0x%px\n", kmalloc(4096, GFP_KERNEL)); +#ifndef CONFIG_MODULES + pr_info("kmem_cache_alloc(files_cachep) = 0x%px\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); + pr_info("kmem_cache_alloc(files_cachep) = 0x%px\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); +#endif + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + + /* + * Add elements to a list. They should only appear as orphan + * after the module is removed. + */ + for (i = 0; i < 10; i++) { + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kzalloc(sizeof(*elem)) = 0x%px\n", elem); + if (!elem) + return -ENOMEM; + INIT_LIST_HEAD(&elem->list); + list_add_tail(&elem->list, &test_list); + } + + for_each_possible_cpu(i) { + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); + pr_info("kmalloc(129) = 0x%px\n", + per_cpu(kmemleak_test_pointer, i)); + } + + pr_info("__alloc_percpu(64, 4) = 0x%px\n", __alloc_percpu(64, 4)); + + return 0; +} +module_init(kmemleak_test_init); + +static void __exit kmemleak_test_exit(void) +{ + struct test_node *elem, *tmp; + + /* + * Remove the list elements without actually freeing the + * memory. + */ + list_for_each_entry_safe(elem, tmp, &test_list, list) + list_del(&elem->list); +} +module_exit(kmemleak_test_exit); + +MODULE_DESCRIPTION("Sample module to leak memory for kmemleak testing"); +MODULE_LICENSE("GPL"); diff --git a/samples/kobject/Makefile b/samples/kobject/Makefile index 4a194203c982..bb5d2199742b 100644 --- a/samples/kobject/Makefile +++ b/samples/kobject/Makefile @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SAMPLE_KOBJECT) += kobject-example.o kset-example.o diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c index 2e0740f06cd7..36d87ca0bee2 100644 --- a/samples/kobject/kobject-example.c +++ b/samples/kobject/kobject-example.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Sample kobject implementation * * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2007 Novell Inc. - * - * Released under the GPL version 2 only. - * */ #include <linux/kobject.h> #include <linux/string.h> @@ -15,7 +13,7 @@ /* * This module shows how to create a simple subdirectory in sysfs called - * /sys/kernel/kobject-example In that directory, 3 files are created: + * /sys/kernel/kobject_example In that directory, 3 files are created: * "foo", "baz", and "bar". If an integer is written to these files, it can be * later read out of it. */ @@ -30,7 +28,7 @@ static int bar; static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", foo); + return sysfs_emit(buf, "%d\n", foo); } static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -62,7 +60,7 @@ static ssize_t b_show(struct kobject *kobj, struct kobj_attribute *attr, var = baz; else var = bar; - return sprintf(buf, "%d\n", var); + return sysfs_emit(buf, "%d\n", var); } static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -104,7 +102,7 @@ static struct attribute *attrs[] = { * created for the attributes with the directory being the name of the * attribute group. */ -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = attrs, }; @@ -142,5 +140,6 @@ static void __exit example_exit(void) module_init(example_init); module_exit(example_exit); +MODULE_DESCRIPTION("Sample kobject implementation"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>"); diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c index a55bff52bde3..d0103904e5dd 100644 --- a/samples/kobject/kset-example.c +++ b/samples/kobject/kset-example.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Sample kset and ktype implementation * * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2007 Novell Inc. - * - * Released under the GPL version 2 only. - * */ #include <linux/kobject.h> #include <linux/string.h> @@ -16,8 +14,8 @@ /* * This module shows how to create a kset in sysfs called - * /sys/kernel/kset-example - * Then tree kobjects are created and assigned to this kset, "foo", "baz", + * /sys/kernel/kset_example + * Then three kobjects are created and assigned to this kset, "foo", "baz", * and "bar". In those kobjects, attributes of the same name are also * created and if an integer is written to these files, it can be later * read out of it. @@ -39,10 +37,11 @@ struct foo_obj { /* a custom attribute that works just for a struct foo_obj. */ struct foo_attribute { struct attribute attr; - ssize_t (*show)(struct foo_obj *foo, struct foo_attribute *attr, char *buf); - ssize_t (*store)(struct foo_obj *foo, struct foo_attribute *attr, const char *buf, size_t count); + ssize_t (*show)(struct foo_obj *foo, const struct foo_attribute *attr, char *buf); + ssize_t (*store)(struct foo_obj *foo, const struct foo_attribute *attr, + const char *buf, size_t count); }; -#define to_foo_attr(x) container_of(x, struct foo_attribute, attr) +#define to_foo_attr(x) container_of_const(x, struct foo_attribute, attr) /* * The default show function that must be passed to sysfs. This will be @@ -55,7 +54,7 @@ static ssize_t foo_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct foo_attribute *attribute; + const struct foo_attribute *attribute; struct foo_obj *foo; attribute = to_foo_attr(attr); @@ -75,7 +74,7 @@ static ssize_t foo_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { - struct foo_attribute *attribute; + const struct foo_attribute *attribute; struct foo_obj *foo; attribute = to_foo_attr(attr); @@ -111,13 +110,13 @@ static void foo_release(struct kobject *kobj) /* * The "foo" file where the .foo variable is read from and written to. */ -static ssize_t foo_show(struct foo_obj *foo_obj, struct foo_attribute *attr, +static ssize_t foo_show(struct foo_obj *foo_obj, const struct foo_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", foo_obj->foo); + return sysfs_emit(buf, "%d\n", foo_obj->foo); } -static ssize_t foo_store(struct foo_obj *foo_obj, struct foo_attribute *attr, +static ssize_t foo_store(struct foo_obj *foo_obj, const struct foo_attribute *attr, const char *buf, size_t count) { int ret; @@ -130,14 +129,14 @@ static ssize_t foo_store(struct foo_obj *foo_obj, struct foo_attribute *attr, } /* Sysfs attributes cannot be world-writable. */ -static struct foo_attribute foo_attribute = +static const struct foo_attribute foo_attribute = __ATTR(foo, 0664, foo_show, foo_store); /* * More complex function where we determine which variable is being accessed by * looking at the attribute for the "baz" and "bar" files. */ -static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr, +static ssize_t b_show(struct foo_obj *foo_obj, const struct foo_attribute *attr, char *buf) { int var; @@ -146,10 +145,10 @@ static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr, var = foo_obj->baz; else var = foo_obj->bar; - return sprintf(buf, "%d\n", var); + return sysfs_emit(buf, "%d\n", var); } -static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr, +static ssize_t b_store(struct foo_obj *foo_obj, const struct foo_attribute *attr, const char *buf, size_t count) { int var, ret; @@ -165,31 +164,47 @@ static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr, return count; } -static struct foo_attribute baz_attribute = +static const struct foo_attribute baz_attribute = __ATTR(baz, 0664, b_show, b_store); -static struct foo_attribute bar_attribute = +static const struct foo_attribute bar_attribute = __ATTR(bar, 0664, b_show, b_store); /* * Create a group of attributes so that we can create and destroy them all * at once. */ -static struct attribute *foo_default_attrs[] = { +static const struct attribute *const foo_default_attrs[] = { &foo_attribute.attr, &baz_attribute.attr, &bar_attribute.attr, NULL, /* need to NULL terminate the list of attributes */ }; +static umode_t foo_default_attrs_is_visible(struct kobject *kobj, + const struct attribute *attr, + int n) +{ + /* Hide attributes with the same name as the kobject. */ + if (strcmp(kobject_name(kobj), attr->name) == 0) + return 0; + return attr->mode; +} + +static const struct attribute_group foo_default_group = { + .attrs_const = foo_default_attrs, + .is_visible_const = foo_default_attrs_is_visible, +}; +__ATTRIBUTE_GROUPS(foo_default); + /* * Our own ktype for our kobjects. Here we specify our sysfs ops, the * release function, and the set of default attributes we want created * whenever a kobject of this type is registered with the kernel. */ -static struct kobj_type foo_ktype = { +static const struct kobj_type foo_ktype = { .sysfs_ops = &foo_sysfs_ops, .release = foo_release, - .default_attrs = foo_default_attrs, + .default_groups = foo_default_groups, }; static struct kset *example_kset; @@ -285,5 +300,6 @@ static void __exit example_exit(void) module_init(example_init); module_exit(example_exit); +MODULE_DESCRIPTION("Sample kset and ktype implementation"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>"); diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile index 68739bc4fc6a..e774592718d6 100644 --- a/samples/kprobes/Makefile +++ b/samples/kprobes/Makefile @@ -1,5 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only # builds the kprobes example kernel modules; # then to use one (as root): insmod <module_name.ko> -obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o jprobe_example.o +obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c deleted file mode 100644 index e3c0a40909f7..000000000000 --- a/samples/kprobes/jprobe_example.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Here's a sample kernel module showing the use of jprobes to dump - * the arguments of _do_fork(). - * - * For more information on theory of operation of jprobes, see - * Documentation/kprobes.txt - * - * Build and insert the kernel module as done in the kprobe example. - * You will see the trace data in /var/log/messages and on the - * console whenever _do_fork() is invoked to create a new process. - * (Some messages may be suppressed if syslogd is configured to - * eliminate duplicate messages.) - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/kprobes.h> - -/* - * Jumper probe for _do_fork. - * Mirror principle enables access to arguments of the probed routine - * from the probe handler. - */ - -/* Proxy routine having the same arguments as actual _do_fork() routine */ -static long j_do_fork(unsigned long clone_flags, unsigned long stack_start, - unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr, unsigned long tls) -{ - pr_info("jprobe: clone_flags = 0x%lx, stack_start = 0x%lx " - "stack_size = 0x%lx\n", clone_flags, stack_start, stack_size); - - /* Always end with a call to jprobe_return(). */ - jprobe_return(); - return 0; -} - -static struct jprobe my_jprobe = { - .entry = j_do_fork, - .kp = { - .symbol_name = "_do_fork", - }, -}; - -static int __init jprobe_init(void) -{ - int ret; - - ret = register_jprobe(&my_jprobe); - if (ret < 0) { - pr_err("register_jprobe failed, returned %d\n", ret); - return -1; - } - pr_info("Planted jprobe at %p, handler addr %p\n", - my_jprobe.kp.addr, my_jprobe.entry); - return 0; -} - -static void __exit jprobe_exit(void) -{ - unregister_jprobe(&my_jprobe); - pr_info("jprobe at %p unregistered\n", my_jprobe.kp.addr); -} - -module_init(jprobe_init) -module_exit(jprobe_exit) -MODULE_LICENSE("GPL"); diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index 88b3e2d227ae..53ec6c8b8c40 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -1,22 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * NOTE: This example is works on x86 and powerpc. * Here's a sample kernel module showing the use of kprobes to dump a - * stack trace and selected registers when _do_fork() is called. + * stack trace and selected registers when kernel_clone() is called. * * For more information on theory of operation of kprobes, see - * Documentation/kprobes.txt + * Documentation/trace/kprobes.rst * * You will see the trace data in /var/log/messages and on the console - * whenever _do_fork() is invoked to create a new process. + * whenever kernel_clone() is invoked to create a new process. */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include <linux/kernel.h> #include <linux/module.h> #include <linux/kprobes.h> -#define MAX_SYMBOL_LEN 64 -static char symbol[MAX_SYMBOL_LEN] = "_do_fork"; -module_param_string(symbol, symbol, sizeof(symbol), 0644); +static char symbol[KSYM_NAME_LEN] = "kernel_clone"; +module_param_string(symbol, symbol, KSYM_NAME_LEN, 0644); /* For each probe you need to allocate a kprobe structure */ static struct kprobe kp = { @@ -24,70 +25,81 @@ static struct kprobe kp = { }; /* kprobe pre_handler: called just before the probed instruction is executed */ -static int handler_pre(struct kprobe *p, struct pt_regs *regs) +static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs) { #ifdef CONFIG_X86 - pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, ip = %lx, flags = 0x%lx\n", p->symbol_name, p->addr, regs->ip, regs->flags); #endif #ifdef CONFIG_PPC - pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n", p->symbol_name, p->addr, regs->nip, regs->msr); #endif #ifdef CONFIG_MIPS - pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n", p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status); #endif -#ifdef CONFIG_TILEGX - pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, ex1 = 0x%lx\n", - p->symbol_name, p->addr, regs->pc, regs->ex1); -#endif #ifdef CONFIG_ARM64 - pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx," - " pstate = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, pstate = 0x%lx\n", p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate); #endif +#ifdef CONFIG_ARM + pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, cpsr = 0x%lx\n", + p->symbol_name, p->addr, (long)regs->ARM_pc, (long)regs->ARM_cpsr); +#endif +#ifdef CONFIG_RISCV + pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, status = 0x%lx\n", + p->symbol_name, p->addr, regs->epc, regs->status); +#endif +#ifdef CONFIG_S390 + pr_info("<%s> p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->psw.addr, regs->flags); +#endif +#ifdef CONFIG_LOONGARCH + pr_info("<%s> p->addr = 0x%p, era = 0x%lx, estat = 0x%lx\n", + p->symbol_name, p->addr, regs->csr_era, regs->csr_estat); +#endif /* A dump_stack() here will give a stack backtrace */ return 0; } /* kprobe post_handler: called after the probed instruction is executed */ -static void handler_post(struct kprobe *p, struct pt_regs *regs, +static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { #ifdef CONFIG_X86 - pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, flags = 0x%lx\n", p->symbol_name, p->addr, regs->flags); #endif #ifdef CONFIG_PPC - pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, msr = 0x%lx\n", p->symbol_name, p->addr, regs->msr); #endif #ifdef CONFIG_MIPS - pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, status = 0x%lx\n", p->symbol_name, p->addr, regs->cp0_status); #endif -#ifdef CONFIG_TILEGX - pr_info("<%s> post_handler: p->addr = 0x%p, ex1 = 0x%lx\n", - p->symbol_name, p->addr, regs->ex1); -#endif #ifdef CONFIG_ARM64 - pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n", + pr_info("<%s> p->addr = 0x%p, pstate = 0x%lx\n", p->symbol_name, p->addr, (long)regs->pstate); #endif -} - -/* - * fault_handler: this is called if an exception is generated for any - * instruction within the pre- or post-handler, or when Kprobes - * single-steps the probed instruction. - */ -static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr) -{ - pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr); - /* Return 0 because we don't handle the fault. */ - return 0; +#ifdef CONFIG_ARM + pr_info("<%s> p->addr = 0x%p, cpsr = 0x%lx\n", + p->symbol_name, p->addr, (long)regs->ARM_cpsr); +#endif +#ifdef CONFIG_RISCV + pr_info("<%s> p->addr = 0x%p, status = 0x%lx\n", + p->symbol_name, p->addr, regs->status); +#endif +#ifdef CONFIG_S390 + pr_info("<%s> p->addr, 0x%p, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->flags); +#endif +#ifdef CONFIG_LOONGARCH + pr_info("<%s> p->addr = 0x%p, estat = 0x%lx\n", + p->symbol_name, p->addr, regs->csr_estat); +#endif } static int __init kprobe_init(void) @@ -95,7 +107,6 @@ static int __init kprobe_init(void) int ret; kp.pre_handler = handler_pre; kp.post_handler = handler_post; - kp.fault_handler = handler_fault; ret = register_kprobe(&kp); if (ret < 0) { @@ -114,4 +125,5 @@ static void __exit kprobe_exit(void) module_init(kprobe_init) module_exit(kprobe_exit) +MODULE_DESCRIPTION("sample kernel module showing the use of kprobes"); MODULE_LICENSE("GPL"); diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c index 7f9060f435cd..65d6dcafd742 100644 --- a/samples/kprobes/kretprobe_example.c +++ b/samples/kprobes/kretprobe_example.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kretprobe_example.c * @@ -7,10 +8,10 @@ * * usage: insmod kretprobe_example.ko func=<func_name> * - * If no func_name is specified, _do_fork is instrumented + * If no func_name is specified, kernel_clone is instrumented * * For more information on theory of operation of kretprobes, see - * Documentation/kprobes.txt + * Documentation/trace/kprobes.rst * * Build and insert the kernel module as done in the kprobe example. * You will see the trace data in /var/log/messages and on the console @@ -22,11 +23,10 @@ #include <linux/module.h> #include <linux/kprobes.h> #include <linux/ktime.h> -#include <linux/limits.h> #include <linux/sched.h> -static char func_name[NAME_MAX] = "_do_fork"; -module_param_string(func, func_name, NAME_MAX, S_IRUGO); +static char func_name[KSYM_NAME_LEN] = "kernel_clone"; +module_param_string(func, func_name, KSYM_NAME_LEN, 0644); MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the" " function's execution time"); @@ -35,7 +35,7 @@ struct my_data { ktime_t entry_stamp; }; -/* Here we use the entry_hanlder to timestamp function entry */ +/* Here we use the entry_handler to timestamp function entry */ static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { struct my_data *data; @@ -47,6 +47,7 @@ static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) data->entry_stamp = ktime_get(); return 0; } +NOKPROBE_SYMBOL(entry_handler); /* * Return-probe handler: Log the return value and duration. Duration may turn @@ -66,6 +67,7 @@ static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) func_name, retval, (long long)delta); return 0; } +NOKPROBE_SYMBOL(ret_handler); static struct kretprobe my_kretprobe = { .handler = ret_handler, @@ -83,7 +85,7 @@ static int __init kretprobe_init(void) ret = register_kretprobe(&my_kretprobe); if (ret < 0) { pr_err("register_kretprobe failed, returned %d\n", ret); - return -1; + return ret; } pr_info("Planted return probe at %s: %p\n", my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr); @@ -102,4 +104,5 @@ static void __exit kretprobe_exit(void) module_init(kretprobe_init) module_exit(kretprobe_exit) +MODULE_DESCRIPTION("sample kernel module showing the use of return probes"); MODULE_LICENSE("GPL"); diff --git a/samples/landlock/.gitignore b/samples/landlock/.gitignore new file mode 100644 index 000000000000..f43668b2d318 --- /dev/null +++ b/samples/landlock/.gitignore @@ -0,0 +1 @@ +/sandboxer diff --git a/samples/landlock/Makefile b/samples/landlock/Makefile new file mode 100644 index 000000000000..5d601e51c2eb --- /dev/null +++ b/samples/landlock/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: BSD-3-Clause + +userprogs-always-y := sandboxer + +userccflags += -I usr/include + +.PHONY: all clean + +all: + $(MAKE) -C ../.. samples/landlock/ + +clean: + $(MAKE) -C ../.. M=samples/landlock/ clean diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c new file mode 100644 index 000000000000..e7af02f98208 --- /dev/null +++ b/samples/landlock/sandboxer.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* + * Simple Landlock sandbox manager able to execute a process restricted by + * user-defined file system and network access control policies. + * + * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> + * Copyright © 2020 ANSSI + */ + +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include <arpa/inet.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/landlock.h> +#include <linux/socket.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <stdbool.h> + +#if defined(__GLIBC__) +#include <linux/prctl.h> +#endif + +#ifndef landlock_create_ruleset +static inline int +landlock_create_ruleset(const struct landlock_ruleset_attr *const attr, + const size_t size, const __u32 flags) +{ + return syscall(__NR_landlock_create_ruleset, attr, size, flags); +} +#endif + +#ifndef landlock_add_rule +static inline int landlock_add_rule(const int ruleset_fd, + const enum landlock_rule_type rule_type, + const void *const rule_attr, + const __u32 flags) +{ + return syscall(__NR_landlock_add_rule, ruleset_fd, rule_type, rule_attr, + flags); +} +#endif + +#ifndef landlock_restrict_self +static inline int landlock_restrict_self(const int ruleset_fd, + const __u32 flags) +{ + return syscall(__NR_landlock_restrict_self, ruleset_fd, flags); +} +#endif + +#define ENV_FS_RO_NAME "LL_FS_RO" +#define ENV_FS_RW_NAME "LL_FS_RW" +#define ENV_TCP_BIND_NAME "LL_TCP_BIND" +#define ENV_TCP_CONNECT_NAME "LL_TCP_CONNECT" +#define ENV_SCOPED_NAME "LL_SCOPED" +#define ENV_FORCE_LOG_NAME "LL_FORCE_LOG" +#define ENV_DELIMITER ":" + +static int str2num(const char *numstr, __u64 *num_dst) +{ + char *endptr = NULL; + int err = 0; + __u64 num; + + errno = 0; + num = strtoull(numstr, &endptr, 10); + if (errno != 0) + err = errno; + /* Was the string empty, or not entirely parsed successfully? */ + else if ((*numstr == '\0') || (*endptr != '\0')) + err = EINVAL; + else + *num_dst = num; + + return err; +} + +static int parse_path(char *env_path, const char ***const path_list) +{ + int i, num_paths = 0; + + if (env_path) { + num_paths++; + for (i = 0; env_path[i]; i++) { + if (env_path[i] == ENV_DELIMITER[0]) + num_paths++; + } + } + *path_list = malloc(num_paths * sizeof(**path_list)); + if (!*path_list) + return -1; + + for (i = 0; i < num_paths; i++) + (*path_list)[i] = strsep(&env_path, ENV_DELIMITER); + + return num_paths; +} + +/* clang-format off */ + +#define ACCESS_FILE ( \ + LANDLOCK_ACCESS_FS_EXECUTE | \ + LANDLOCK_ACCESS_FS_WRITE_FILE | \ + LANDLOCK_ACCESS_FS_READ_FILE | \ + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL_DEV) + +/* clang-format on */ + +static int populate_ruleset_fs(const char *const env_var, const int ruleset_fd, + const __u64 allowed_access) +{ + int num_paths, i, ret = 1; + char *env_path_name; + const char **path_list = NULL; + struct landlock_path_beneath_attr path_beneath = { + .parent_fd = -1, + }; + + env_path_name = getenv(env_var); + if (!env_path_name) { + /* Prevents users to forget a setting. */ + fprintf(stderr, "Missing environment variable %s\n", env_var); + return 1; + } + env_path_name = strdup(env_path_name); + unsetenv(env_var); + num_paths = parse_path(env_path_name, &path_list); + if (num_paths < 0) { + fprintf(stderr, "Failed to allocate memory\n"); + goto out_free_name; + } + if (num_paths == 1 && path_list[0][0] == '\0') { + /* + * Allows to not use all possible restrictions (e.g. use + * LL_FS_RO without LL_FS_RW). + */ + ret = 0; + goto out_free_name; + } + + for (i = 0; i < num_paths; i++) { + struct stat statbuf; + + path_beneath.parent_fd = open(path_list[i], O_PATH | O_CLOEXEC); + if (path_beneath.parent_fd < 0) { + fprintf(stderr, "Failed to open \"%s\": %s\n", + path_list[i], strerror(errno)); + continue; + } + if (fstat(path_beneath.parent_fd, &statbuf)) { + fprintf(stderr, "Failed to stat \"%s\": %s\n", + path_list[i], strerror(errno)); + close(path_beneath.parent_fd); + goto out_free_name; + } + path_beneath.allowed_access = allowed_access; + if (!S_ISDIR(statbuf.st_mode)) + path_beneath.allowed_access &= ACCESS_FILE; + if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)) { + fprintf(stderr, + "Failed to update the ruleset with \"%s\": %s\n", + path_list[i], strerror(errno)); + close(path_beneath.parent_fd); + goto out_free_name; + } + close(path_beneath.parent_fd); + } + ret = 0; + +out_free_name: + free(path_list); + free(env_path_name); + return ret; +} + +static int populate_ruleset_net(const char *const env_var, const int ruleset_fd, + const __u64 allowed_access) +{ + int ret = 1; + char *env_port_name, *env_port_name_next, *strport; + struct landlock_net_port_attr net_port = { + .allowed_access = allowed_access, + }; + + env_port_name = getenv(env_var); + if (!env_port_name) + return 0; + env_port_name = strdup(env_port_name); + unsetenv(env_var); + + env_port_name_next = env_port_name; + while ((strport = strsep(&env_port_name_next, ENV_DELIMITER))) { + __u64 port; + + if (strcmp(strport, "") == 0) + continue; + + if (str2num(strport, &port)) { + fprintf(stderr, "Failed to parse port at \"%s\"\n", + strport); + goto out_free_name; + } + net_port.port = port; + if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &net_port, 0)) { + fprintf(stderr, + "Failed to update the ruleset with port \"%llu\": %s\n", + net_port.port, strerror(errno)); + goto out_free_name; + } + } + ret = 0; + +out_free_name: + free(env_port_name); + return ret; +} + +/* Returns true on error, false otherwise. */ +static bool check_ruleset_scope(const char *const env_var, + struct landlock_ruleset_attr *ruleset_attr) +{ + char *env_type_scope, *env_type_scope_next, *ipc_scoping_name; + bool error = false; + bool abstract_scoping = false; + bool signal_scoping = false; + + /* Scoping is not supported by Landlock ABI */ + if (!(ruleset_attr->scoped & + (LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | LANDLOCK_SCOPE_SIGNAL))) + goto out_unset; + + env_type_scope = getenv(env_var); + /* Scoping is not supported by the user */ + if (!env_type_scope || strcmp("", env_type_scope) == 0) + goto out_unset; + + env_type_scope = strdup(env_type_scope); + env_type_scope_next = env_type_scope; + while ((ipc_scoping_name = + strsep(&env_type_scope_next, ENV_DELIMITER))) { + if (strcmp("a", ipc_scoping_name) == 0 && !abstract_scoping) { + abstract_scoping = true; + } else if (strcmp("s", ipc_scoping_name) == 0 && + !signal_scoping) { + signal_scoping = true; + } else { + fprintf(stderr, "Unknown or duplicate scope \"%s\"\n", + ipc_scoping_name); + error = true; + goto out_free_name; + } + } + +out_free_name: + free(env_type_scope); + +out_unset: + if (!abstract_scoping) + ruleset_attr->scoped &= ~LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET; + if (!signal_scoping) + ruleset_attr->scoped &= ~LANDLOCK_SCOPE_SIGNAL; + + unsetenv(env_var); + return error; +} + +/* clang-format off */ + +#define ACCESS_FS_ROUGHLY_READ ( \ + LANDLOCK_ACCESS_FS_EXECUTE | \ + LANDLOCK_ACCESS_FS_READ_FILE | \ + LANDLOCK_ACCESS_FS_READ_DIR) + +#define ACCESS_FS_ROUGHLY_WRITE ( \ + LANDLOCK_ACCESS_FS_WRITE_FILE | \ + LANDLOCK_ACCESS_FS_REMOVE_DIR | \ + LANDLOCK_ACCESS_FS_REMOVE_FILE | \ + LANDLOCK_ACCESS_FS_MAKE_CHAR | \ + LANDLOCK_ACCESS_FS_MAKE_DIR | \ + LANDLOCK_ACCESS_FS_MAKE_REG | \ + LANDLOCK_ACCESS_FS_MAKE_SOCK | \ + LANDLOCK_ACCESS_FS_MAKE_FIFO | \ + LANDLOCK_ACCESS_FS_MAKE_BLOCK | \ + LANDLOCK_ACCESS_FS_MAKE_SYM | \ + LANDLOCK_ACCESS_FS_REFER | \ + LANDLOCK_ACCESS_FS_TRUNCATE | \ + LANDLOCK_ACCESS_FS_IOCTL_DEV) + +/* clang-format on */ + +#define LANDLOCK_ABI_LAST 7 + +#define XSTR(s) #s +#define STR(s) XSTR(s) + +/* clang-format off */ + +static const char help[] = + "usage: " ENV_FS_RO_NAME "=\"...\" " ENV_FS_RW_NAME "=\"...\" " + "[other environment variables] %1$s <cmd> [args]...\n" + "\n" + "Execute the given command in a restricted environment.\n" + "Multi-valued settings (lists of ports, paths, scopes) are colon-delimited.\n" + "\n" + "Mandatory settings:\n" + "* " ENV_FS_RO_NAME ": paths allowed to be used in a read-only way\n" + "* " ENV_FS_RW_NAME ": paths allowed to be used in a read-write way\n" + "\n" + "Optional settings (when not set, their associated access check " + "is always allowed, which is different from an empty string which " + "means an empty list):\n" + "* " ENV_TCP_BIND_NAME ": ports allowed to bind (server)\n" + "* " ENV_TCP_CONNECT_NAME ": ports allowed to connect (client)\n" + "* " ENV_SCOPED_NAME ": actions denied on the outside of the landlock domain\n" + " - \"a\" to restrict opening abstract unix sockets\n" + " - \"s\" to restrict sending signals\n" + "\n" + "A sandboxer should not log denied access requests to avoid spamming logs, " + "but to test audit we can set " ENV_FORCE_LOG_NAME "=1\n" + "\n" + "Example:\n" + ENV_FS_RO_NAME "=\"${PATH}:/lib:/usr:/proc:/etc:/dev/urandom\" " + ENV_FS_RW_NAME "=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" " + ENV_TCP_BIND_NAME "=\"9418\" " + ENV_TCP_CONNECT_NAME "=\"80:443\" " + ENV_SCOPED_NAME "=\"a:s\" " + "%1$s bash -i\n" + "\n" + "This sandboxer can use Landlock features up to ABI version " + STR(LANDLOCK_ABI_LAST) ".\n"; + +/* clang-format on */ + +int main(const int argc, char *const argv[], char *const *const envp) +{ + const char *cmd_path; + char *const *cmd_argv; + int ruleset_fd, abi; + char *env_port_name, *env_force_log; + __u64 access_fs_ro = ACCESS_FS_ROUGHLY_READ, + access_fs_rw = ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_WRITE; + + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = access_fs_rw, + .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | + LANDLOCK_ACCESS_NET_CONNECT_TCP, + .scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | + LANDLOCK_SCOPE_SIGNAL, + }; + int supported_restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON; + int set_restrict_flags = 0; + + if (argc < 2) { + fprintf(stderr, help, argv[0]); + return 1; + } + + abi = landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION); + if (abi < 0) { + const int err = errno; + + perror("Failed to check Landlock compatibility"); + switch (err) { + case ENOSYS: + fprintf(stderr, + "Hint: Landlock is not supported by the current kernel. " + "To support it, build the kernel with " + "CONFIG_SECURITY_LANDLOCK=y and prepend " + "\"landlock,\" to the content of CONFIG_LSM.\n"); + break; + case EOPNOTSUPP: + fprintf(stderr, + "Hint: Landlock is currently disabled. " + "It can be enabled in the kernel configuration by " + "prepending \"landlock,\" to the content of CONFIG_LSM, " + "or at boot time by setting the same content to the " + "\"lsm\" kernel parameter.\n"); + break; + } + return 1; + } + + /* Best-effort security. */ + switch (abi) { + case 1: + /* + * Removes LANDLOCK_ACCESS_FS_REFER for ABI < 2 + * + * Note: The "refer" operations (file renaming and linking + * across different directories) are always forbidden when using + * Landlock with ABI 1. + * + * If only ABI 1 is available, this sandboxer knowingly forbids + * refer operations. + * + * If a program *needs* to do refer operations after enabling + * Landlock, it can not use Landlock at ABI level 1. To be + * compatible with different kernel versions, such programs + * should then fall back to not restrict themselves at all if + * the running kernel only supports ABI 1. + */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_REFER; + __attribute__((fallthrough)); + case 2: + /* Removes LANDLOCK_ACCESS_FS_TRUNCATE for ABI < 3 */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_TRUNCATE; + __attribute__((fallthrough)); + case 3: + /* Removes network support for ABI < 4 */ + ruleset_attr.handled_access_net &= + ~(LANDLOCK_ACCESS_NET_BIND_TCP | + LANDLOCK_ACCESS_NET_CONNECT_TCP); + __attribute__((fallthrough)); + case 4: + /* Removes LANDLOCK_ACCESS_FS_IOCTL_DEV for ABI < 5 */ + ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL_DEV; + + __attribute__((fallthrough)); + case 5: + /* Removes LANDLOCK_SCOPE_* for ABI < 6 */ + ruleset_attr.scoped &= ~(LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | + LANDLOCK_SCOPE_SIGNAL); + __attribute__((fallthrough)); + case 6: + /* Removes LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON for ABI < 7 */ + supported_restrict_flags &= + ~LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON; + + /* Must be printed for any ABI < LANDLOCK_ABI_LAST. */ + fprintf(stderr, + "Hint: You should update the running kernel " + "to leverage Landlock features " + "provided by ABI version %d (instead of %d).\n", + LANDLOCK_ABI_LAST, abi); + __attribute__((fallthrough)); + case LANDLOCK_ABI_LAST: + break; + default: + fprintf(stderr, + "Hint: You should update this sandboxer " + "to leverage Landlock features " + "provided by ABI version %d (instead of %d).\n", + abi, LANDLOCK_ABI_LAST); + } + access_fs_ro &= ruleset_attr.handled_access_fs; + access_fs_rw &= ruleset_attr.handled_access_fs; + + /* Removes bind access attribute if not supported by a user. */ + env_port_name = getenv(ENV_TCP_BIND_NAME); + if (!env_port_name) { + ruleset_attr.handled_access_net &= + ~LANDLOCK_ACCESS_NET_BIND_TCP; + } + /* Removes connect access attribute if not supported by a user. */ + env_port_name = getenv(ENV_TCP_CONNECT_NAME); + if (!env_port_name) { + ruleset_attr.handled_access_net &= + ~LANDLOCK_ACCESS_NET_CONNECT_TCP; + } + + if (check_ruleset_scope(ENV_SCOPED_NAME, &ruleset_attr)) + return 1; + + /* Enables optional logs. */ + env_force_log = getenv(ENV_FORCE_LOG_NAME); + if (env_force_log) { + if (strcmp(env_force_log, "1") != 0) { + fprintf(stderr, "Unknown value for " ENV_FORCE_LOG_NAME + " (only \"1\" is handled)\n"); + return 1; + } + if (!(supported_restrict_flags & + LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON)) { + fprintf(stderr, + "Audit logs not supported by current kernel\n"); + return 1; + } + set_restrict_flags |= LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON; + unsetenv(ENV_FORCE_LOG_NAME); + } + + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + if (ruleset_fd < 0) { + perror("Failed to create a ruleset"); + return 1; + } + + if (populate_ruleset_fs(ENV_FS_RO_NAME, ruleset_fd, access_fs_ro)) { + goto err_close_ruleset; + } + if (populate_ruleset_fs(ENV_FS_RW_NAME, ruleset_fd, access_fs_rw)) { + goto err_close_ruleset; + } + + if (populate_ruleset_net(ENV_TCP_BIND_NAME, ruleset_fd, + LANDLOCK_ACCESS_NET_BIND_TCP)) { + goto err_close_ruleset; + } + if (populate_ruleset_net(ENV_TCP_CONNECT_NAME, ruleset_fd, + LANDLOCK_ACCESS_NET_CONNECT_TCP)) { + goto err_close_ruleset; + } + + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + perror("Failed to restrict privileges"); + goto err_close_ruleset; + } + if (landlock_restrict_self(ruleset_fd, set_restrict_flags)) { + perror("Failed to enforce ruleset"); + goto err_close_ruleset; + } + close(ruleset_fd); + + cmd_path = argv[1]; + cmd_argv = argv + 1; + fprintf(stderr, "Executing the sandboxed command...\n"); + execvpe(cmd_path, cmd_argv, envp); + fprintf(stderr, "Failed to execute \"%s\": %s\n", cmd_path, + strerror(errno)); + fprintf(stderr, "Hint: access to the binary, the interpreter or " + "shared libraries may be denied.\n"); + return 1; + +err_close_ruleset: + close(ruleset_fd); + return 1; +} diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile index 10319d7ea0b1..9f853eeb6140 100644 --- a/samples/livepatch/Makefile +++ b/samples/livepatch/Makefile @@ -1 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c new file mode 100644 index 000000000000..fadc2a85cb35 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-busymod.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module + * + * + * Purpose + * ------- + * + * Simple module to demonstrate livepatch (un)patching callbacks. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-callbacks-mod.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/workqueue.h> +#include <linux/delay.h> + +static int sleep_secs; +module_param(sleep_secs, int, 0644); +MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)"); + +static void busymod_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(work, busymod_work_func); + +static void busymod_work_func(struct work_struct *work) +{ + pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs); + msleep(sleep_secs * 1000); + pr_info("%s exit\n", __func__); +} + +static int livepatch_callbacks_mod_init(void) +{ + pr_info("%s\n", __func__); + schedule_delayed_work(&work, 0); + return 0; +} + +static void livepatch_callbacks_mod_exit(void) +{ + cancel_delayed_work_sync(&work); + pr_info("%s\n", __func__); +} + +module_init(livepatch_callbacks_mod_init); +module_exit(livepatch_callbacks_mod_exit); +MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks, support module"); +MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c new file mode 100644 index 000000000000..9e69d9caed25 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-demo.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo + * + * + * Purpose + * ------- + * + * Demonstration of registering livepatch (un)patching callbacks. + * + * + * Usage + * ----- + * + * Step 1 - load the simple module + * + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * + * + * Step 2 - load the demonstration livepatch (with callbacks) + * + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * + * Step 3 - cleanup + * + * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + * rmmod livepatch_callbacks_demo + * rmmod livepatch_callbacks_mod + * + * Watch dmesg output to see livepatch enablement, callback execution + * and patching operations for both vmlinux and module targets. + * + * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and + * livepatch-callbacks-demo.ko to observe what happens when a + * target module is loaded after a livepatch with callbacks. + * + * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch + * callback return status. Try setting up a non-zero status + * such as -19 (-ENODEV): + * + * # Load demo livepatch, vmlinux is patched + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * # Setup next pre-patch callback to return -ENODEV + * echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret + * + * # Module loader refuses to load the target module + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device + * + * NOTE: There is a second target module, + * livepatch-callbacks-busymod.ko, available for experimenting + * with livepatch (un)patch callbacks. This module contains + * a 'sleep_secs' parameter that parks the module on one of the + * functions that the livepatch demo module wants to patch. + * Modifying this value and tweaking the order of module loads can + * effectively demonstrate stalled patch transitions: + * + * # Load a target module, let it park on 'busymod_work_func' for + * # thirty seconds + * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30 + * + * # Meanwhile load the livepatch + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * # ... then load and unload another target module while the + * # transition is in progress + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * rmmod samples/livepatch/livepatch-callbacks-mod.ko + * + * # Finally cleanup + * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + * rmmod samples/livepatch/livepatch-callbacks-demo.ko + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> + +static int pre_patch_ret; +module_param(pre_patch_ret, int, 0644); +MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)"); + +static const char *const module_state[] = { + [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state", + [MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Full formed, running module_init", + [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away", + [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up", +}; + +static void callback_info(const char *callback, struct klp_object *obj) +{ + if (obj->mod) + pr_info("%s: %s -> %s\n", callback, obj->mod->name, + module_state[obj->mod->state]); + else + pr_info("%s: vmlinux\n", callback); +} + +/* Executed on object patching (ie, patch enablement) */ +static int pre_patch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); + return pre_patch_ret; +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void post_patch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void pre_unpatch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void post_unpatch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +static void patched_work_func(struct work_struct *work) +{ + pr_info("%s\n", __func__); +} + +static struct klp_func no_funcs[] = { + { } +}; + +static struct klp_func busymod_funcs[] = { + { + .old_name = "busymod_work_func", + .new_func = patched_work_func, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = NULL, /* vmlinux */ + .funcs = no_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { + .name = "livepatch_callbacks_mod", + .funcs = no_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { + .name = "livepatch_callbacks_busymod", + .funcs = busymod_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_callbacks_demo_init(void) +{ + return klp_enable_patch(&patch); +} + +static void livepatch_callbacks_demo_exit(void) +{ +} + +module_init(livepatch_callbacks_demo_init); +module_exit(livepatch_callbacks_demo_exit); +MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks"); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c new file mode 100644 index 000000000000..d1851b471ad9 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-mod.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-callbacks-mod.c - (un)patching callbacks demo support module + * + * + * Purpose + * ------- + * + * Simple module to demonstrate livepatch (un)patching callbacks. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-callbacks-demo.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +static int livepatch_callbacks_mod_init(void) +{ + pr_info("%s\n", __func__); + return 0; +} + +static void livepatch_callbacks_mod_exit(void) +{ + pr_info("%s\n", __func__); +} + +module_init(livepatch_callbacks_mod_init); +module_exit(livepatch_callbacks_mod_exit); +MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks, support module"); +MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c index 84795223f15f..5263a2f31c48 100644 --- a/samples/livepatch/livepatch-sample.c +++ b/samples/livepatch/livepatch-sample.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * livepatch-sample.c - Kernel Live Patching Sample Module * * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -69,40 +57,15 @@ static struct klp_patch patch = { static int livepatch_init(void) { - int ret; - - if (!klp_have_reliable_stack() && !patch.immediate) { - /* - * WARNING: Be very careful when using 'patch.immediate' in - * your patches. It's ok to use it for simple patches like - * this, but for more complex patches which change function - * semantics, locking semantics, or data structures, it may not - * be safe. Use of this option will also prevent removal of - * the patch. - * - * See Documentation/livepatch/livepatch.txt for more details. - */ - patch.immediate = true; - pr_notice("The consistency model isn't supported for your architecture. Bypassing safety mechanisms and applying the patch immediately.\n"); - } - - ret = klp_register_patch(&patch); - if (ret) - return ret; - ret = klp_enable_patch(&patch); - if (ret) { - WARN_ON(klp_unregister_patch(&patch)); - return ret; - } - return 0; + return klp_enable_patch(&patch); } static void livepatch_exit(void) { - WARN_ON(klp_unregister_patch(&patch)); } module_init(livepatch_init); module_exit(livepatch_exit); +MODULE_DESCRIPTION("Kernel Live Patching Sample Module"); MODULE_LICENSE("GPL"); MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c new file mode 100644 index 000000000000..cbf68ca40097 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix1.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-shadow-fix1.c - Shadow variables, livepatch demo + * + * Purpose + * ------- + * + * Fixes the memory leak introduced in livepatch-shadow-mod through the + * use of a shadow variable. This fix demonstrates the "extending" of + * short-lived data structures by patching its allocation and release + * functions. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-shadow-mod.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> +#include <linux/slab.h> + +/* Shadow variable enums */ +#define SV_LEAK 1 + +/* Allocate new dummies every second */ +#define ALLOC_PERIOD 1 +/* Check for expired dummies after a few new ones have been allocated */ +#define CLEANUP_PERIOD (3 * ALLOC_PERIOD) +/* Dummies expire after a few cleanup instances */ +#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +/* + * The constructor makes more sense together with klp_shadow_get_or_alloc(). + * In this example, it would be safe to assign the pointer also to the shadow + * variable returned by klp_shadow_alloc(). But we wanted to show the more + * complicated use of the API. + */ +static int shadow_leak_ctor(void *obj, void *shadow_data, void *ctor_data) +{ + int **shadow_leak = shadow_data; + int **leak = ctor_data; + + if (!ctor_data) + return -EINVAL; + + *shadow_leak = *leak; + return 0; +} + +static struct dummy *livepatch_fix1_dummy_alloc(void) +{ + struct dummy *d; + int *leak; + int **shadow_leak; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return NULL; + + d->jiffies_expire = jiffies + secs_to_jiffies(EXPIRE_PERIOD); + + /* + * Patch: save the extra memory location into a SV_LEAK shadow + * variable. A patched dummy_free routine can later fetch this + * pointer to handle resource release. + */ + leak = kzalloc(sizeof(*leak), GFP_KERNEL); + if (!leak) + goto err_leak; + + shadow_leak = klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL, + shadow_leak_ctor, &leak); + if (!shadow_leak) { + pr_err("%s: failed to allocate shadow variable for the leaking pointer: dummy @ %p, leak @ %p\n", + __func__, d, leak); + goto err_shadow; + } + + pr_info("%s: dummy @ %p, expires @ %lx\n", + __func__, d, d->jiffies_expire); + + return d; + +err_shadow: + kfree(leak); +err_leak: + kfree(d); + return NULL; +} + +static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data) +{ + void *d = obj; + int **shadow_leak = shadow_data; + + pr_info("%s: dummy @ %p, prevented leak @ %p\n", + __func__, d, *shadow_leak); + kfree(*shadow_leak); +} + +static void livepatch_fix1_dummy_free(struct dummy *d) +{ + int **shadow_leak; + + /* + * Patch: fetch the saved SV_LEAK shadow variable, detach and + * free it. Note: handle cases where this shadow variable does + * not exist (ie, dummy structures allocated before this livepatch + * was loaded.) + */ + shadow_leak = klp_shadow_get(d, SV_LEAK); + if (shadow_leak) + klp_shadow_free(d, SV_LEAK, livepatch_fix1_dummy_leak_dtor); + else + pr_info("%s: dummy @ %p leaked!\n", __func__, d); + + kfree(d); +} + +static struct klp_func funcs[] = { + { + .old_name = "dummy_alloc", + .new_func = livepatch_fix1_dummy_alloc, + }, + { + .old_name = "dummy_free", + .new_func = livepatch_fix1_dummy_free, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = "livepatch_shadow_mod", + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_shadow_fix1_init(void) +{ + return klp_enable_patch(&patch); +} + +static void livepatch_shadow_fix1_exit(void) +{ + /* Cleanup any existing SV_LEAK shadow variables */ + klp_shadow_free_all(SV_LEAK, livepatch_fix1_dummy_leak_dtor); +} + +module_init(livepatch_shadow_fix1_init); +module_exit(livepatch_shadow_fix1_exit); +MODULE_DESCRIPTION("Live patching demo for shadow variables"); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c new file mode 100644 index 000000000000..b99122cb221f --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix2.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-shadow-fix2.c - Shadow variables, livepatch demo + * + * Purpose + * ------- + * + * Adds functionality to livepatch-shadow-mod's in-flight data + * structures through a shadow variable. The livepatch patches a + * routine that periodically inspects data structures, incrementing a + * per-data-structure counter, creating the counter if needed. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-shadow-mod.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> +#include <linux/slab.h> + +/* Shadow variable enums */ +#define SV_LEAK 1 +#define SV_COUNTER 2 + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +static bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies) +{ + int *shadow_count; + + /* + * Patch: handle in-flight dummy structures, if they do not + * already have a SV_COUNTER shadow variable, then attach a + * new one. + */ + shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER, + sizeof(*shadow_count), GFP_NOWAIT, + NULL, NULL); + if (shadow_count) + *shadow_count += 1; + + return time_after(jiffies, d->jiffies_expire); +} + +static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data) +{ + void *d = obj; + int **shadow_leak = shadow_data; + + pr_info("%s: dummy @ %p, prevented leak @ %p\n", + __func__, d, *shadow_leak); + kfree(*shadow_leak); +} + +static void livepatch_fix2_dummy_free(struct dummy *d) +{ + int **shadow_leak; + int *shadow_count; + + /* Patch: copy the memory leak patch from the fix1 module. */ + shadow_leak = klp_shadow_get(d, SV_LEAK); + if (shadow_leak) + klp_shadow_free(d, SV_LEAK, livepatch_fix2_dummy_leak_dtor); + else + pr_info("%s: dummy @ %p leaked!\n", __func__, d); + + /* + * Patch: fetch the SV_COUNTER shadow variable and display + * the final count. Detach the shadow variable. + */ + shadow_count = klp_shadow_get(d, SV_COUNTER); + if (shadow_count) { + pr_info("%s: dummy @ %p, check counter = %d\n", + __func__, d, *shadow_count); + klp_shadow_free(d, SV_COUNTER, NULL); + } + + kfree(d); +} + +static struct klp_func funcs[] = { + { + .old_name = "dummy_check", + .new_func = livepatch_fix2_dummy_check, + }, + { + .old_name = "dummy_free", + .new_func = livepatch_fix2_dummy_free, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = "livepatch_shadow_mod", + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_shadow_fix2_init(void) +{ + return klp_enable_patch(&patch); +} + +static void livepatch_shadow_fix2_exit(void) +{ + /* Cleanup any existing SV_COUNTER shadow variables */ + klp_shadow_free_all(SV_COUNTER, NULL); +} + +module_init(livepatch_shadow_fix2_init); +module_exit(livepatch_shadow_fix2_exit); +MODULE_DESCRIPTION("Live patching demo for shadow variables"); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c new file mode 100644 index 000000000000..5d83ad5a8118 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-mod.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + */ + +/* + * livepatch-shadow-mod.c - Shadow variables, buggy module demo + * + * Purpose + * ------- + * + * As a demonstration of livepatch shadow variable API, this module + * introduces memory leak behavior that livepatch modules + * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and + * enhance. + * + * WARNING - even though the livepatch-shadow-fix modules patch the + * memory leak, please load these modules at your own risk -- some + * amount of memory may leaked before the bug is patched. + * + * + * Usage + * ----- + * + * Step 1 - Load the buggy demonstration module: + * + * insmod samples/livepatch/livepatch-shadow-mod.ko + * + * Watch dmesg output for a few moments to see new dummy being allocated + * and a periodic cleanup check. (Note: a small amount of memory is + * being leaked.) + * + * + * Step 2 - Load livepatch fix1: + * + * insmod samples/livepatch/livepatch-shadow-fix1.ko + * + * Continue watching dmesg and note that now livepatch_fix1_dummy_free() + * and livepatch_fix1_dummy_alloc() are logging messages about leaked + * memory and eventually leaks prevented. + * + * + * Step 3 - Load livepatch fix2 (on top of fix1): + * + * insmod samples/livepatch/livepatch-shadow-fix2.ko + * + * This module extends functionality through shadow variables, as a new + * "check" counter is added to the dummy structure. Periodic dmesg + * messages will log these as dummies are cleaned up. + * + * + * Step 4 - Cleanup + * + * Unwind the demonstration by disabling the livepatch fix modules, then + * removing them and the demo module: + * + * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled + * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled + * rmmod livepatch-shadow-fix2 + * rmmod livepatch-shadow-fix1 + * rmmod livepatch-shadow-mod + */ + + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/workqueue.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joe Lawrence <joe.lawrence@redhat.com>"); +MODULE_DESCRIPTION("Buggy module for shadow variable demo"); + +/* Allocate new dummies every second */ +#define ALLOC_PERIOD 1 +/* Check for expired dummies after a few new ones have been allocated */ +#define CLEANUP_PERIOD (3 * ALLOC_PERIOD) +/* Dummies expire after a few cleanup instances */ +#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) + +/* + * Keep a list of all the dummies so we can clean up any residual ones + * on module exit + */ +static LIST_HEAD(dummy_list); +static DEFINE_MUTEX(dummy_list_mutex); + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +static __used noinline struct dummy *dummy_alloc(void) +{ + struct dummy *d; + int *leak; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return NULL; + + d->jiffies_expire = jiffies + secs_to_jiffies(EXPIRE_PERIOD); + + /* Oops, forgot to save leak! */ + leak = kzalloc(sizeof(*leak), GFP_KERNEL); + if (!leak) { + kfree(d); + return NULL; + } + + pr_info("%s: dummy @ %p, expires @ %lx\n", + __func__, d, d->jiffies_expire); + + return d; +} + +static __used noinline void dummy_free(struct dummy *d) +{ + pr_info("%s: dummy @ %p, expired = %lx\n", + __func__, d, d->jiffies_expire); + + kfree(d); +} + +static __used noinline bool dummy_check(struct dummy *d, + unsigned long jiffies) +{ + return time_after(jiffies, d->jiffies_expire); +} + +/* + * alloc_work_func: allocates new dummy structures, allocates additional + * memory, aptly named "leak", but doesn't keep + * permanent record of it. + */ + +static void alloc_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func); + +static void alloc_work_func(struct work_struct *work) +{ + struct dummy *d; + + d = dummy_alloc(); + if (!d) + return; + + mutex_lock(&dummy_list_mutex); + list_add(&d->list, &dummy_list); + mutex_unlock(&dummy_list_mutex); + + schedule_delayed_work(&alloc_dwork, secs_to_jiffies(ALLOC_PERIOD)); +} + +/* + * cleanup_work_func: frees dummy structures. Without knownledge of + * "leak", it leaks the additional memory that + * alloc_work_func created. + */ + +static void cleanup_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func); + +static void cleanup_work_func(struct work_struct *work) +{ + struct dummy *d, *tmp; + unsigned long j; + + j = jiffies; + pr_info("%s: jiffies = %lx\n", __func__, j); + + mutex_lock(&dummy_list_mutex); + list_for_each_entry_safe(d, tmp, &dummy_list, list) { + + /* Kick out and free any expired dummies */ + if (dummy_check(d, j)) { + list_del(&d->list); + dummy_free(d); + } + } + mutex_unlock(&dummy_list_mutex); + + schedule_delayed_work(&cleanup_dwork, secs_to_jiffies(CLEANUP_PERIOD)); +} + +static int livepatch_shadow_mod_init(void) +{ + schedule_delayed_work(&alloc_dwork, secs_to_jiffies(ALLOC_PERIOD)); + schedule_delayed_work(&cleanup_dwork, secs_to_jiffies(CLEANUP_PERIOD)); + + return 0; +} + +static void livepatch_shadow_mod_exit(void) +{ + struct dummy *d, *tmp; + + /* Wait for any dummies at work */ + cancel_delayed_work_sync(&alloc_dwork); + cancel_delayed_work_sync(&cleanup_dwork); + + /* Cleanup residual dummies */ + list_for_each_entry_safe(d, tmp, &dummy_list, list) { + list_del(&d->list); + dummy_free(d); + } +} + +module_init(livepatch_shadow_mod_init); +module_exit(livepatch_shadow_mod_exit); diff --git a/samples/mei/.gitignore b/samples/mei/.gitignore index f356b81ca1ec..fe894bcb6a62 100644 --- a/samples/mei/.gitignore +++ b/samples/mei/.gitignore @@ -1 +1,2 @@ -mei-amt-version +# SPDX-License-Identifier: GPL-2.0-only +/mei-amt-version diff --git a/samples/mei/Makefile b/samples/mei/Makefile index 7aac216dc420..c54b8a0ab04e 100644 --- a/samples/mei/Makefile +++ b/samples/mei/Makefile @@ -1,9 +1,5 @@ -CC := $(CROSS_COMPILE)gcc -CFLAGS := -I../../usr/include +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2012-2019, Intel Corporation. All rights reserved. +userprogs-always-y += mei-amt-version -PROGS := mei-amt-version - -all: $(PROGS) - -clean: - rm -fr $(PROGS) +userccflags += -I usr/include diff --git a/samples/mei/mei-amt-version.c b/samples/mei/mei-amt-version.c index 57d0d871dcf7..1d7254bcb44c 100644 --- a/samples/mei/mei-amt-version.c +++ b/samples/mei/mei-amt-version.c @@ -69,11 +69,11 @@ #include <string.h> #include <fcntl.h> #include <sys/ioctl.h> +#include <sys/time.h> #include <unistd.h> #include <errno.h> #include <stdint.h> #include <stdbool.h> -#include <bits/wordsize.h> #include <linux/mei.h> /***************************************************************************** @@ -117,7 +117,7 @@ static bool mei_init(struct mei *me, const uuid_le *guid, me->verbose = verbose; - me->fd = open("/dev/mei", O_RDWR); + me->fd = open("/dev/mei0", O_RDWR); if (me->fd == -1) { mei_err(me, "Cannot establish a handle to the Intel MEI driver\n"); goto err; @@ -154,31 +154,52 @@ err: static ssize_t mei_recv_msg(struct mei *me, unsigned char *buffer, ssize_t len, unsigned long timeout) { + struct timeval tv; + fd_set set; ssize_t rc; + tv.tv_sec = timeout / 1000; + tv.tv_usec = (timeout % 1000) * 1000000; + mei_msg(me, "call read length = %zd\n", len); + FD_ZERO(&set); + FD_SET(me->fd, &set); + rc = select(me->fd + 1, &set, NULL, NULL, &tv); + if (rc > 0 && FD_ISSET(me->fd, &set)) { + mei_msg(me, "have reply\n"); + } else if (rc == 0) { + rc = -1; + mei_err(me, "read failed on timeout\n"); + goto out; + } else { /* rc < 0 */ + rc = errno; + mei_err(me, "read failed on select with status %zd %s\n", + rc, strerror(errno)); + goto out; + } + rc = read(me->fd, buffer, len); if (rc < 0) { mei_err(me, "read failed with status %zd %s\n", rc, strerror(errno)); - mei_deinit(me); - } else { - mei_msg(me, "read succeeded with result %zd\n", rc); + goto out; } + + mei_msg(me, "read succeeded with result %zd\n", rc); + +out: + if (rc < 0) + mei_deinit(me); + return rc; } static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer, ssize_t len, unsigned long timeout) { - struct timeval tv; ssize_t written; ssize_t rc; - fd_set set; - - tv.tv_sec = timeout / 1000; - tv.tv_usec = (timeout % 1000) * 1000000; mei_msg(me, "call write length = %zd\n", len); @@ -189,19 +210,7 @@ static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer, written, strerror(errno)); goto out; } - - FD_ZERO(&set); - FD_SET(me->fd, &set); - rc = select(me->fd + 1 , &set, NULL, NULL, &tv); - if (rc > 0 && FD_ISSET(me->fd, &set)) { - mei_msg(me, "write success\n"); - } else if (rc == 0) { - mei_err(me, "write failed on timeout with status\n"); - goto out; - } else { /* rc < 0 */ - mei_err(me, "write failed on select with status %zd\n", rc); - goto out; - } + mei_msg(me, "write success\n"); rc = written; out: @@ -267,7 +276,7 @@ struct amt_host_if_msg_header { struct amt_host_if_resp_header { struct amt_host_if_msg_header header; uint32_t status; - unsigned char data[0]; + unsigned char data[]; } __attribute__((packed)); const uuid_le MEI_IAMTHIF = UUID_LE(0x12f80028, 0xb4b7, 0x4b2d, \ @@ -370,7 +379,7 @@ static uint32_t amt_host_if_call(struct amt_host_if *acmd, unsigned int expected_sz) { uint32_t in_buf_sz; - uint32_t out_buf_sz; + ssize_t out_buf_sz; ssize_t written; uint32_t status; struct amt_host_if_resp_header *msg_hdr; diff --git a/samples/mic/mpssd/.gitignore b/samples/mic/mpssd/.gitignore deleted file mode 100644 index 8b7c72f07c92..000000000000 --- a/samples/mic/mpssd/.gitignore +++ /dev/null @@ -1 +0,0 @@ -mpssd diff --git a/samples/mic/mpssd/Makefile b/samples/mic/mpssd/Makefile deleted file mode 100644 index 3e3ef91fed6b..000000000000 --- a/samples/mic/mpssd/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -ifndef CROSS_COMPILE -uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) - -ifeq ($(ARCH),x86) - -PROGS := mpssd -CC = $(CROSS_COMPILE)gcc -CFLAGS := -I../../../usr/include -I../../../tools/include - -ifdef DEBUG -CFLAGS += -DDEBUG=$(DEBUG) -endif - -all: $(PROGS) -mpssd: mpssd.c sysfs.c - $(CC) $(CFLAGS) mpssd.c sysfs.c -o mpssd -lpthread - -install: - install mpssd /usr/sbin/mpssd - install micctrl /usr/sbin/micctrl - -clean: - rm -fr $(PROGS) - -endif -endif diff --git a/samples/mic/mpssd/micctrl b/samples/mic/mpssd/micctrl deleted file mode 100755 index 8f2629b41c5f..000000000000 --- a/samples/mic/mpssd/micctrl +++ /dev/null @@ -1,173 +0,0 @@ -#!/bin/bash -# Intel MIC Platform Software Stack (MPSS) -# -# Copyright(c) 2013 Intel Corporation. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License, version 2, as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# The full GNU General Public License is included in this distribution in -# the file called "COPYING". -# -# Intel MIC User Space Tools. -# -# micctrl - Controls MIC boot/start/stop. -# -# chkconfig: 2345 95 05 -# description: start MPSS stack processing. -# -### BEGIN INIT INFO -# Provides: micctrl -### END INIT INFO - -# Source function library. -. /etc/init.d/functions - -sysfs="/sys/class/mic" - -_status() -{ - f=$sysfs/$1 - echo -e $1 state: "`cat $f/state`" shutdown_status: "`cat $f/shutdown_status`" -} - -status() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _status $1 - return $? - fi - for f in $sysfs/* - do - _status `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_reset() -{ - f=$sysfs/$1 - echo reset > $f/state -} - -reset() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _reset $1 - return $? - fi - for f in $sysfs/* - do - _reset `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_boot() -{ - f=$sysfs/$1 - echo "linux" > $f/bootmode - echo "mic/uos.img" > $f/firmware - echo "mic/$1.image" > $f/ramdisk - echo "boot" > $f/state -} - -boot() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _boot $1 - return $? - fi - for f in $sysfs/* - do - _boot `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_shutdown() -{ - f=$sysfs/$1 - echo shutdown > $f/state -} - -shutdown() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _shutdown $1 - return $? - fi - for f in $sysfs/* - do - _shutdown `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_wait() -{ - f=$sysfs/$1 - while [ "`cat $f/state`" != "offline" -a "`cat $f/state`" != "online" ] - do - sleep 1 - echo -e "Waiting for $1 to go offline" - done -} - -wait() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _wait $1 - return $? - fi - # Wait for the cards to go offline - for f in $sysfs/* - do - _wait `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -if [ ! -d "$sysfs" ]; then - echo -e $"Module unloaded " - exit 3 -fi - -case $1 in - -s) - status $2 - ;; - -r) - reset $2 - ;; - -b) - boot $2 - ;; - -S) - shutdown $2 - ;; - -w) - wait $2 - ;; - *) - echo $"Usage: $0 {-s (status) |-r (reset) |-b (boot) |-S (shutdown) |-w (wait)}" - exit 2 -esac - -exit $? diff --git a/samples/mic/mpssd/mpss b/samples/mic/mpssd/mpss deleted file mode 100755 index 5fcf9fa4b082..000000000000 --- a/samples/mic/mpssd/mpss +++ /dev/null @@ -1,200 +0,0 @@ -#!/bin/bash -# Intel MIC Platform Software Stack (MPSS) -# -# Copyright(c) 2013 Intel Corporation. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License, version 2, as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# The full GNU General Public License is included in this distribution in -# the file called "COPYING". -# -# Intel MIC User Space Tools. -# -# mpss Start mpssd. -# -# chkconfig: 2345 95 05 -# description: start MPSS stack processing. -# -### BEGIN INIT INFO -# Provides: mpss -# Required-Start: -# Required-Stop: -# Short-Description: MPSS stack control -# Description: MPSS stack control -### END INIT INFO - -# Source function library. -. /etc/init.d/functions - -exec=/usr/sbin/mpssd -sysfs="/sys/class/mic" -mic_modules="mic_host mic_x100_dma scif vop" - -start() -{ - [ -x $exec ] || exit 5 - - if [ "`ps -e | awk '{print $4}' | grep mpssd | head -1`" = "mpssd" ]; then - echo -e $"MPSSD already running! " - success - echo - return 0 - fi - - echo -e $"Starting MPSS Stack" - echo -e $"Loading MIC drivers:" $mic_modules - - modprobe -a $mic_modules - RETVAL=$? - if [ $RETVAL -ne 0 ]; then - failure - echo - return $RETVAL - fi - - # Start the daemon - echo -n $"Starting MPSSD " - $exec - RETVAL=$? - if [ $RETVAL -ne 0 ]; then - failure - echo - return $RETVAL - fi - success - echo - - sleep 5 - - # Boot the cards - micctrl -b - - # Wait till ping works - for f in $sysfs/* - do - count=100 - ipaddr=`cat $f/cmdline` - ipaddr=${ipaddr#*address,} - ipaddr=`echo $ipaddr | cut -d, -f1 | cut -d\; -f1` - while [ $count -ge 0 ] - do - echo -e "Pinging "`basename $f`" " - ping -c 1 $ipaddr &> /dev/null - RETVAL=$? - if [ $RETVAL -eq 0 ]; then - success - break - fi - sleep 1 - count=`expr $count - 1` - done - [ $RETVAL -ne 0 ] && failure || success - echo - done - return $RETVAL -} - -stop() -{ - echo -e $"Shutting down MPSS Stack: " - - # Bail out if module is unloaded - if [ ! -d "$sysfs" ]; then - echo -n $"Module unloaded " - success - echo - return 0 - fi - - # Shut down the cards. - micctrl -S - - # Wait for the cards to go offline - for f in $sysfs/* - do - while [ "`cat $f/state`" != "ready" ] - do - sleep 1 - echo -e "Waiting for "`basename $f`" to become ready" - done - done - - # Display the status of the cards - micctrl -s - - # Kill MPSSD now - echo -n $"Killing MPSSD" - killall -9 mpssd 2>/dev/null - RETVAL=$? - [ $RETVAL -ne 0 ] && failure || success - echo - return $RETVAL -} - -restart() -{ - stop - sleep 5 - start -} - -status() -{ - micctrl -s - if [ "`ps -e | awk '{print $4}' | grep mpssd | head -n 1`" = "mpssd" ]; then - echo "mpssd is running" - else - echo "mpssd is stopped" - fi - return 0 -} - -unload() -{ - if [ ! -d "$sysfs" ]; then - echo -n $"No MIC_HOST Module: " - success - echo - return - fi - - stop - - sleep 5 - echo -n $"Removing MIC drivers:" $mic_modules - modprobe -r $mic_modules - RETVAL=$? - [ $RETVAL -ne 0 ] && failure || success - echo - return $RETVAL -} - -case $1 in - start) - start - ;; - stop) - stop - ;; - restart) - restart - ;; - status) - status - ;; - unload) - unload - ;; - *) - echo $"Usage: $0 {start|stop|restart|status|unload}" - exit 2 -esac - -exit $? diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c deleted file mode 100644 index 49db1def1721..000000000000 --- a/samples/mic/mpssd/mpssd.c +++ /dev/null @@ -1,1826 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - * Intel MIC User Space Tools. - */ - -#define _GNU_SOURCE - -#include <stdlib.h> -#include <fcntl.h> -#include <getopt.h> -#include <assert.h> -#include <unistd.h> -#include <stdbool.h> -#include <signal.h> -#include <poll.h> -#include <features.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/mman.h> -#include <sys/socket.h> -#include <linux/virtio_ring.h> -#include <linux/virtio_net.h> -#include <linux/virtio_console.h> -#include <linux/virtio_blk.h> -#include <linux/version.h> -#include "mpssd.h" -#include <linux/mic_ioctl.h> -#include <linux/mic_common.h> -#include <tools/endian.h> - -static void *init_mic(void *arg); - -static FILE *logfp; -static struct mic_info mic_list; - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define min_t(type, x, y) ({ \ - type __min1 = (x); \ - type __min2 = (y); \ - __min1 < __min2 ? __min1 : __min2; }) - -/* align addr on a size boundary - adjust address up/down if needed */ -#define _ALIGN_DOWN(addr, size) ((addr)&(~((size)-1))) -#define _ALIGN_UP(addr, size) _ALIGN_DOWN(addr + size - 1, size) - -/* align addr on a size boundary - adjust address up if needed */ -#define _ALIGN(addr, size) _ALIGN_UP(addr, size) - -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) - -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - -#define GSO_ENABLED 1 -#define MAX_GSO_SIZE (64 * 1024) -#define ETH_H_LEN 14 -#define MAX_NET_PKT_SIZE (_ALIGN_UP(MAX_GSO_SIZE + ETH_H_LEN, 64)) -#define MIC_DEVICE_PAGE_END 0x1000 - -#ifndef VIRTIO_NET_HDR_F_DATA_VALID -#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ -#endif - -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[2]; - __u32 host_features, guest_acknowledgements; - struct virtio_console_config cons_config; -} virtcons_dev_page = { - .dd = { - .type = VIRTIO_ID_CONSOLE, - .num_vq = ARRAY_SIZE(virtcons_dev_page.vqconfig), - .feature_len = sizeof(virtcons_dev_page.host_features), - .config_len = sizeof(virtcons_dev_page.cons_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .vqconfig[1] = { - .num = htole16(MIC_VRING_ENTRIES), - }, -}; - -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[2]; - __u32 host_features, guest_acknowledgements; - struct virtio_net_config net_config; -} virtnet_dev_page = { - .dd = { - .type = VIRTIO_ID_NET, - .num_vq = ARRAY_SIZE(virtnet_dev_page.vqconfig), - .feature_len = sizeof(virtnet_dev_page.host_features), - .config_len = sizeof(virtnet_dev_page.net_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .vqconfig[1] = { - .num = htole16(MIC_VRING_ENTRIES), - }, -#if GSO_ENABLED - .host_features = htole32( - 1 << VIRTIO_NET_F_CSUM | - 1 << VIRTIO_NET_F_GSO | - 1 << VIRTIO_NET_F_GUEST_TSO4 | - 1 << VIRTIO_NET_F_GUEST_TSO6 | - 1 << VIRTIO_NET_F_GUEST_ECN), -#else - .host_features = 0, -#endif -}; - -static const char *mic_config_dir = "/etc/mpss"; -static const char *virtblk_backend = "VIRTBLK_BACKEND"; -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[1]; - __u32 host_features, guest_acknowledgements; - struct virtio_blk_config blk_config; -} virtblk_dev_page = { - .dd = { - .type = VIRTIO_ID_BLOCK, - .num_vq = ARRAY_SIZE(virtblk_dev_page.vqconfig), - .feature_len = sizeof(virtblk_dev_page.host_features), - .config_len = sizeof(virtblk_dev_page.blk_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .host_features = - htole32(1<<VIRTIO_BLK_F_SEG_MAX), - .blk_config = { - .seg_max = htole32(MIC_VRING_ENTRIES - 2), - .capacity = htole64(0), - } -}; - -static char *myname; - -static int -tap_configure(struct mic_info *mic, char *dev) -{ - pid_t pid; - char *ifargv[7]; - char ipaddr[IFNAMSIZ]; - int ret = 0; - - pid = fork(); - if (pid == 0) { - ifargv[0] = "ip"; - ifargv[1] = "link"; - ifargv[2] = "set"; - ifargv[3] = dev; - ifargv[4] = "up"; - ifargv[5] = NULL; - mpsslog("Configuring %s\n", dev); - ret = execvp("ip", ifargv); - if (ret < 0) { - mpsslog("%s execvp failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - } - if (pid < 0) { - mpsslog("%s fork failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - ret = waitpid(pid, NULL, 0); - if (ret < 0) { - mpsslog("%s waitpid failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - snprintf(ipaddr, IFNAMSIZ, "172.31.%d.254/24", mic->id + 1); - - pid = fork(); - if (pid == 0) { - ifargv[0] = "ip"; - ifargv[1] = "addr"; - ifargv[2] = "add"; - ifargv[3] = ipaddr; - ifargv[4] = "dev"; - ifargv[5] = dev; - ifargv[6] = NULL; - mpsslog("Configuring %s ipaddr %s\n", dev, ipaddr); - ret = execvp("ip", ifargv); - if (ret < 0) { - mpsslog("%s execvp failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - } - if (pid < 0) { - mpsslog("%s fork failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - ret = waitpid(pid, NULL, 0); - if (ret < 0) { - mpsslog("%s waitpid failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - mpsslog("MIC name %s %s %d DONE!\n", - mic->name, __func__, __LINE__); - return 0; -} - -static int tun_alloc(struct mic_info *mic, char *dev) -{ - struct ifreq ifr; - int fd, err; -#if GSO_ENABLED - unsigned offload; -#endif - fd = open("/dev/net/tun", O_RDWR); - if (fd < 0) { - mpsslog("Could not open /dev/net/tun %s\n", strerror(errno)); - goto done; - } - - memset(&ifr, 0, sizeof(ifr)); - - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - if (*dev) - strncpy(ifr.ifr_name, dev, IFNAMSIZ); - - err = ioctl(fd, TUNSETIFF, (void *)&ifr); - if (err < 0) { - mpsslog("%s %s %d TUNSETIFF failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - close(fd); - return err; - } -#if GSO_ENABLED - offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN; - - err = ioctl(fd, TUNSETOFFLOAD, offload); - if (err < 0) { - mpsslog("%s %s %d TUNSETOFFLOAD failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - close(fd); - return err; - } -#endif - strcpy(dev, ifr.ifr_name); - mpsslog("Created TAP %s\n", dev); -done: - return fd; -} - -#define NET_FD_VIRTIO_NET 0 -#define NET_FD_TUN 1 -#define MAX_NET_FD 2 - -static void set_dp(struct mic_info *mic, int type, void *dp) -{ - switch (type) { - case VIRTIO_ID_CONSOLE: - mic->mic_console.console_dp = dp; - return; - case VIRTIO_ID_NET: - mic->mic_net.net_dp = dp; - return; - case VIRTIO_ID_BLOCK: - mic->mic_virtblk.block_dp = dp; - return; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - assert(0); -} - -static void *get_dp(struct mic_info *mic, int type) -{ - switch (type) { - case VIRTIO_ID_CONSOLE: - return mic->mic_console.console_dp; - case VIRTIO_ID_NET: - return mic->mic_net.net_dp; - case VIRTIO_ID_BLOCK: - return mic->mic_virtblk.block_dp; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - assert(0); - return NULL; -} - -static struct mic_device_desc *get_device_desc(struct mic_info *mic, int type) -{ - struct mic_device_desc *d; - int i; - void *dp = get_dp(mic, type); - - for (i = sizeof(struct mic_bootparam); i < PAGE_SIZE; - i += mic_total_desc_size(d)) { - d = dp + i; - - /* End of list */ - if (d->type == 0) - break; - - if (d->type == -1) - continue; - - mpsslog("%s %s d-> type %d d %p\n", - mic->name, __func__, d->type, d); - - if (d->type == (__u8)type) - return d; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - return NULL; -} - -/* See comments in vhost.c for explanation of next_desc() */ -static unsigned next_desc(struct vring_desc *desc) -{ - unsigned int next; - - if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) - return -1U; - next = le16toh(desc->next); - return next; -} - -/* Sum up all the IOVEC length */ -static ssize_t -sum_iovec_len(struct mic_copy_desc *copy) -{ - ssize_t sum = 0; - unsigned int i; - - for (i = 0; i < copy->iovcnt; i++) - sum += copy->iov[i].iov_len; - return sum; -} - -static inline void verify_out_len(struct mic_info *mic, - struct mic_copy_desc *copy) -{ - if (copy->out_len != sum_iovec_len(copy)) { - mpsslog("%s %s %d BUG copy->out_len 0x%x len 0x%zx\n", - mic->name, __func__, __LINE__, - copy->out_len, sum_iovec_len(copy)); - assert(copy->out_len == sum_iovec_len(copy)); - } -} - -/* Display an iovec */ -static void -disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy, - const char *s, int line) -{ - unsigned int i; - - for (i = 0; i < copy->iovcnt; i++) - mpsslog("%s %s %d copy->iov[%d] addr %p len 0x%zx\n", - mic->name, s, line, i, - copy->iov[i].iov_base, copy->iov[i].iov_len); -} - -static inline __u16 read_avail_idx(struct mic_vring *vr) -{ - return ACCESS_ONCE(vr->info->avail_idx); -} - -static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr, - struct mic_copy_desc *copy, ssize_t len) -{ - copy->vr_idx = tx ? 0 : 1; - copy->update_used = true; - if (type == VIRTIO_ID_NET) - copy->iov[1].iov_len = len - sizeof(struct virtio_net_hdr); - else - copy->iov[0].iov_len = len; -} - -/* Central API which triggers the copies */ -static int -mic_virtio_copy(struct mic_info *mic, int fd, - struct mic_vring *vr, struct mic_copy_desc *copy) -{ - int ret; - - ret = ioctl(fd, MIC_VIRTIO_COPY_DESC, copy); - if (ret) { - mpsslog("%s %s %d errno %s ret %d\n", - mic->name, __func__, __LINE__, - strerror(errno), ret); - } - return ret; -} - -static inline unsigned _vring_size(unsigned int num, unsigned long align) -{ - return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num) - + align - 1) & ~(align - 1)) - + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num; -} - -/* - * This initialization routine requires at least one - * vring i.e. vr0. vr1 is optional. - */ -static void * -init_vr(struct mic_info *mic, int fd, int type, - struct mic_vring *vr0, struct mic_vring *vr1, int num_vq) -{ - int vr_size; - char *va; - - vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN) + - sizeof(struct _mic_vring_info)); - va = mmap(NULL, MIC_DEVICE_PAGE_END + vr_size * num_vq, - PROT_READ, MAP_SHARED, fd, 0); - if (MAP_FAILED == va) { - mpsslog("%s %s %d mmap failed errno %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - goto done; - } - set_dp(mic, type, va); - vr0->va = (struct mic_vring *)&va[MIC_DEVICE_PAGE_END]; - vr0->info = vr0->va + - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN); - vring_init(&vr0->vr, - MIC_VRING_ENTRIES, vr0->va, MIC_VIRTIO_RING_ALIGN); - mpsslog("%s %s vr0 %p vr0->info %p vr_size 0x%x vring 0x%x ", - __func__, mic->name, vr0->va, vr0->info, vr_size, - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN)); - mpsslog("magic 0x%x expected 0x%x\n", - le32toh(vr0->info->magic), MIC_MAGIC + type); - assert(le32toh(vr0->info->magic) == MIC_MAGIC + type); - if (vr1) { - vr1->va = (struct mic_vring *) - &va[MIC_DEVICE_PAGE_END + vr_size]; - vr1->info = vr1->va + _vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN); - vring_init(&vr1->vr, - MIC_VRING_ENTRIES, vr1->va, MIC_VIRTIO_RING_ALIGN); - mpsslog("%s %s vr1 %p vr1->info %p vr_size 0x%x vring 0x%x ", - __func__, mic->name, vr1->va, vr1->info, vr_size, - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN)); - mpsslog("magic 0x%x expected 0x%x\n", - le32toh(vr1->info->magic), MIC_MAGIC + type + 1); - assert(le32toh(vr1->info->magic) == MIC_MAGIC + type + 1); - } -done: - return va; -} - -static int -wait_for_card_driver(struct mic_info *mic, int fd, int type) -{ - struct pollfd pollfd; - int err; - struct mic_device_desc *desc = get_device_desc(mic, type); - __u8 prev_status; - - if (!desc) - return -ENODEV; - prev_status = desc->status; - pollfd.fd = fd; - mpsslog("%s %s Waiting .... desc-> type %d status 0x%x\n", - mic->name, __func__, type, desc->status); - - while (1) { - pollfd.events = POLLIN; - pollfd.revents = 0; - err = poll(&pollfd, 1, -1); - if (err < 0) { - mpsslog("%s %s poll failed %s\n", - mic->name, __func__, strerror(errno)); - continue; - } - - if (pollfd.revents) { - if (desc->status != prev_status) { - mpsslog("%s %s Waiting... desc-> type %d " - "status 0x%x\n", - mic->name, __func__, type, - desc->status); - prev_status = desc->status; - } - if (desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - mpsslog("%s %s poll.revents %d\n", - mic->name, __func__, pollfd.revents); - mpsslog("%s %s desc-> type %d status 0x%x\n", - mic->name, __func__, type, - desc->status); - break; - } - } - } - return 0; -} - -/* Spin till we have some descriptors */ -static void -spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr) -{ - __u16 avail_idx = read_avail_idx(vr); - - while (avail_idx == le16toh(ACCESS_ONCE(vr->vr.avail->idx))) { -#ifdef DEBUG - mpsslog("%s %s waiting for desc avail %d info_avail %d\n", - mic->name, __func__, - le16toh(vr->vr.avail->idx), vr->info->avail_idx); -#endif - sched_yield(); - } -} - -static void * -virtio_net(void *arg) -{ - static __u8 vnet_hdr[2][sizeof(struct virtio_net_hdr)]; - static __u8 vnet_buf[2][MAX_NET_PKT_SIZE] __attribute__ ((aligned(64))); - struct iovec vnet_iov[2][2] = { - { { .iov_base = vnet_hdr[0], .iov_len = sizeof(vnet_hdr[0]) }, - { .iov_base = vnet_buf[0], .iov_len = sizeof(vnet_buf[0]) } }, - { { .iov_base = vnet_hdr[1], .iov_len = sizeof(vnet_hdr[1]) }, - { .iov_base = vnet_buf[1], .iov_len = sizeof(vnet_buf[1]) } }, - }; - struct iovec *iov0 = vnet_iov[0], *iov1 = vnet_iov[1]; - struct mic_info *mic = (struct mic_info *)arg; - char if_name[IFNAMSIZ]; - struct pollfd net_poll[MAX_NET_FD]; - struct mic_vring tx_vr, rx_vr; - struct mic_copy_desc copy; - struct mic_device_desc *desc; - int err; - - snprintf(if_name, IFNAMSIZ, "mic%d", mic->id); - mic->mic_net.tap_fd = tun_alloc(mic, if_name); - if (mic->mic_net.tap_fd < 0) - goto done; - - if (tap_configure(mic, if_name)) - goto done; - mpsslog("MIC name %s id %d\n", mic->name, mic->id); - - net_poll[NET_FD_VIRTIO_NET].fd = mic->mic_net.virtio_net_fd; - net_poll[NET_FD_VIRTIO_NET].events = POLLIN; - net_poll[NET_FD_TUN].fd = mic->mic_net.tap_fd; - net_poll[NET_FD_TUN].events = POLLIN; - - if (MAP_FAILED == init_vr(mic, mic->mic_net.virtio_net_fd, - VIRTIO_ID_NET, &tx_vr, &rx_vr, - virtnet_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - goto done; - } - - copy.iovcnt = 2; - desc = get_device_desc(mic, VIRTIO_ID_NET); - - while (1) { - ssize_t len; - - net_poll[NET_FD_VIRTIO_NET].revents = 0; - net_poll[NET_FD_TUN].revents = 0; - - /* Start polling for data from tap and virtio net */ - err = poll(net_poll, 2, -1); - if (err < 0) { - mpsslog("%s poll failed %s\n", - __func__, strerror(errno)); - continue; - } - if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - err = wait_for_card_driver(mic, - mic->mic_net.virtio_net_fd, - VIRTIO_ID_NET); - if (err) { - mpsslog("%s %s %d Exiting...\n", - mic->name, __func__, __LINE__); - break; - } - } - /* - * Check if there is data to be read from TUN and write to - * virtio net fd if there is. - */ - if (net_poll[NET_FD_TUN].revents & POLLIN) { - copy.iov = iov0; - len = readv(net_poll[NET_FD_TUN].fd, - copy.iov, copy.iovcnt); - if (len > 0) { - struct virtio_net_hdr *hdr - = (struct virtio_net_hdr *)vnet_hdr[0]; - - /* Disable checksums on the card since we are on - a reliable PCIe link */ - hdr->flags |= VIRTIO_NET_HDR_F_DATA_VALID; -#ifdef DEBUG - mpsslog("%s %s %d hdr->flags 0x%x ", mic->name, - __func__, __LINE__, hdr->flags); - mpsslog("copy.out_len %d hdr->gso_type 0x%x\n", - copy.out_len, hdr->gso_type); -#endif -#ifdef DEBUG - disp_iovec(mic, copy, __func__, __LINE__); - mpsslog("%s %s %d read from tap 0x%lx\n", - mic->name, __func__, __LINE__, - len); -#endif - spin_for_descriptors(mic, &tx_vr); - txrx_prepare(VIRTIO_ID_NET, 1, &tx_vr, ©, - len); - - err = mic_virtio_copy(mic, - mic->mic_net.virtio_net_fd, &tx_vr, - ©); - if (err < 0) { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - } - if (!err) - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, copy, __func__, __LINE__); - mpsslog("%s %s %d wrote to net 0x%lx\n", - mic->name, __func__, __LINE__, - sum_iovec_len(©)); -#endif - /* Reinitialize IOV for next run */ - iov0[1].iov_len = MAX_NET_PKT_SIZE; - } else if (len < 0) { - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read failed %s ", mic->name, - __func__, __LINE__, strerror(errno)); - mpsslog("cnt %d sum %zd\n", - copy.iovcnt, sum_iovec_len(©)); - } - } - - /* - * Check if there is data to be read from virtio net and - * write to TUN if there is. - */ - if (net_poll[NET_FD_VIRTIO_NET].revents & POLLIN) { - while (rx_vr.info->avail_idx != - le16toh(rx_vr.vr.avail->idx)) { - copy.iov = iov1; - txrx_prepare(VIRTIO_ID_NET, 0, &rx_vr, ©, - MAX_NET_PKT_SIZE - + sizeof(struct virtio_net_hdr)); - - err = mic_virtio_copy(mic, - mic->mic_net.virtio_net_fd, &rx_vr, - ©); - if (!err) { -#ifdef DEBUG - struct virtio_net_hdr *hdr - = (struct virtio_net_hdr *) - vnet_hdr[1]; - - mpsslog("%s %s %d hdr->flags 0x%x, ", - mic->name, __func__, __LINE__, - hdr->flags); - mpsslog("out_len %d gso_type 0x%x\n", - copy.out_len, - hdr->gso_type); -#endif - /* Set the correct output iov_len */ - iov1[1].iov_len = copy.out_len - - sizeof(struct virtio_net_hdr); - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, copy, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, __LINE__); - mpsslog("read from net 0x%lx\n", - sum_iovec_len(copy)); -#endif - len = writev(net_poll[NET_FD_TUN].fd, - copy.iov, copy.iovcnt); - if (len != sum_iovec_len(©)) { - mpsslog("Tun write failed %s ", - strerror(errno)); - mpsslog("len 0x%zx ", len); - mpsslog("read_len 0x%zx\n", - sum_iovec_len(©)); - } else { -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, - __LINE__); - mpsslog("wrote to tap 0x%lx\n", - len); -#endif - } - } else { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - break; - } - } - } - if (net_poll[NET_FD_VIRTIO_NET].revents & POLLERR) - mpsslog("%s: %s: POLLERR\n", __func__, mic->name); - } -done: - pthread_exit(NULL); -} - -/* virtio_console */ -#define VIRTIO_CONSOLE_FD 0 -#define MONITOR_FD (VIRTIO_CONSOLE_FD + 1) -#define MAX_CONSOLE_FD (MONITOR_FD + 1) /* must be the last one + 1 */ -#define MAX_BUFFER_SIZE PAGE_SIZE - -static void * -virtio_console(void *arg) -{ - static __u8 vcons_buf[2][PAGE_SIZE]; - struct iovec vcons_iov[2] = { - { .iov_base = vcons_buf[0], .iov_len = sizeof(vcons_buf[0]) }, - { .iov_base = vcons_buf[1], .iov_len = sizeof(vcons_buf[1]) }, - }; - struct iovec *iov0 = &vcons_iov[0], *iov1 = &vcons_iov[1]; - struct mic_info *mic = (struct mic_info *)arg; - int err; - struct pollfd console_poll[MAX_CONSOLE_FD]; - int pty_fd; - char *pts_name; - ssize_t len; - struct mic_vring tx_vr, rx_vr; - struct mic_copy_desc copy; - struct mic_device_desc *desc; - - pty_fd = posix_openpt(O_RDWR); - if (pty_fd < 0) { - mpsslog("can't open a pseudoterminal master device: %s\n", - strerror(errno)); - goto _return; - } - pts_name = ptsname(pty_fd); - if (pts_name == NULL) { - mpsslog("can't get pts name\n"); - goto _close_pty; - } - printf("%s console message goes to %s\n", mic->name, pts_name); - mpsslog("%s console message goes to %s\n", mic->name, pts_name); - err = grantpt(pty_fd); - if (err < 0) { - mpsslog("can't grant access: %s %s\n", - pts_name, strerror(errno)); - goto _close_pty; - } - err = unlockpt(pty_fd); - if (err < 0) { - mpsslog("can't unlock a pseudoterminal: %s %s\n", - pts_name, strerror(errno)); - goto _close_pty; - } - console_poll[MONITOR_FD].fd = pty_fd; - console_poll[MONITOR_FD].events = POLLIN; - - console_poll[VIRTIO_CONSOLE_FD].fd = mic->mic_console.virtio_console_fd; - console_poll[VIRTIO_CONSOLE_FD].events = POLLIN; - - if (MAP_FAILED == init_vr(mic, mic->mic_console.virtio_console_fd, - VIRTIO_ID_CONSOLE, &tx_vr, &rx_vr, - virtcons_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - goto _close_pty; - } - - copy.iovcnt = 1; - desc = get_device_desc(mic, VIRTIO_ID_CONSOLE); - - for (;;) { - console_poll[MONITOR_FD].revents = 0; - console_poll[VIRTIO_CONSOLE_FD].revents = 0; - err = poll(console_poll, MAX_CONSOLE_FD, -1); - if (err < 0) { - mpsslog("%s %d: poll failed: %s\n", __func__, __LINE__, - strerror(errno)); - continue; - } - if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - err = wait_for_card_driver(mic, - mic->mic_console.virtio_console_fd, - VIRTIO_ID_CONSOLE); - if (err) { - mpsslog("%s %s %d Exiting...\n", - mic->name, __func__, __LINE__); - break; - } - } - - if (console_poll[MONITOR_FD].revents & POLLIN) { - copy.iov = iov0; - len = readv(pty_fd, copy.iov, copy.iovcnt); - if (len > 0) { -#ifdef DEBUG - disp_iovec(mic, copy, __func__, __LINE__); - mpsslog("%s %s %d read from tap 0x%lx\n", - mic->name, __func__, __LINE__, - len); -#endif - spin_for_descriptors(mic, &tx_vr); - txrx_prepare(VIRTIO_ID_CONSOLE, 1, &tx_vr, - ©, len); - - err = mic_virtio_copy(mic, - mic->mic_console.virtio_console_fd, - &tx_vr, ©); - if (err < 0) { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - } - if (!err) - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, copy, __func__, __LINE__); - mpsslog("%s %s %d wrote to net 0x%lx\n", - mic->name, __func__, __LINE__, - sum_iovec_len(copy)); -#endif - /* Reinitialize IOV for next run */ - iov0->iov_len = PAGE_SIZE; - } else if (len < 0) { - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read failed %s ", - mic->name, __func__, __LINE__, - strerror(errno)); - mpsslog("cnt %d sum %zd\n", - copy.iovcnt, sum_iovec_len(©)); - } - } - - if (console_poll[VIRTIO_CONSOLE_FD].revents & POLLIN) { - while (rx_vr.info->avail_idx != - le16toh(rx_vr.vr.avail->idx)) { - copy.iov = iov1; - txrx_prepare(VIRTIO_ID_CONSOLE, 0, &rx_vr, - ©, PAGE_SIZE); - - err = mic_virtio_copy(mic, - mic->mic_console.virtio_console_fd, - &rx_vr, ©); - if (!err) { - /* Set the correct output iov_len */ - iov1->iov_len = copy.out_len; - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, copy, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, __LINE__); - mpsslog("read from net 0x%lx\n", - sum_iovec_len(copy)); -#endif - len = writev(pty_fd, - copy.iov, copy.iovcnt); - if (len != sum_iovec_len(©)) { - mpsslog("Tun write failed %s ", - strerror(errno)); - mpsslog("len 0x%zx ", len); - mpsslog("read_len 0x%zx\n", - sum_iovec_len(©)); - } else { -#ifdef DEBUG - disp_iovec(mic, copy, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, - __LINE__); - mpsslog("wrote to tap 0x%lx\n", - len); -#endif - } - } else { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - break; - } - } - } - if (console_poll[NET_FD_VIRTIO_NET].revents & POLLERR) - mpsslog("%s: %s: POLLERR\n", __func__, mic->name); - } -_close_pty: - close(pty_fd); -_return: - pthread_exit(NULL); -} - -static void -add_virtio_device(struct mic_info *mic, struct mic_device_desc *dd) -{ - char path[PATH_MAX]; - int fd, err; - - snprintf(path, PATH_MAX, "/dev/vop_virtio%d", mic->id); - fd = open(path, O_RDWR); - if (fd < 0) { - mpsslog("Could not open %s %s\n", path, strerror(errno)); - return; - } - - err = ioctl(fd, MIC_VIRTIO_ADD_DEVICE, dd); - if (err < 0) { - mpsslog("Could not add %d %s\n", dd->type, strerror(errno)); - close(fd); - return; - } - switch (dd->type) { - case VIRTIO_ID_NET: - mic->mic_net.virtio_net_fd = fd; - mpsslog("Added VIRTIO_ID_NET for %s\n", mic->name); - break; - case VIRTIO_ID_CONSOLE: - mic->mic_console.virtio_console_fd = fd; - mpsslog("Added VIRTIO_ID_CONSOLE for %s\n", mic->name); - break; - case VIRTIO_ID_BLOCK: - mic->mic_virtblk.virtio_block_fd = fd; - mpsslog("Added VIRTIO_ID_BLOCK for %s\n", mic->name); - break; - } -} - -static bool -set_backend_file(struct mic_info *mic) -{ - FILE *config; - char buff[PATH_MAX], *line, *evv, *p; - - snprintf(buff, PATH_MAX, "%s/mpssd%03d.conf", mic_config_dir, mic->id); - config = fopen(buff, "r"); - if (config == NULL) - return false; - do { /* look for "virtblk_backend=XXXX" */ - line = fgets(buff, PATH_MAX, config); - if (line == NULL) - break; - if (*line == '#') - continue; - p = strchr(line, '\n'); - if (p) - *p = '\0'; - } while (strncmp(line, virtblk_backend, strlen(virtblk_backend)) != 0); - fclose(config); - if (line == NULL) - return false; - evv = strchr(line, '='); - if (evv == NULL) - return false; - mic->mic_virtblk.backend_file = malloc(strlen(evv) + 1); - if (mic->mic_virtblk.backend_file == NULL) { - mpsslog("%s %d can't allocate memory\n", mic->name, mic->id); - return false; - } - strcpy(mic->mic_virtblk.backend_file, evv + 1); - return true; -} - -#define SECTOR_SIZE 512 -static bool -set_backend_size(struct mic_info *mic) -{ - mic->mic_virtblk.backend_size = lseek(mic->mic_virtblk.backend, 0, - SEEK_END); - if (mic->mic_virtblk.backend_size < 0) { - mpsslog("%s: can't seek: %s\n", - mic->name, mic->mic_virtblk.backend_file); - return false; - } - virtblk_dev_page.blk_config.capacity = - mic->mic_virtblk.backend_size / SECTOR_SIZE; - if ((mic->mic_virtblk.backend_size % SECTOR_SIZE) != 0) - virtblk_dev_page.blk_config.capacity++; - - virtblk_dev_page.blk_config.capacity = - htole64(virtblk_dev_page.blk_config.capacity); - - return true; -} - -static bool -open_backend(struct mic_info *mic) -{ - if (!set_backend_file(mic)) - goto _error_exit; - mic->mic_virtblk.backend = open(mic->mic_virtblk.backend_file, O_RDWR); - if (mic->mic_virtblk.backend < 0) { - mpsslog("%s: can't open: %s\n", mic->name, - mic->mic_virtblk.backend_file); - goto _error_free; - } - if (!set_backend_size(mic)) - goto _error_close; - mic->mic_virtblk.backend_addr = mmap(NULL, - mic->mic_virtblk.backend_size, - PROT_READ|PROT_WRITE, MAP_SHARED, - mic->mic_virtblk.backend, 0L); - if (mic->mic_virtblk.backend_addr == MAP_FAILED) { - mpsslog("%s: can't map: %s %s\n", - mic->name, mic->mic_virtblk.backend_file, - strerror(errno)); - goto _error_close; - } - return true; - - _error_close: - close(mic->mic_virtblk.backend); - _error_free: - free(mic->mic_virtblk.backend_file); - _error_exit: - return false; -} - -static void -close_backend(struct mic_info *mic) -{ - munmap(mic->mic_virtblk.backend_addr, mic->mic_virtblk.backend_size); - close(mic->mic_virtblk.backend); - free(mic->mic_virtblk.backend_file); -} - -static bool -start_virtblk(struct mic_info *mic, struct mic_vring *vring) -{ - if (((unsigned long)&virtblk_dev_page.blk_config % 8) != 0) { - mpsslog("%s: blk_config is not 8 byte aligned.\n", - mic->name); - return false; - } - add_virtio_device(mic, &virtblk_dev_page.dd); - if (MAP_FAILED == init_vr(mic, mic->mic_virtblk.virtio_block_fd, - VIRTIO_ID_BLOCK, vring, NULL, - virtblk_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - return false; - } - return true; -} - -static void -stop_virtblk(struct mic_info *mic) -{ - int vr_size, ret; - - vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN) + - sizeof(struct _mic_vring_info)); - ret = munmap(mic->mic_virtblk.block_dp, - MIC_DEVICE_PAGE_END + vr_size * virtblk_dev_page.dd.num_vq); - if (ret < 0) - mpsslog("%s munmap errno %d\n", mic->name, errno); - close(mic->mic_virtblk.virtio_block_fd); -} - -static __u8 -header_error_check(struct vring_desc *desc) -{ - if (le32toh(desc->len) != sizeof(struct virtio_blk_outhdr)) { - mpsslog("%s() %d: length is not sizeof(virtio_blk_outhd)\n", - __func__, __LINE__); - return -EIO; - } - if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) { - mpsslog("%s() %d: alone\n", - __func__, __LINE__); - return -EIO; - } - if (le16toh(desc->flags) & VRING_DESC_F_WRITE) { - mpsslog("%s() %d: not read\n", - __func__, __LINE__); - return -EIO; - } - return 0; -} - -static int -read_header(int fd, struct virtio_blk_outhdr *hdr, __u32 desc_idx) -{ - struct iovec iovec; - struct mic_copy_desc copy; - - iovec.iov_len = sizeof(*hdr); - iovec.iov_base = hdr; - copy.iov = &iovec; - copy.iovcnt = 1; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = false; /* do not update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -static int -transfer_blocks(int fd, struct iovec *iovec, __u32 iovcnt) -{ - struct mic_copy_desc copy; - - copy.iov = iovec; - copy.iovcnt = iovcnt; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = false; /* do not update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -static __u8 -status_error_check(struct vring_desc *desc) -{ - if (le32toh(desc->len) != sizeof(__u8)) { - mpsslog("%s() %d: length is not sizeof(status)\n", - __func__, __LINE__); - return -EIO; - } - return 0; -} - -static int -write_status(int fd, __u8 *status) -{ - struct iovec iovec; - struct mic_copy_desc copy; - - iovec.iov_base = status; - iovec.iov_len = sizeof(*status); - copy.iov = &iovec; - copy.iovcnt = 1; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = true; /* Update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -#ifndef VIRTIO_BLK_T_GET_ID -#define VIRTIO_BLK_T_GET_ID 8 -#endif - -static void * -virtio_block(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - int ret; - struct pollfd block_poll; - struct mic_vring vring; - __u16 avail_idx; - __u32 desc_idx; - struct vring_desc *desc; - struct iovec *iovec, *piov; - __u8 status; - __u32 buffer_desc_idx; - struct virtio_blk_outhdr hdr; - void *fos; - - for (;;) { /* forever */ - if (!open_backend(mic)) { /* No virtblk */ - for (mic->mic_virtblk.signaled = 0; - !mic->mic_virtblk.signaled;) - sleep(1); - continue; - } - - /* backend file is specified. */ - if (!start_virtblk(mic, &vring)) - goto _close_backend; - iovec = malloc(sizeof(*iovec) * - le32toh(virtblk_dev_page.blk_config.seg_max)); - if (!iovec) { - mpsslog("%s: can't alloc iovec: %s\n", - mic->name, strerror(ENOMEM)); - goto _stop_virtblk; - } - - block_poll.fd = mic->mic_virtblk.virtio_block_fd; - block_poll.events = POLLIN; - for (mic->mic_virtblk.signaled = 0; - !mic->mic_virtblk.signaled;) { - block_poll.revents = 0; - /* timeout in 1 sec to see signaled */ - ret = poll(&block_poll, 1, 1000); - if (ret < 0) { - mpsslog("%s %d: poll failed: %s\n", - __func__, __LINE__, - strerror(errno)); - continue; - } - - if (!(block_poll.revents & POLLIN)) { -#ifdef DEBUG - mpsslog("%s %d: block_poll.revents=0x%x\n", - __func__, __LINE__, block_poll.revents); -#endif - continue; - } - - /* POLLIN */ - while (vring.info->avail_idx != - le16toh(vring.vr.avail->idx)) { - /* read header element */ - avail_idx = - vring.info->avail_idx & - (vring.vr.num - 1); - desc_idx = le16toh( - vring.vr.avail->ring[avail_idx]); - desc = &vring.vr.desc[desc_idx]; -#ifdef DEBUG - mpsslog("%s() %d: avail_idx=%d ", - __func__, __LINE__, - vring.info->avail_idx); - mpsslog("vring.vr.num=%d desc=%p\n", - vring.vr.num, desc); -#endif - status = header_error_check(desc); - ret = read_header( - mic->mic_virtblk.virtio_block_fd, - &hdr, desc_idx); - if (ret < 0) { - mpsslog("%s() %d %s: ret=%d %s\n", - __func__, __LINE__, - mic->name, ret, - strerror(errno)); - break; - } - /* buffer element */ - piov = iovec; - status = 0; - fos = mic->mic_virtblk.backend_addr + - (hdr.sector * SECTOR_SIZE); - buffer_desc_idx = next_desc(desc); - desc_idx = buffer_desc_idx; - for (desc = &vring.vr.desc[buffer_desc_idx]; - desc->flags & VRING_DESC_F_NEXT; - desc_idx = next_desc(desc), - desc = &vring.vr.desc[desc_idx]) { - piov->iov_len = desc->len; - piov->iov_base = fos; - piov++; - fos += desc->len; - } - /* Returning NULLs for VIRTIO_BLK_T_GET_ID. */ - if (hdr.type & ~(VIRTIO_BLK_T_OUT | - VIRTIO_BLK_T_GET_ID)) { - /* - VIRTIO_BLK_T_IN - does not do - anything. Probably for documenting. - VIRTIO_BLK_T_SCSI_CMD - for - virtio_scsi. - VIRTIO_BLK_T_FLUSH - turned off in - config space. - VIRTIO_BLK_T_BARRIER - defined but not - used in anywhere. - */ - mpsslog("%s() %d: type %x ", - __func__, __LINE__, - hdr.type); - mpsslog("is not supported\n"); - status = -ENOTSUP; - - } else { - ret = transfer_blocks( - mic->mic_virtblk.virtio_block_fd, - iovec, - piov - iovec); - if (ret < 0 && - status != 0) - status = ret; - } - /* write status and update used pointer */ - if (status != 0) - status = status_error_check(desc); - ret = write_status( - mic->mic_virtblk.virtio_block_fd, - &status); -#ifdef DEBUG - mpsslog("%s() %d: write status=%d on desc=%p\n", - __func__, __LINE__, - status, desc); -#endif - } - } - free(iovec); -_stop_virtblk: - stop_virtblk(mic); -_close_backend: - close_backend(mic); - } /* forever */ - - pthread_exit(NULL); -} - -static void -reset(struct mic_info *mic) -{ -#define RESET_TIMEOUT 120 - int i = RESET_TIMEOUT; - setsysfs(mic->name, "state", "reset"); - while (i) { - char *state; - state = readsysfs(mic->name, "state"); - if (!state) - goto retry; - mpsslog("%s: %s %d state %s\n", - mic->name, __func__, __LINE__, state); - - if (!strcmp(state, "ready")) { - free(state); - break; - } - free(state); -retry: - sleep(1); - i--; - } -} - -static int -get_mic_shutdown_status(struct mic_info *mic, char *shutdown_status) -{ - if (!strcmp(shutdown_status, "nop")) - return MIC_NOP; - if (!strcmp(shutdown_status, "crashed")) - return MIC_CRASHED; - if (!strcmp(shutdown_status, "halted")) - return MIC_HALTED; - if (!strcmp(shutdown_status, "poweroff")) - return MIC_POWER_OFF; - if (!strcmp(shutdown_status, "restart")) - return MIC_RESTART; - mpsslog("%s: BUG invalid status %s\n", mic->name, shutdown_status); - /* Invalid state */ - assert(0); -}; - -static int get_mic_state(struct mic_info *mic) -{ - char *state = NULL; - enum mic_states mic_state; - - while (!state) { - state = readsysfs(mic->name, "state"); - sleep(1); - } - mpsslog("%s: %s %d state %s\n", - mic->name, __func__, __LINE__, state); - - if (!strcmp(state, "ready")) { - mic_state = MIC_READY; - } else if (!strcmp(state, "booting")) { - mic_state = MIC_BOOTING; - } else if (!strcmp(state, "online")) { - mic_state = MIC_ONLINE; - } else if (!strcmp(state, "shutting_down")) { - mic_state = MIC_SHUTTING_DOWN; - } else if (!strcmp(state, "reset_failed")) { - mic_state = MIC_RESET_FAILED; - } else if (!strcmp(state, "resetting")) { - mic_state = MIC_RESETTING; - } else { - mpsslog("%s: BUG invalid state %s\n", mic->name, state); - assert(0); - } - - free(state); - return mic_state; -}; - -static void mic_handle_shutdown(struct mic_info *mic) -{ -#define SHUTDOWN_TIMEOUT 60 - int i = SHUTDOWN_TIMEOUT; - char *shutdown_status; - while (i) { - shutdown_status = readsysfs(mic->name, "shutdown_status"); - if (!shutdown_status) { - sleep(1); - continue; - } - mpsslog("%s: %s %d shutdown_status %s\n", - mic->name, __func__, __LINE__, shutdown_status); - switch (get_mic_shutdown_status(mic, shutdown_status)) { - case MIC_RESTART: - mic->restart = 1; - case MIC_HALTED: - case MIC_POWER_OFF: - case MIC_CRASHED: - free(shutdown_status); - goto reset; - default: - break; - } - free(shutdown_status); - sleep(1); - i--; - } -reset: - if (!i) - mpsslog("%s: %s %d timing out waiting for shutdown_status %s\n", - mic->name, __func__, __LINE__, shutdown_status); - reset(mic); -} - -static int open_state_fd(struct mic_info *mic) -{ - char pathname[PATH_MAX]; - int fd; - - snprintf(pathname, PATH_MAX - 1, "%s/%s/%s", - MICSYSFSDIR, mic->name, "state"); - - fd = open(pathname, O_RDONLY); - if (fd < 0) - mpsslog("%s: opening file %s failed %s\n", - mic->name, pathname, strerror(errno)); - return fd; -} - -static int block_till_state_change(int fd, struct mic_info *mic) -{ - struct pollfd ufds[1]; - char value[PAGE_SIZE]; - int ret; - - ufds[0].fd = fd; - ufds[0].events = POLLERR | POLLPRI; - ret = poll(ufds, 1, -1); - if (ret < 0) { - mpsslog("%s: %s %d poll failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - ret = lseek(fd, 0, SEEK_SET); - if (ret < 0) { - mpsslog("%s: %s %d Failed to seek to 0: %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - ret = read(fd, value, sizeof(value)); - if (ret < 0) { - mpsslog("%s: %s %d Failed to read sysfs entry: %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - return 0; -} - -static void * -mic_config(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - int fd, ret, stat = 0; - - fd = open_state_fd(mic); - if (fd < 0) { - mpsslog("%s: %s %d open state fd failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - goto exit; - } - - do { - ret = block_till_state_change(fd, mic); - if (ret < 0) { - mpsslog("%s: %s %d block_till_state_change error %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - goto close_exit; - } - - switch (get_mic_state(mic)) { - case MIC_SHUTTING_DOWN: - mic_handle_shutdown(mic); - break; - case MIC_READY: - case MIC_RESET_FAILED: - ret = kill(mic->pid, SIGTERM); - mpsslog("%s: %s %d kill pid %d ret %d\n", - mic->name, __func__, __LINE__, - mic->pid, ret); - if (!ret) { - ret = waitpid(mic->pid, &stat, - WIFSIGNALED(stat)); - mpsslog("%s: %s %d waitpid ret %d pid %d\n", - mic->name, __func__, __LINE__, - ret, mic->pid); - } - if (mic->boot_on_resume) { - setsysfs(mic->name, "state", "boot"); - mic->boot_on_resume = 0; - } - goto close_exit; - default: - break; - } - } while (1); - -close_exit: - close(fd); -exit: - init_mic(mic); - pthread_exit(NULL); -} - -static void -set_cmdline(struct mic_info *mic) -{ - char buffer[PATH_MAX]; - int len; - - len = snprintf(buffer, PATH_MAX, - "clocksource=tsc highres=off nohz=off "); - len += snprintf(buffer + len, PATH_MAX - len, - "cpufreq_on;corec6_off;pc3_off;pc6_off "); - len += snprintf(buffer + len, PATH_MAX - len, - "ifcfg=static;address,172.31.%d.1;netmask,255.255.255.0", - mic->id + 1); - - setsysfs(mic->name, "cmdline", buffer); - mpsslog("%s: Command line: \"%s\"\n", mic->name, buffer); - snprintf(buffer, PATH_MAX, "172.31.%d.1", mic->id + 1); - mpsslog("%s: IPADDR: \"%s\"\n", mic->name, buffer); -} - -static void -set_log_buf_info(struct mic_info *mic) -{ - int fd; - off_t len; - char system_map[] = "/lib/firmware/mic/System.map"; - char *map, *temp, log_buf[17] = {'\0'}; - - fd = open(system_map, O_RDONLY); - if (fd < 0) { - mpsslog("%s: Opening System.map failed: %d\n", - mic->name, errno); - return; - } - len = lseek(fd, 0, SEEK_END); - if (len < 0) { - mpsslog("%s: Reading System.map size failed: %d\n", - mic->name, errno); - close(fd); - return; - } - map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0); - if (map == MAP_FAILED) { - mpsslog("%s: mmap of System.map failed: %d\n", - mic->name, errno); - close(fd); - return; - } - temp = strstr(map, "__log_buf"); - if (!temp) { - mpsslog("%s: __log_buf not found: %d\n", mic->name, errno); - munmap(map, len); - close(fd); - return; - } - strncpy(log_buf, temp - 19, 16); - setsysfs(mic->name, "log_buf_addr", log_buf); - mpsslog("%s: log_buf_addr: %s\n", mic->name, log_buf); - temp = strstr(map, "log_buf_len"); - if (!temp) { - mpsslog("%s: log_buf_len not found: %d\n", mic->name, errno); - munmap(map, len); - close(fd); - return; - } - strncpy(log_buf, temp - 19, 16); - setsysfs(mic->name, "log_buf_len", log_buf); - mpsslog("%s: log_buf_len: %s\n", mic->name, log_buf); - munmap(map, len); - close(fd); -} - -static void -change_virtblk_backend(int x, siginfo_t *siginfo, void *p) -{ - struct mic_info *mic; - - for (mic = mic_list.next; mic != NULL; mic = mic->next) - mic->mic_virtblk.signaled = 1/* true */; -} - -static void -set_mic_boot_params(struct mic_info *mic) -{ - set_log_buf_info(mic); - set_cmdline(mic); -} - -static void * -init_mic(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - struct sigaction ignore = { - .sa_flags = 0, - .sa_handler = SIG_IGN - }; - struct sigaction act = { - .sa_flags = SA_SIGINFO, - .sa_sigaction = change_virtblk_backend, - }; - char buffer[PATH_MAX]; - int err, fd; - - /* - * Currently, one virtio block device is supported for each MIC card - * at a time. Any user (or test) can send a SIGUSR1 to the MIC daemon. - * The signal informs the virtio block backend about a change in the - * configuration file which specifies the virtio backend file name on - * the host. Virtio block backend then re-reads the configuration file - * and switches to the new block device. This signalling mechanism may - * not be required once multiple virtio block devices are supported by - * the MIC daemon. - */ - sigaction(SIGUSR1, &ignore, NULL); -retry: - fd = open_state_fd(mic); - if (fd < 0) { - mpsslog("%s: %s %d open state fd failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - sleep(2); - goto retry; - } - - if (mic->restart) { - snprintf(buffer, PATH_MAX, "boot"); - setsysfs(mic->name, "state", buffer); - mpsslog("%s restarting mic %d\n", - mic->name, mic->restart); - mic->restart = 0; - } - - while (1) { - while (block_till_state_change(fd, mic)) { - mpsslog("%s: %s %d block_till_state_change error %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - sleep(2); - continue; - } - - if (get_mic_state(mic) == MIC_BOOTING) - break; - } - - mic->pid = fork(); - switch (mic->pid) { - case 0: - add_virtio_device(mic, &virtcons_dev_page.dd); - add_virtio_device(mic, &virtnet_dev_page.dd); - err = pthread_create(&mic->mic_console.console_thread, NULL, - virtio_console, mic); - if (err) - mpsslog("%s virtcons pthread_create failed %s\n", - mic->name, strerror(err)); - err = pthread_create(&mic->mic_net.net_thread, NULL, - virtio_net, mic); - if (err) - mpsslog("%s virtnet pthread_create failed %s\n", - mic->name, strerror(err)); - err = pthread_create(&mic->mic_virtblk.block_thread, NULL, - virtio_block, mic); - if (err) - mpsslog("%s virtblk pthread_create failed %s\n", - mic->name, strerror(err)); - sigemptyset(&act.sa_mask); - err = sigaction(SIGUSR1, &act, NULL); - if (err) - mpsslog("%s sigaction SIGUSR1 failed %s\n", - mic->name, strerror(errno)); - while (1) - sleep(60); - case -1: - mpsslog("fork failed MIC name %s id %d errno %d\n", - mic->name, mic->id, errno); - break; - default: - err = pthread_create(&mic->config_thread, NULL, - mic_config, mic); - if (err) - mpsslog("%s mic_config pthread_create failed %s\n", - mic->name, strerror(err)); - } - - return NULL; -} - -static void -start_daemon(void) -{ - struct mic_info *mic; - int err; - - for (mic = mic_list.next; mic; mic = mic->next) { - set_mic_boot_params(mic); - err = pthread_create(&mic->init_thread, NULL, init_mic, mic); - if (err) - mpsslog("%s init_mic pthread_create failed %s\n", - mic->name, strerror(err)); - } - - while (1) - sleep(60); -} - -static int -init_mic_list(void) -{ - struct mic_info *mic = &mic_list; - struct dirent *file; - DIR *dp; - int cnt = 0; - - dp = opendir(MICSYSFSDIR); - if (!dp) - return 0; - - while ((file = readdir(dp)) != NULL) { - if (!strncmp(file->d_name, "mic", 3)) { - mic->next = calloc(1, sizeof(struct mic_info)); - if (mic->next) { - mic = mic->next; - mic->id = atoi(&file->d_name[3]); - mic->name = malloc(strlen(file->d_name) + 16); - if (mic->name) - strcpy(mic->name, file->d_name); - mpsslog("MIC name %s id %d\n", mic->name, - mic->id); - cnt++; - } - } - } - - closedir(dp); - return cnt; -} - -void -mpsslog(char *format, ...) -{ - va_list args; - char buffer[4096]; - char ts[52], *ts1; - time_t t; - - if (logfp == NULL) - return; - - va_start(args, format); - vsprintf(buffer, format, args); - va_end(args); - - time(&t); - ts1 = ctime_r(&t, ts); - ts1[strlen(ts1) - 1] = '\0'; - fprintf(logfp, "%s: %s", ts1, buffer); - - fflush(logfp); -} - -int -main(int argc, char *argv[]) -{ - int cnt; - pid_t pid; - - myname = argv[0]; - - logfp = fopen(LOGFILE_NAME, "a+"); - if (!logfp) { - fprintf(stderr, "cannot open logfile '%s'\n", LOGFILE_NAME); - exit(1); - } - pid = fork(); - switch (pid) { - case 0: - break; - case -1: - exit(2); - default: - exit(0); - } - - mpsslog("MIC Daemon start\n"); - - cnt = init_mic_list(); - if (cnt == 0) { - mpsslog("MIC module not loaded\n"); - exit(3); - } - mpsslog("MIC found %d devices\n", cnt); - - start_daemon(); - - exit(0); -} diff --git a/samples/mic/mpssd/mpssd.h b/samples/mic/mpssd/mpssd.h deleted file mode 100644 index 8bd64944aacc..000000000000 --- a/samples/mic/mpssd/mpssd.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - * Intel MIC User Space Tools. - */ -#ifndef _MPSSD_H_ -#define _MPSSD_H_ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <fcntl.h> -#include <unistd.h> -#include <dirent.h> -#include <libgen.h> -#include <pthread.h> -#include <stdarg.h> -#include <time.h> -#include <errno.h> -#include <sys/dir.h> -#include <sys/ioctl.h> -#include <sys/poll.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sys/mman.h> -#include <sys/utsname.h> -#include <sys/wait.h> -#include <netinet/in.h> -#include <arpa/inet.h> -#include <netdb.h> -#include <pthread.h> -#include <signal.h> -#include <limits.h> -#include <syslog.h> -#include <getopt.h> -#include <net/if.h> -#include <linux/if_tun.h> -#include <linux/if_tun.h> -#include <linux/virtio_ids.h> - -#define MICSYSFSDIR "/sys/class/mic" -#define LOGFILE_NAME "/var/log/mpssd" -#define PAGE_SIZE 4096 - -struct mic_console_info { - pthread_t console_thread; - int virtio_console_fd; - void *console_dp; -}; - -struct mic_net_info { - pthread_t net_thread; - int virtio_net_fd; - int tap_fd; - void *net_dp; -}; - -struct mic_virtblk_info { - pthread_t block_thread; - int virtio_block_fd; - void *block_dp; - volatile sig_atomic_t signaled; - char *backend_file; - int backend; - void *backend_addr; - long backend_size; -}; - -struct mic_info { - int id; - char *name; - pthread_t config_thread; - pthread_t init_thread; - pid_t pid; - struct mic_console_info mic_console; - struct mic_net_info mic_net; - struct mic_virtblk_info mic_virtblk; - int restart; - int boot_on_resume; - struct mic_info *next; -}; - -__attribute__((format(printf, 1, 2))) -void mpsslog(char *format, ...); -char *readsysfs(char *dir, char *entry); -int setsysfs(char *dir, char *entry, char *value); -#endif diff --git a/samples/mic/mpssd/sysfs.c b/samples/mic/mpssd/sysfs.c deleted file mode 100644 index 8dd326936083..000000000000 --- a/samples/mic/mpssd/sysfs.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - * Intel MIC User Space Tools. - */ - -#include "mpssd.h" - -#define PAGE_SIZE 4096 - -char * -readsysfs(char *dir, char *entry) -{ - char filename[PATH_MAX]; - char value[PAGE_SIZE]; - char *string = NULL; - int fd; - int len; - - if (dir == NULL) - snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry); - else - snprintf(filename, PATH_MAX, - "%s/%s/%s", MICSYSFSDIR, dir, entry); - - fd = open(filename, O_RDONLY); - if (fd < 0) { - mpsslog("Failed to open sysfs entry '%s': %s\n", - filename, strerror(errno)); - return NULL; - } - - len = read(fd, value, sizeof(value)); - if (len < 0) { - mpsslog("Failed to read sysfs entry '%s': %s\n", - filename, strerror(errno)); - goto readsys_ret; - } - if (len == 0) - goto readsys_ret; - - value[len - 1] = '\0'; - - string = malloc(strlen(value) + 1); - if (string) - strcpy(string, value); - -readsys_ret: - close(fd); - return string; -} - -int -setsysfs(char *dir, char *entry, char *value) -{ - char filename[PATH_MAX]; - char *oldvalue; - int fd, ret = 0; - - if (dir == NULL) - snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry); - else - snprintf(filename, PATH_MAX, "%s/%s/%s", - MICSYSFSDIR, dir, entry); - - oldvalue = readsysfs(dir, entry); - - fd = open(filename, O_RDWR); - if (fd < 0) { - ret = errno; - mpsslog("Failed to open sysfs entry '%s': %s\n", - filename, strerror(errno)); - goto done; - } - - if (!oldvalue || strcmp(value, oldvalue)) { - if (write(fd, value, strlen(value)) < 0) { - ret = errno; - mpsslog("Failed to write new sysfs entry '%s': %s\n", - filename, strerror(errno)); - } - } - close(fd); -done: - if (oldvalue) - free(oldvalue); - return ret; -} diff --git a/samples/nitro_enclaves/.gitignore b/samples/nitro_enclaves/.gitignore new file mode 100644 index 000000000000..6a718eec71f4 --- /dev/null +++ b/samples/nitro_enclaves/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +/ne_ioctl_sample diff --git a/samples/nitro_enclaves/Makefile b/samples/nitro_enclaves/Makefile new file mode 100644 index 000000000000..a3ec78fefb52 --- /dev/null +++ b/samples/nitro_enclaves/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +# Enclave lifetime management support for Nitro Enclaves (NE) - ioctl sample +# usage. + +.PHONY: all clean + +CFLAGS += -Wall + +all: + $(CC) $(CFLAGS) -o ne_ioctl_sample ne_ioctl_sample.c -lpthread + +clean: + rm -f ne_ioctl_sample diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c new file mode 100644 index 000000000000..765b131c7319 --- /dev/null +++ b/samples/nitro_enclaves/ne_ioctl_sample.c @@ -0,0 +1,882 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + */ + +/** + * DOC: Sample flow of using the ioctl interface provided by the Nitro Enclaves (NE) + * kernel driver. + * + * Usage + * ----- + * + * Load the nitro_enclaves module, setting also the enclave CPU pool. The + * enclave CPUs need to be full cores from the same NUMA node. CPU 0 and its + * siblings have to remain available for the primary / parent VM, so they + * cannot be included in the enclave CPU pool. + * + * See the cpu list section from the kernel documentation. + * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists + * + * insmod drivers/virt/nitro_enclaves/nitro_enclaves.ko + * lsmod + * + * The CPU pool can be set at runtime, after the kernel module is loaded. + * + * echo <cpu-list> > /sys/module/nitro_enclaves/parameters/ne_cpus + * + * NUMA and CPU siblings information can be found using: + * + * lscpu + * /proc/cpuinfo + * + * Check the online / offline CPU list. The CPUs from the pool should be + * offlined. + * + * lscpu + * + * Check dmesg for any warnings / errors through the NE driver lifetime / usage. + * The NE logs contain the "nitro_enclaves" or "pci 0000:00:02.0" pattern. + * + * dmesg + * + * Setup hugetlbfs huge pages. The memory needs to be from the same NUMA node as + * the enclave CPUs. + * + * https://www.kernel.org/doc/html/latest/admin-guide/mm/hugetlbpage.html + * + * By default, the allocation of hugetlb pages are distributed on all possible + * NUMA nodes. Use the following configuration files to set the number of huge + * pages from a NUMA node: + * + * /sys/devices/system/node/node<X>/hugepages/hugepages-2048kB/nr_hugepages + * /sys/devices/system/node/node<X>/hugepages/hugepages-1048576kB/nr_hugepages + * + * or, if not on a system with multiple NUMA nodes, can also set the number + * of 2 MiB / 1 GiB huge pages using + * + * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + * /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages + * + * In this example 256 hugepages of 2 MiB are used. + * + * Build and run the NE sample. + * + * make -C samples/nitro_enclaves clean + * make -C samples/nitro_enclaves + * ./samples/nitro_enclaves/ne_ioctl_sample <path_to_enclave_image> + * + * Unload the nitro_enclaves module. + * + * rmmod nitro_enclaves + * lsmod + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <poll.h> +#include <pthread.h> +#include <string.h> +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <linux/mman.h> +#include <linux/nitro_enclaves.h> +#include <linux/vm_sockets.h> + +/** + * NE_DEV_NAME - Nitro Enclaves (NE) misc device that provides the ioctl interface. + */ +#define NE_DEV_NAME "/dev/nitro_enclaves" + +/** + * NE_POLL_WAIT_TIME - Timeout in seconds for each poll event. + */ +#define NE_POLL_WAIT_TIME (60) +/** + * NE_POLL_WAIT_TIME_MS - Timeout in milliseconds for each poll event. + */ +#define NE_POLL_WAIT_TIME_MS (NE_POLL_WAIT_TIME * 1000) + +/** + * NE_SLEEP_TIME - Amount of time in seconds for the process to keep the enclave alive. + */ +#define NE_SLEEP_TIME (300) + +/** + * NE_DEFAULT_NR_VCPUS - Default number of vCPUs set for an enclave. + */ +#define NE_DEFAULT_NR_VCPUS (2) + +/** + * NE_MIN_MEM_REGION_SIZE - Minimum size of a memory region - 2 MiB. + */ +#define NE_MIN_MEM_REGION_SIZE (2 * 1024 * 1024) + +/** + * NE_DEFAULT_NR_MEM_REGIONS - Default number of memory regions of 2 MiB set for + * an enclave. + */ +#define NE_DEFAULT_NR_MEM_REGIONS (256) + +/** + * NE_IMAGE_LOAD_HEARTBEAT_CID - Vsock CID for enclave image loading heartbeat logic. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_CID (3) +/** + * NE_IMAGE_LOAD_HEARTBEAT_PORT - Vsock port for enclave image loading heartbeat logic. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_PORT (9000) +/** + * NE_IMAGE_LOAD_HEARTBEAT_VALUE - Heartbeat value for enclave image loading. + */ +#define NE_IMAGE_LOAD_HEARTBEAT_VALUE (0xb7) + +/** + * struct ne_user_mem_region - User space memory region set for an enclave. + * @userspace_addr: Address of the user space memory region. + * @memory_size: Size of the user space memory region. + */ +struct ne_user_mem_region { + void *userspace_addr; + size_t memory_size; +}; + +/** + * ne_create_vm() - Create a slot for the enclave VM. + * @ne_dev_fd: The file descriptor of the NE misc device. + * @slot_uid: The generated slot uid for the enclave. + * @enclave_fd : The generated file descriptor for the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd) +{ + int rc = -EINVAL; + *enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, slot_uid); + + if (*enclave_fd < 0) { + rc = *enclave_fd; + switch (errno) { + case NE_ERR_NO_CPUS_AVAIL_IN_POOL: { + printf("Error in create VM, no CPUs available in the NE CPU pool\n"); + + break; + } + + default: + printf("Error in create VM [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_poll_enclave_fd() - Thread function for polling the enclave fd. + * @data: Argument provided for the polling function. + * + * Context: Process context. + * Return: + * * NULL on success / failure. + */ +void *ne_poll_enclave_fd(void *data) +{ + int enclave_fd = *(int *)data; + struct pollfd fds[1] = {}; + int i = 0; + int rc = -EINVAL; + + printf("Running from poll thread, enclave fd %d\n", enclave_fd); + + fds[0].fd = enclave_fd; + fds[0].events = POLLIN | POLLERR | POLLHUP; + + /* Keep on polling until the current process is terminated. */ + while (1) { + printf("[iter %d] Polling ...\n", i); + + rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS); + if (rc < 0) { + printf("Error in poll [%m]\n"); + + return NULL; + } + + i++; + + if (!rc) { + printf("Poll: %d seconds elapsed\n", + i * NE_POLL_WAIT_TIME); + + continue; + } + + printf("Poll received value 0x%x\n", fds[0].revents); + + if (fds[0].revents & POLLHUP) { + printf("Received POLLHUP\n"); + + return NULL; + } + + if (fds[0].revents & POLLNVAL) { + printf("Received POLLNVAL\n"); + + return NULL; + } + } + + return NULL; +} + +/** + * ne_alloc_user_mem_region() - Allocate a user space memory region for an enclave. + * @ne_user_mem_region: User space memory region allocated using hugetlbfs. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_alloc_user_mem_region(struct ne_user_mem_region *ne_user_mem_region) +{ + /** + * Check available hugetlb encodings for different huge page sizes in + * include/uapi/linux/mman.h. + */ + ne_user_mem_region->userspace_addr = mmap(NULL, ne_user_mem_region->memory_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | + MAP_HUGETLB | MAP_HUGE_2MB, -1, 0); + if (ne_user_mem_region->userspace_addr == MAP_FAILED) { + printf("Error in mmap memory [%m]\n"); + + return -1; + } + + return 0; +} + +/** + * ne_load_enclave_image() - Place the enclave image in the enclave memory. + * @enclave_fd : The file descriptor associated with the enclave. + * @ne_user_mem_regions: User space memory regions allocated for the enclave. + * @enclave_image_path : The file path of the enclave image. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_load_enclave_image(int enclave_fd, struct ne_user_mem_region ne_user_mem_regions[], + char *enclave_image_path) +{ + unsigned char *enclave_image = NULL; + int enclave_image_fd = -1; + size_t enclave_image_size = 0; + size_t enclave_memory_size = 0; + unsigned long i = 0; + size_t image_written_bytes = 0; + struct ne_image_load_info image_load_info = { + .flags = NE_EIF_IMAGE, + }; + struct stat image_stat_buf = {}; + int rc = -EINVAL; + size_t temp_image_offset = 0; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) + enclave_memory_size += ne_user_mem_regions[i].memory_size; + + rc = stat(enclave_image_path, &image_stat_buf); + if (rc < 0) { + printf("Error in get image stat info [%m]\n"); + + return rc; + } + + enclave_image_size = image_stat_buf.st_size; + + if (enclave_memory_size < enclave_image_size) { + printf("The enclave memory is smaller than the enclave image size\n"); + + return -ENOMEM; + } + + rc = ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &image_load_info); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in get image load info, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in get image load info, provided invalid flag\n"); + + break; + } + + default: + printf("Error in get image load info [%m]\n"); + } + + return rc; + } + + printf("Enclave image offset in enclave memory is %lld\n", + image_load_info.memory_offset); + + enclave_image_fd = open(enclave_image_path, O_RDONLY); + if (enclave_image_fd < 0) { + printf("Error in open enclave image file [%m]\n"); + + return enclave_image_fd; + } + + enclave_image = mmap(NULL, enclave_image_size, PROT_READ, + MAP_PRIVATE, enclave_image_fd, 0); + if (enclave_image == MAP_FAILED) { + printf("Error in mmap enclave image [%m]\n"); + + return -1; + } + + temp_image_offset = image_load_info.memory_offset; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + size_t bytes_to_write = 0; + size_t memory_offset = 0; + size_t memory_size = ne_user_mem_regions[i].memory_size; + size_t remaining_bytes = 0; + void *userspace_addr = ne_user_mem_regions[i].userspace_addr; + + if (temp_image_offset >= memory_size) { + temp_image_offset -= memory_size; + + continue; + } else if (temp_image_offset != 0) { + memory_offset = temp_image_offset; + memory_size -= temp_image_offset; + temp_image_offset = 0; + } + + remaining_bytes = enclave_image_size - image_written_bytes; + bytes_to_write = memory_size < remaining_bytes ? + memory_size : remaining_bytes; + + memcpy(userspace_addr + memory_offset, + enclave_image + image_written_bytes, bytes_to_write); + + image_written_bytes += bytes_to_write; + + if (image_written_bytes == enclave_image_size) + break; + } + + munmap(enclave_image, enclave_image_size); + + close(enclave_image_fd); + + return 0; +} + +/** + * ne_set_user_mem_region() - Set a user space memory region for the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @ne_user_mem_region : User space memory region to be set for the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_set_user_mem_region(int enclave_fd, struct ne_user_mem_region ne_user_mem_region) +{ + struct ne_user_memory_region mem_region = { + .flags = NE_DEFAULT_MEMORY_REGION, + .memory_size = ne_user_mem_region.memory_size, + .userspace_addr = (__u64)ne_user_mem_region.userspace_addr, + }; + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &mem_region); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in set user memory region, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_MEM_REGION_SIZE: { + printf("Error in set user memory region, mem size not multiple of 2 MiB\n"); + + break; + } + + case NE_ERR_INVALID_MEM_REGION_ADDR: { + printf("Error in set user memory region, invalid user space address\n"); + + break; + } + + case NE_ERR_UNALIGNED_MEM_REGION_ADDR: { + printf("Error in set user memory region, unaligned user space address\n"); + + break; + } + + case NE_ERR_MEM_REGION_ALREADY_USED: { + printf("Error in set user memory region, memory region already used\n"); + + break; + } + + case NE_ERR_MEM_NOT_HUGE_PAGE: { + printf("Error in set user memory region, not backed by huge pages\n"); + + break; + } + + case NE_ERR_MEM_DIFFERENT_NUMA_NODE: { + printf("Error in set user memory region, different NUMA node than CPUs\n"); + + break; + } + + case NE_ERR_MEM_MAX_REGIONS: { + printf("Error in set user memory region, max memory regions reached\n"); + + break; + } + + case NE_ERR_INVALID_PAGE_SIZE: { + printf("Error in set user memory region, has page not multiple of 2 MiB\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in set user memory region, provided invalid flag\n"); + + break; + } + + default: + printf("Error in set user memory region [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_free_mem_regions() - Unmap all the user space memory regions that were set + * aside for the enclave. + * @ne_user_mem_regions: The user space memory regions associated with an enclave. + * + * Context: Process context. + */ +static void ne_free_mem_regions(struct ne_user_mem_region ne_user_mem_regions[]) +{ + unsigned int i = 0; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) + munmap(ne_user_mem_regions[i].userspace_addr, + ne_user_mem_regions[i].memory_size); +} + +/** + * ne_add_vcpu() - Add a vCPU to the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @vcpu_id: vCPU id to be set for the enclave, either provided or + * auto-generated (if provided vCPU id is 0). + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id) +{ + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_ADD_VCPU, vcpu_id); + if (rc < 0) { + switch (errno) { + case NE_ERR_NO_CPUS_AVAIL_IN_POOL: { + printf("Error in add vcpu, no CPUs available in the NE CPU pool\n"); + + break; + } + + case NE_ERR_VCPU_ALREADY_USED: { + printf("Error in add vcpu, the provided vCPU is already used\n"); + + break; + } + + case NE_ERR_VCPU_NOT_IN_CPU_POOL: { + printf("Error in add vcpu, the provided vCPU is not in the NE CPU pool\n"); + + break; + } + + case NE_ERR_VCPU_INVALID_CPU_CORE: { + printf("Error in add vcpu, the core id of the provided vCPU is invalid\n"); + + break; + } + + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in add vcpu, enclave not in init state\n"); + + break; + } + + case NE_ERR_INVALID_VCPU: { + printf("Error in add vcpu, the provided vCPU is out of avail CPUs range\n"); + + break; + } + + default: + printf("Error in add vcpu [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_start_enclave() - Start the given enclave. + * @enclave_fd : The file descriptor associated with the enclave. + * @enclave_start_info : Enclave metadata used for starting e.g. vsock CID. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *enclave_start_info) +{ + int rc = -EINVAL; + + rc = ioctl(enclave_fd, NE_START_ENCLAVE, enclave_start_info); + if (rc < 0) { + switch (errno) { + case NE_ERR_NOT_IN_INIT_STATE: { + printf("Error in start enclave, enclave not in init state\n"); + + break; + } + + case NE_ERR_NO_MEM_REGIONS_ADDED: { + printf("Error in start enclave, no memory regions have been added\n"); + + break; + } + + case NE_ERR_NO_VCPUS_ADDED: { + printf("Error in start enclave, no vCPUs have been added\n"); + + break; + } + + case NE_ERR_FULL_CORES_NOT_USED: { + printf("Error in start enclave, enclave has no full cores set\n"); + + break; + } + + case NE_ERR_ENCLAVE_MEM_MIN_SIZE: { + printf("Error in start enclave, enclave memory is less than min size\n"); + + break; + } + + case NE_ERR_INVALID_FLAG_VALUE: { + printf("Error in start enclave, provided invalid flag\n"); + + break; + } + + case NE_ERR_INVALID_ENCLAVE_CID: { + printf("Error in start enclave, provided invalid enclave CID\n"); + + break; + } + + default: + printf("Error in start enclave [%m]\n"); + } + + return rc; + } + + return 0; +} + +/** + * ne_start_enclave_check_booted() - Start the enclave and wait for a heartbeat + * from it, on a newly created vsock channel, + * to check it has booted. + * @enclave_fd : The file descriptor associated with the enclave. + * + * Context: Process context. + * Return: + * * 0 on success. + * * Negative return value on failure. + */ +static int ne_start_enclave_check_booted(int enclave_fd) +{ + struct sockaddr_vm client_vsock_addr = {}; + int client_vsock_fd = -1; + socklen_t client_vsock_len = sizeof(client_vsock_addr); + struct ne_enclave_start_info enclave_start_info = {}; + struct pollfd fds[1] = {}; + int rc = -EINVAL; + unsigned char recv_buf = 0; + struct sockaddr_vm server_vsock_addr = { + .svm_family = AF_VSOCK, + .svm_cid = NE_IMAGE_LOAD_HEARTBEAT_CID, + .svm_port = NE_IMAGE_LOAD_HEARTBEAT_PORT, + }; + int server_vsock_fd = -1; + + server_vsock_fd = socket(AF_VSOCK, SOCK_STREAM, 0); + if (server_vsock_fd < 0) { + rc = server_vsock_fd; + + printf("Error in socket [%m]\n"); + + return rc; + } + + rc = bind(server_vsock_fd, (struct sockaddr *)&server_vsock_addr, + sizeof(server_vsock_addr)); + if (rc < 0) { + printf("Error in bind [%m]\n"); + + goto out; + } + + rc = listen(server_vsock_fd, 1); + if (rc < 0) { + printf("Error in listen [%m]\n"); + + goto out; + } + + rc = ne_start_enclave(enclave_fd, &enclave_start_info); + if (rc < 0) + goto out; + + printf("Enclave started, CID %llu\n", enclave_start_info.enclave_cid); + + fds[0].fd = server_vsock_fd; + fds[0].events = POLLIN; + + rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS); + if (rc < 0) { + printf("Error in poll [%m]\n"); + + goto out; + } + + if (!rc) { + printf("Poll timeout, %d seconds elapsed\n", NE_POLL_WAIT_TIME); + + rc = -ETIMEDOUT; + + goto out; + } + + if ((fds[0].revents & POLLIN) == 0) { + printf("Poll received value %d\n", fds[0].revents); + + rc = -EINVAL; + + goto out; + } + + rc = accept(server_vsock_fd, (struct sockaddr *)&client_vsock_addr, + &client_vsock_len); + if (rc < 0) { + printf("Error in accept [%m]\n"); + + goto out; + } + + client_vsock_fd = rc; + + /* + * Read the heartbeat value that the init process in the enclave sends + * after vsock connect. + */ + rc = read(client_vsock_fd, &recv_buf, sizeof(recv_buf)); + if (rc < 0) { + printf("Error in read [%m]\n"); + + goto out; + } + + if (rc != sizeof(recv_buf) || recv_buf != NE_IMAGE_LOAD_HEARTBEAT_VALUE) { + printf("Read %d instead of %d\n", recv_buf, + NE_IMAGE_LOAD_HEARTBEAT_VALUE); + + goto out; + } + + /* Write the heartbeat value back. */ + rc = write(client_vsock_fd, &recv_buf, sizeof(recv_buf)); + if (rc < 0) { + printf("Error in write [%m]\n"); + + goto out; + } + + rc = 0; + +out: + close(server_vsock_fd); + + return rc; +} + +int main(int argc, char *argv[]) +{ + int enclave_fd = -1; + unsigned int i = 0; + int ne_dev_fd = -1; + struct ne_user_mem_region ne_user_mem_regions[NE_DEFAULT_NR_MEM_REGIONS] = {}; + unsigned int ne_vcpus[NE_DEFAULT_NR_VCPUS] = {}; + int rc = -EINVAL; + pthread_t thread_id = 0; + unsigned long slot_uid = 0; + + if (argc != 2) { + printf("Usage: %s <path_to_enclave_image>\n", argv[0]); + + exit(EXIT_FAILURE); + } + + if (strlen(argv[1]) >= PATH_MAX) { + printf("The size of the path to enclave image is higher than max path\n"); + + exit(EXIT_FAILURE); + } + + ne_dev_fd = open(NE_DEV_NAME, O_RDWR | O_CLOEXEC); + if (ne_dev_fd < 0) { + printf("Error in open NE device [%m]\n"); + + exit(EXIT_FAILURE); + } + + printf("Creating enclave slot ...\n"); + + rc = ne_create_vm(ne_dev_fd, &slot_uid, &enclave_fd); + + close(ne_dev_fd); + + if (rc < 0) + exit(EXIT_FAILURE); + + printf("Enclave fd %d\n", enclave_fd); + + rc = pthread_create(&thread_id, NULL, ne_poll_enclave_fd, (void *)&enclave_fd); + if (rc < 0) { + printf("Error in thread create [%m]\n"); + + close(enclave_fd); + + exit(EXIT_FAILURE); + } + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + ne_user_mem_regions[i].memory_size = NE_MIN_MEM_REGION_SIZE; + + rc = ne_alloc_user_mem_region(&ne_user_mem_regions[i]); + if (rc < 0) { + printf("Error in alloc userspace memory region, iter %d\n", i); + + goto release_enclave_fd; + } + } + + rc = ne_load_enclave_image(enclave_fd, ne_user_mem_regions, argv[1]); + if (rc < 0) + goto release_enclave_fd; + + for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { + rc = ne_set_user_mem_region(enclave_fd, ne_user_mem_regions[i]); + if (rc < 0) { + printf("Error in set memory region, iter %d\n", i); + + goto release_enclave_fd; + } + } + + printf("Enclave memory regions were added\n"); + + for (i = 0; i < NE_DEFAULT_NR_VCPUS; i++) { + /* + * The vCPU is chosen from the enclave vCPU pool, if the value + * of the vcpu_id is 0. + */ + ne_vcpus[i] = 0; + rc = ne_add_vcpu(enclave_fd, &ne_vcpus[i]); + if (rc < 0) { + printf("Error in add vcpu, iter %d\n", i); + + goto release_enclave_fd; + } + + printf("Added vCPU %d to the enclave\n", ne_vcpus[i]); + } + + printf("Enclave vCPUs were added\n"); + + rc = ne_start_enclave_check_booted(enclave_fd); + if (rc < 0) { + printf("Error in the enclave start / image loading heartbeat logic [rc=%d]\n", rc); + + goto release_enclave_fd; + } + + printf("Entering sleep for %d seconds ...\n", NE_SLEEP_TIME); + + sleep(NE_SLEEP_TIME); + + close(enclave_fd); + + ne_free_mem_regions(ne_user_mem_regions); + + exit(EXIT_SUCCESS); + +release_enclave_fd: + close(enclave_fd); + ne_free_mem_regions(ne_user_mem_regions); + + exit(EXIT_FAILURE); +} diff --git a/samples/pfsm/.gitignore b/samples/pfsm/.gitignore new file mode 100644 index 000000000000..f350a030a060 --- /dev/null +++ b/samples/pfsm/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +/pfsm-wakeup diff --git a/samples/pfsm/Makefile b/samples/pfsm/Makefile new file mode 100644 index 000000000000..213e8d9f5dbc --- /dev/null +++ b/samples/pfsm/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += pfsm-wakeup + +userccflags += -I usr/include diff --git a/samples/pfsm/pfsm-wakeup.c b/samples/pfsm/pfsm-wakeup.c new file mode 100644 index 000000000000..299dd9e1f607 --- /dev/null +++ b/samples/pfsm/pfsm-wakeup.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * TPS6594 PFSM userspace example + * + * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/ + * + * This example shows how to use PFSMs from a userspace application, + * on TI j721s2 platform. The PMIC is armed to be triggered by a RTC + * alarm to execute state transition (RETENTION to ACTIVE). + */ + +#include <fcntl.h> +#include <stdio.h> +#include <sys/ioctl.h> +#include <unistd.h> + +#include <linux/rtc.h> +#include <linux/tps6594_pfsm.h> + +#define ALARM_DELTA_SEC 30 + +#define RTC_A "/dev/rtc0" + +#define PMIC_NB 3 +#define PMIC_A "/dev/pfsm-0-0x48" +#define PMIC_B "/dev/pfsm-0-0x4c" +#define PMIC_C "/dev/pfsm-2-0x58" + +static const char * const dev_pfsm[] = {PMIC_A, PMIC_B, PMIC_C}; + +int main(int argc, char *argv[]) +{ + int i, ret, fd_rtc, fd_pfsm[PMIC_NB] = { 0 }; + struct rtc_time rtc_tm; + struct pmic_state_opt pmic_opt = { 0 }; + unsigned long data; + + fd_rtc = open(RTC_A, O_RDONLY); + if (fd_rtc < 0) { + perror("Failed to open RTC device."); + goto out; + } + + for (i = 0 ; i < PMIC_NB ; i++) { + fd_pfsm[i] = open(dev_pfsm[i], O_RDWR); + if (fd_pfsm[i] < 0) { + perror("Failed to open PFSM device."); + goto out; + } + } + + /* Read RTC date/time */ + ret = ioctl(fd_rtc, RTC_RD_TIME, &rtc_tm); + if (ret < 0) { + perror("Failed to read RTC date/time."); + goto out; + } + printf("Current RTC date/time is %d-%d-%d, %02d:%02d:%02d.\n", + rtc_tm.tm_mday, rtc_tm.tm_mon + 1, rtc_tm.tm_year + 1900, + rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec); + + /* Set RTC alarm to ALARM_DELTA_SEC sec in the future, and check for rollover */ + rtc_tm.tm_sec += ALARM_DELTA_SEC; + if (rtc_tm.tm_sec >= 60) { + rtc_tm.tm_sec %= 60; + rtc_tm.tm_min++; + } + if (rtc_tm.tm_min == 60) { + rtc_tm.tm_min = 0; + rtc_tm.tm_hour++; + } + if (rtc_tm.tm_hour == 24) + rtc_tm.tm_hour = 0; + ret = ioctl(fd_rtc, RTC_ALM_SET, &rtc_tm); + if (ret < 0) { + perror("Failed to set RTC alarm."); + goto out; + } + + /* Enable alarm interrupts */ + ret = ioctl(fd_rtc, RTC_AIE_ON, 0); + if (ret < 0) { + perror("Failed to enable alarm interrupts."); + goto out; + } + printf("Waiting %d seconds for alarm...\n", ALARM_DELTA_SEC); + + /* + * Set RETENTION state with options for PMIC_C/B/A respectively. + * Since PMIC_A is master, it should be the last one to be configured. + */ + pmic_opt.ddr_retention = 1; + for (i = PMIC_NB - 1 ; i >= 0 ; i--) { + printf("Set RETENTION state for PMIC_%d.\n", i); + sleep(1); + ret = ioctl(fd_pfsm[i], PMIC_SET_RETENTION_STATE, &pmic_opt); + if (ret < 0) { + perror("Failed to set RETENTION state."); + goto out_reset; + } + } + + /* This blocks until the alarm ring causes an interrupt */ + ret = read(fd_rtc, &data, sizeof(unsigned long)); + if (ret < 0) + perror("Failed to get RTC alarm."); + else + puts("Alarm rang.\n"); + +out_reset: + ioctl(fd_rtc, RTC_AIE_OFF, 0); + + /* Set ACTIVE state for PMIC_A */ + ioctl(fd_pfsm[0], PMIC_SET_ACTIVE_STATE, 0); + +out: + for (i = 0 ; i < PMIC_NB ; i++) + if (fd_pfsm[i]) + close(fd_pfsm[i]); + + if (fd_rtc) + close(fd_rtc); + + return 0; +} diff --git a/samples/pidfd/.gitignore b/samples/pidfd/.gitignore new file mode 100644 index 000000000000..d4cfa3176b1b --- /dev/null +++ b/samples/pidfd/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/pidfd-metadata diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile new file mode 100644 index 000000000000..9754e2d81f70 --- /dev/null +++ b/samples/pidfd/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +usertprogs-always-y += pidfd-metadata + +userccflags += -I usr/include diff --git a/samples/pidfd/pidfd-metadata.c b/samples/pidfd/pidfd-metadata.c new file mode 100644 index 000000000000..c459155daf9a --- /dev/null +++ b/samples/pidfd/pidfd-metadata.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <inttypes.h> +#include <limits.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#ifndef CLONE_PIDFD +#define CLONE_PIDFD 0x00001000 +#endif + +#ifndef __NR_pidfd_send_signal +#define __NR_pidfd_send_signal -1 +#endif + +static int do_child(void *args) +{ + printf("%d\n", getpid()); + _exit(EXIT_SUCCESS); +} + +static pid_t pidfd_clone(int flags, int *pidfd) +{ + size_t stack_size = 1024; + char *stack[1024] = { 0 }; + +#ifdef __ia64__ + return __clone2(do_child, stack, stack_size, flags | SIGCHLD, NULL, pidfd); +#else + return clone(do_child, stack + stack_size, flags | SIGCHLD, NULL, pidfd); +#endif +} + +static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, + unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int pidfd_metadata_fd(pid_t pid, int pidfd) +{ + int procfd, ret; + char path[100]; + + snprintf(path, sizeof(path), "/proc/%d", pid); + procfd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (procfd < 0) { + warn("Failed to open %s\n", path); + return -1; + } + + /* + * Verify that the pid has not been recycled and our /proc/<pid> handle + * is still valid. + */ + ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); + if (ret < 0) { + switch (errno) { + case EPERM: + /* Process exists, just not allowed to signal it. */ + break; + default: + warn("Failed to signal process\n"); + close(procfd); + procfd = -1; + } + } + + return procfd; +} + +int main(int argc, char *argv[]) +{ + int pidfd = -1, ret = EXIT_FAILURE; + char buf[4096] = { 0 }; + pid_t pid; + int procfd, statusfd; + ssize_t bytes; + + pid = pidfd_clone(CLONE_PIDFD, &pidfd); + if (pid < 0) + err(ret, "CLONE_PIDFD"); + if (pidfd == -1) { + warnx("CLONE_PIDFD is not supported by the kernel"); + goto out; + } + + procfd = pidfd_metadata_fd(pid, pidfd); + close(pidfd); + if (procfd < 0) + goto out; + + statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC); + close(procfd); + if (statusfd < 0) + goto out; + + bytes = read(statusfd, buf, sizeof(buf)); + if (bytes > 0) + bytes = write(STDOUT_FILENO, buf, bytes); + close(statusfd); + ret = EXIT_SUCCESS; + +out: + (void)wait(NULL); + + exit(ret); +} diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst index ff8929da61c5..f4adeed5f5f0 100644 --- a/samples/pktgen/README.rst +++ b/samples/pktgen/README.rst @@ -3,7 +3,7 @@ Sample and benchmark scripts for pktgen (packet generator) This directory contains some pktgen sample and benchmark scripts, that can easily be copied and adjusted for your own use-case. -General doc is located in kernel: Documentation/networking/pktgen.txt +General doc is located in kernel: Documentation/networking/pktgen.rst Helper include files ==================== @@ -18,8 +18,9 @@ across the sample scripts. Usage example is printed on errors:: Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX -i : ($DEV) output interface/device (required) -s : ($PKT_SIZE) packet size - -d : ($DEST_IP) destination IP + -d : ($DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed -m : ($DST_MAC) destination MAC-addr + -p : ($DST_PORT) destination PORT range (e.g. 433-444) is also allowed -t : ($THREADS) threads to start -f : ($F_THREAD) index of first thread (zero indexed CPU number) -c : ($SKB_CLONE) SKB clones send before alloc new SKB @@ -27,10 +28,28 @@ across the sample scripts. Usage example is printed on errors:: -b : ($BURST) HW level bursting of SKBs -v : ($VERBOSE) verbose -x : ($DEBUG) debug + -6 : ($IP6) IPv6 + -w : ($DELAY) Tx Delay value (ns) + -a : ($APPEND) Script will not reset generator's state, but will append its config The global variable being set is also listed. E.g. the required interface/device parameter "-i" sets variable $DEV. +"-a" parameter may be used to create different flows simultaneously. +In this mode script will keep the existing config, will append its settings. +In this mode you'll have to manually run traffic with "pg_ctrl start". + +For example you may use: + + source ./samples/pktgen/functions.sh + pg_ctrl reset + # add first device + ./pktgen_sample06_numa_awared_queue_irq_affinity.sh -a -i ens1f0 -m 34:80:0d:a3:fc:c9 -t 8 + # add second device + ./pktgen_sample06_numa_awared_queue_irq_affinity.sh -a -i ens1f1 -m 34:80:0d:a3:fc:c9 -t 8 + # run joint traffic on two devs + pg_ctrl start + Common functions ---------------- The functions.sh file provides; Three different shell functions for diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh index 205e4cde4601..c08cefb8eb1f 100644 --- a/samples/pktgen/functions.sh +++ b/samples/pktgen/functions.sh @@ -5,6 +5,8 @@ # Author: Jesper Dangaaard Brouer # License: GPL +set -o errexit + ## -- General shell logging cmds -- function err() { local exitcode=$1 @@ -58,6 +60,7 @@ function pg_set() { function proc_cmd() { local result local proc_file=$1 + local status=0 # after shift, the remaining args are contained in $@ shift local proc_ctrl=${PROC_DIR}/$proc_file @@ -73,13 +76,13 @@ function proc_cmd() { echo "cmd: $@ > $proc_ctrl" fi # Quoting of "$@" is important for space expansion - echo "$@" > "$proc_ctrl" - local status=$? + echo "$@" > "$proc_ctrl" || status=$? - result=$(grep "Result: OK:" $proc_ctrl) - # Due to pgctrl, cannot use exit code $? from grep - if [[ "$result" == "" ]]; then - grep "Result:" $proc_ctrl >&2 + if [[ "$proc_file" != "pgctrl" ]]; then + result=$(grep "Result: OK:" $proc_ctrl) || true + if [[ "$result" == "" ]]; then + grep "Result:" $proc_ctrl >&2 + fi fi if (( $status != 0 )); then err 5 "Write error($status) occurred cmd: \"$@ > $proc_ctrl\"" @@ -105,6 +108,14 @@ function pgset() { fi } +function trap_exit() +{ + # Cleanup pktgen setup on exit if thats not "append mode" + if [[ -z "$APPEND" ]] && [[ $EUID -eq 0 ]]; then + trap 'pg_ctrl "reset"' EXIT + fi +} + ## -- General shell tricks -- function root_check_run_with_sudo() { @@ -113,9 +124,217 @@ function root_check_run_with_sudo() { if [ "$EUID" -ne 0 ]; then if [ -x $0 ]; then # Directly executable use sudo info "Not root, running with sudo" - sudo "$0" "$@" + sudo -E "$0" "$@" exit $? fi err 4 "cannot perform sudo run of $0" fi } + +# Exact input device's NUMA node info +function get_iface_node() +{ + local node=$(</sys/class/net/$1/device/numa_node) + if [[ $node == -1 ]]; then + echo 0 + else + echo $node + fi +} + +# Given an Dev/iface, get its queues' irq numbers +function get_iface_irqs() +{ + local IFACE=$1 + local queues="${IFACE}-.*TxRx" + + irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:) + [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:) + [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\ + do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\ + done) + [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE" + + echo $irqs +} + +# Given a NUMA node, return cpu ids belonging to it. +function get_node_cpus() +{ + local node=$1 + local node_cpu_list + local node_cpu_range_list=`cut -f1- -d, --output-delimiter=" " \ + /sys/devices/system/node/node$node/cpulist` + + for cpu_range in $node_cpu_range_list + do + node_cpu_list="$node_cpu_list "`seq -s " " ${cpu_range//-/ }` + done + + echo $node_cpu_list +} + +# Check $1 is in between $2, $3 ($2 <= $1 <= $3) +function in_between() { [[ ($1 -ge $2) && ($1 -le $3) ]] ; } + +# Extend shrunken IPv6 address. +# fe80::42:bcff:fe84:e10a => fe80:0:0:0:42:bcff:fe84:e10a +function extend_addr6() +{ + local addr=$1 + local sep=: sep2=:: + local sep_cnt=$(tr -cd $sep <<< $1 | wc -c) + local shrink + + # separator count should be (2 <= $sep_cnt <= 7) + if ! (in_between $sep_cnt 2 7); then + err 5 "Invalid IP6 address: $1" + fi + + # if shrink '::' occurs multiple, it's malformed. + shrink=( $(grep -E -o "$sep{2,}" <<< $addr) ) + if [[ ${#shrink[@]} -ne 0 ]]; then + if [[ ${#shrink[@]} -gt 1 || ( ${shrink[0]} != $sep2 ) ]]; then + err 5 "Invalid IP6 address: $1" + fi + fi + + # add 0 at begin & end, and extend addr by adding :0 + [[ ${addr:0:1} == $sep ]] && addr=0${addr} + [[ ${addr: -1} == $sep ]] && addr=${addr}0 + echo "${addr/$sep2/$(printf ':0%.s' $(seq $[8-sep_cnt])):}" +} + +# Given a single IP(v4/v6) address, whether it is valid. +function validate_addr() +{ + # check function is called with (funcname)6 + [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6 + local bitlen=$[ IP6 ? 128 : 32 ] + local len=$[ IP6 ? 8 : 4 ] + local max=$[ 2**(len*2)-1 ] + local net prefix + local addr sep + + IFS='/' read net prefix <<< $1 + [[ $IP6 ]] && net=$(extend_addr6 $net) + + # if prefix exists, check (0 <= $prefix <= $bitlen) + if [[ -n $prefix ]]; then + if ! (in_between $prefix 0 $bitlen); then + err 5 "Invalid prefix: /$prefix" + fi + fi + + # set separator for each IP(v4/v6) + [[ $IP6 ]] && sep=: || sep=. + IFS=$sep read -a addr <<< $net + + # array length + if [[ ${#addr[@]} != $len ]]; then + err 5 "Invalid IP$IP6 address: $1" + fi + + # check each digit (0 <= $digit <= $max) + for digit in "${addr[@]}"; do + [[ $IP6 ]] && digit=$[ 16#$digit ] + if ! (in_between $digit 0 $max); then + err 5 "Invalid IP$IP6 address: $1" + fi + done + + return 0 +} + +function validate_addr6() { validate_addr $@ ; } + +# Given a single IP(v4/v6) or CIDR, return minimum and maximum IP addr. +function parse_addr() +{ + # check function is called with (funcname)6 + [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6 + local net prefix + local min_ip max_ip + + IFS='/' read net prefix <<< $1 + [[ $IP6 ]] && net=$(extend_addr6 $net) + + if [[ -z $prefix ]]; then + min_ip=$net + max_ip=$net + else + # defining array for converting Decimal 2 Binary + # 00000000 00000001 00000010 00000011 00000100 ... + local d2b='{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}' + [[ $IP6 ]] && d2b+=$d2b + eval local D2B=($d2b) + + local bitlen=$[ IP6 ? 128 : 32 ] + local remain=$[ bitlen-prefix ] + local octet=$[ IP6 ? 16 : 8 ] + local min_mask max_mask + local min max + local ip_bit + local ip sep + + # set separator for each IP(v4/v6) + [[ $IP6 ]] && sep=: || sep=. + IFS=$sep read -ra ip <<< $net + + min_mask="$(printf '1%.s' $(seq $prefix))$(printf '0%.s' $(seq $remain))" + max_mask="$(printf '0%.s' $(seq $prefix))$(printf '1%.s' $(seq $remain))" + + # calculate min/max ip with &,| operator + for i in "${!ip[@]}"; do + digit=$[ IP6 ? 16#${ip[$i]} : ${ip[$i]} ] + ip_bit=${D2B[$digit]} + + idx=$[ octet*i ] + min[$i]=$[ 2#$ip_bit & 2#${min_mask:$idx:$octet} ] + max[$i]=$[ 2#$ip_bit | 2#${max_mask:$idx:$octet} ] + [[ $IP6 ]] && { min[$i]=$(printf '%X' ${min[$i]}); + max[$i]=$(printf '%X' ${max[$i]}); } + done + + min_ip=$(IFS=$sep; echo "${min[*]}") + max_ip=$(IFS=$sep; echo "${max[*]}") + fi + + echo $min_ip $max_ip +} + +function parse_addr6() { parse_addr $@ ; } + +# Given a single or range of port(s), return minimum and maximum port number. +function parse_ports() +{ + local port_str=$1 + local port_list + local min_port + local max_port + + IFS="-" read -ra port_list <<< $port_str + + min_port=${port_list[0]} + max_port=${port_list[1]:-$min_port} + + echo $min_port $max_port +} + +# Given a minimum and maximum port, verify port number. +function validate_ports() +{ + local min_port=$1 + local max_port=$2 + + # 1 <= port <= 65535 + if (in_between $min_port 1 65535); then + if (in_between $max_port 1 65535); then + if [[ $min_port -le $max_port ]]; then + return 0 + fi + fi + fi + + err 5 "Invalid port(s): $min_port-$max_port" +} diff --git a/samples/pktgen/parameters.sh b/samples/pktgen/parameters.sh index 3a6244d5f47a..81906f199454 100644 --- a/samples/pktgen/parameters.sh +++ b/samples/pktgen/parameters.sh @@ -1,4 +1,5 @@ # +# SPDX-License-Identifier: GPL-2.0 # Common parameter parsing for pktgen scripts # @@ -7,8 +8,10 @@ function usage() { echo "Usage: $0 [-vx] -i ethX" echo " -i : (\$DEV) output interface/device (required)" echo " -s : (\$PKT_SIZE) packet size" - echo " -d : (\$DEST_IP) destination IP" + echo " -d : (\$DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed" echo " -m : (\$DST_MAC) destination MAC-addr" + echo " -p : (\$DST_PORT) destination PORT range (e.g. 433-444) is also allowed" + echo " -k : (\$UDP_CSUM) enable UDP tx checksum" echo " -t : (\$THREADS) threads to start" echo " -f : (\$F_THREAD) index of first thread (zero indexed CPU number)" echo " -c : (\$SKB_CLONE) SKB clones send before alloc new SKB" @@ -17,12 +20,14 @@ function usage() { echo " -v : (\$VERBOSE) verbose" echo " -x : (\$DEBUG) debug" echo " -6 : (\$IP6) IPv6" + echo " -w : (\$DELAY) Tx Delay value (ns)" + echo " -a : (\$APPEND) Script will not reset generator's state, but will append its config" echo "" } ## --- Parse command line arguments / parameters --- ## echo "Commandline options:" -while getopts "s:i:d:m:f:t:c:n:b:vxh6" option; do +while getopts "s:i:d:m:p:f:t:c:n:b:w:vxh6ak" option; do case $option in i) # interface export DEV=$OPTARG @@ -40,6 +45,10 @@ while getopts "s:i:d:m:f:t:c:n:b:vxh6" option; do export DST_MAC=$OPTARG info "Destination MAC set to: DST_MAC=$DST_MAC" ;; + p) # PORT + export DST_PORT=$OPTARG + info "Destination PORT set to: DST_PORT=$DST_PORT" + ;; f) export F_THREAD=$OPTARG info "Index of first thread (zero indexed CPU number): $F_THREAD" @@ -60,6 +69,10 @@ while getopts "s:i:d:m:f:t:c:n:b:vxh6" option; do export BURST=$OPTARG info "SKB bursting: BURST=$BURST" ;; + w) + export DELAY=$OPTARG + info "DELAY=$DELAY" + ;; v) export VERBOSE=yes info "Verbose mode: VERBOSE=$VERBOSE" @@ -72,6 +85,14 @@ while getopts "s:i:d:m:f:t:c:n:b:vxh6" option; do export IP6=6 info "IP6: IP6=$IP6" ;; + a) + export APPEND=yes + info "Append mode: APPEND=$APPEND" + ;; + k) + export UDP_CSUM=yes + info "UDP tx checksum: UDP_CSUM=$UDP_CSUM" + ;; h|?|*) usage; err 2 "[ERROR] Unknown parameters!!!" @@ -94,6 +115,9 @@ if [ -z "$THREADS" ]; then export THREADS=1 fi +# default DELAY +[ -z "$DELAY" ] && export DELAY=0 # Zero means max speed + export L_THREAD=$(( THREADS + F_THREAD - 1 )) if [ -z "$DEV" ]; then diff --git a/samples/pktgen/pktgen.conf-1-1-ip6 b/samples/pktgen/pktgen.conf-1-1-ip6 deleted file mode 100755 index 0b9ffd47fd41..000000000000 --- a/samples/pktgen/pktgen.conf-1-1-ip6 +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. -# IPv6. Note increase in minimal packet length - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 - -CLONE_SKB="clone_skb 1000000" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 66" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst6 fec0::1" - pgset "src6 fec0::2" - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen.conf-1-1-ip6-rdos b/samples/pktgen/pktgen.conf-1-1-ip6-rdos deleted file mode 100755 index ad98e5f40776..000000000000 --- a/samples/pktgen/pktgen.conf-1-1-ip6-rdos +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. -# IPv6. Note increase in minimal packet length - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 means maximum speed. - -# We need to do alloc for every skb since we cannot clone here. -CLONE_SKB="clone_skb 0" - -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 66" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst6_min fec0::1" - pgset "dst6_max fec0::FFFF:FFFF" - - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen.conf-1-2 b/samples/pktgen/pktgen.conf-1-2 deleted file mode 100755 index ba4eb26e168d..000000000000 --- a/samples/pktgen/pktgen.conf-1-2 +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# One CPU means one thread. One CPU example. We add eth1, eth2 respectivly. - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - echo "Adding eth2" - pgset "add_device eth2" - - -# device config -# delay 0 means maximum speed. - -CLONE_SKB="clone_skb 1000000" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 60" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst 10.10.11.2" - pgset "dst_mac 00:04:23:08:91:dc" - -PGDEV=/proc/net/pktgen/eth2 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst 192.168.2.2" - pgset "dst_mac 00:04:23:08:91:de" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 /proc/net/pktgen/eth2 diff --git a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh index e5bfe759a0fb..b4328db4a164 100755 --- a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh +++ b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 # # Benchmark script: # - developed for benchmarking ingress qdisc path @@ -32,6 +33,10 @@ root_check_run_with_sudo "$@" # Parameter parsing via include source ${basedir}/parameters.sh + +# Trap EXIT first +trap_exit + # Using invalid DST_MAC will cause the packets to get dropped in # ip_rcv() which is part of the test if [ -z "$DEST_IP" ]; then @@ -40,9 +45,14 @@ fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$BURST" ] && BURST=1024 [ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely - -# Base Config -DELAY="0" # Zero means max speed +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi # General cleanup everything since last run pg_ctrl "reset" @@ -66,7 +76,15 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do # Destination pg_set $dev "dst_mac $DST_MAC" - pg_set $dev "dst$IP6 $DEST_IP" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi # Inject packet into RX path of stack pg_set $dev "xmit_mode netif_receive" @@ -75,14 +93,21 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "burst $BURST" done +# Run if user hits control-c +function print_result() { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + # start_run echo "Running... ctrl^C to stop" >&2 pg_ctrl "start" echo "Done" >&2 -# Print results -for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do - dev=${DEV}@${thread} - echo "Device: $dev" - cat /proc/net/pktgen/$dev | grep -A2 "Result:" -done +print_result diff --git a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh index 1ad878e95539..f2beb512c5cd 100755 --- a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh +++ b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 # # Benchmark script: # - developed for benchmarking egress qdisc path, derived (more @@ -13,6 +14,10 @@ root_check_run_with_sudo "$@" # Parameter parsing via include source ${basedir}/parameters.sh + +# Trap EXIT first +trap_exit + if [ -z "$DEST_IP" ]; then [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" fi @@ -23,9 +28,14 @@ if [[ -n "$BURST" ]]; then err 1 "Bursting not supported for this mode" fi [ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely - -# Base Config -DELAY="0" # Zero means max speed +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi # General cleanup everything since last run pg_ctrl "reset" @@ -49,20 +59,35 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do # Destination pg_set $dev "dst_mac $DST_MAC" - pg_set $dev "dst$IP6 $DEST_IP" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi # Inject packet into TX qdisc egress path of stack pg_set $dev "xmit_mode queue_xmit" done +# Run if user hits control-c +function print_result { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + # start_run echo "Running... ctrl^C to stop" >&2 pg_ctrl "start" echo "Done" >&2 -# Print results -for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do - dev=${DEV}@${thread} - echo "Device: $dev" - cat /proc/net/pktgen/$dev | grep -A2 "Result:" -done +print_result diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh index 35b7fe34bda2..66cb707479e6 100755 --- a/samples/pktgen/pktgen_sample01_simple.sh +++ b/samples/pktgen/pktgen_sample01_simple.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 # # Simple example: # * pktgen sending with single thread and single interface @@ -12,6 +13,10 @@ root_check_run_with_sudo "$@" # - go look in parameters.sh to see which setting are avail # - required param is the interface "-i" stored in $DEV source ${basedir}/parameters.sh + +# Trap EXIT first +trap_exit + # # Set some default params, if they didn't get set if [ -z "$DEST_IP" ]; then @@ -21,21 +26,26 @@ fi # Example enforce param "-m" for dst_mac [ -z "$DST_MAC" ] && usage && err 2 "Must specify -m dst_mac" [ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely - -# Base Config -DELAY="0" # Zero means max speed +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi # Flow variation random source port between min and max -UDP_MIN=9 -UDP_MAX=109 +UDP_SRC_MIN=9 +UDP_SRC_MAX=109 # General cleanup everything since last run # (especially important if other threads were configured by other scripts) -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Add remove all other devices and add_device $DEV to thread 0 thread=0 -pg_thread $thread "rem_device_all" +[ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $DEV # How many packets to send (zero means indefinitely) @@ -56,18 +66,39 @@ pg_set $DEV "flag NO_TIMESTAMP" # Destination pg_set $DEV "dst_mac $DST_MAC" -pg_set $DEV "dst$IP6 $DEST_IP" +pg_set $DEV "dst${IP6}_min $DST_MIN" +pg_set $DEV "dst${IP6}_max $DST_MAX" + +if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $DEV "flag UDPDST_RND" + pg_set $DEV "udp_dst_min $UDP_DST_MIN" + pg_set $DEV "udp_dst_max $UDP_DST_MAX" +fi + +[ ! -z "$UDP_CSUM" ] && pg_set $DEV "flag UDPCSUM" # Setup random UDP port src range pg_set $DEV "flag UDPSRC_RND" -pg_set $DEV "udp_src_min $UDP_MIN" -pg_set $DEV "udp_src_max $UDP_MAX" +pg_set $DEV "udp_src_min $UDP_SRC_MIN" +pg_set $DEV "udp_src_max $UDP_SRC_MAX" + +# Run if user hits control-c +function print_result() { + # Print results + echo "Result device: $DEV" + cat /proc/net/pktgen/$DEV +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT -# start_run -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" -echo "Done" >&2 +if [ -z "$APPEND" ]; then + # start_run + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" + echo "Done" >&2 -# Print results -echo "Result device: $DEV" -cat /proc/net/pktgen/$DEV + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi
\ No newline at end of file diff --git a/samples/pktgen/pktgen_sample02_multiqueue.sh b/samples/pktgen/pktgen_sample02_multiqueue.sh index cbdd3e2bceff..93f33d7d0a81 100755 --- a/samples/pktgen/pktgen_sample02_multiqueue.sh +++ b/samples/pktgen/pktgen_sample02_multiqueue.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 # # Multiqueue: Using pktgen threads for sending on multiple CPUs # * adding devices to kernel threads @@ -13,24 +14,34 @@ root_check_run_with_sudo "$@" # Required param: -i dev in $DEV source ${basedir}/parameters.sh +# Trap EXIT first +trap_exit + [ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely # Base Config -DELAY="0" # Zero means max speed [ -z "$CLONE_SKB" ] && CLONE_SKB="0" # Flow variation random source port between min and max -UDP_MIN=9 -UDP_MAX=109 +UDP_SRC_MIN=9 +UDP_SRC_MAX=109 # (example of setting default params in your script) if [ -z "$DEST_IP" ]; then [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do @@ -39,7 +50,7 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # Notice config queue to map to cpu (mirrors smp_processor_id()) @@ -57,22 +68,43 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do # Destination pg_set $dev "dst_mac $DST_MAC" - pg_set $dev "dst$IP6 $DEST_IP" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" # Setup random UDP port src range pg_set $dev "flag UDPSRC_RND" - pg_set $dev "udp_src_min $UDP_MIN" - pg_set $dev "udp_src_max $UDP_MAX" + pg_set $dev "udp_src_min $UDP_SRC_MIN" + pg_set $dev "udp_src_max $UDP_SRC_MAX" done -# start_run -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" -echo "Done" >&2 +# Run if user hits control-c +function print_result() { + # Print results + for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT -# Print results -for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do - dev=${DEV}@${thread} - echo "Device: $dev" - cat /proc/net/pktgen/$dev | grep -A2 "Result:" -done +if [ -z "$APPEND" ]; then + # start_run + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" + echo "Done" >&2 + + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh index 8d26e0ca683d..8f8ed1ac46a0 100755 --- a/samples/pktgen/pktgen_sample03_burst_single_flow.sh +++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 # # Script for max single flow performance # - If correctly tuned[1], single CPU 10G wirespeed small pkts is possible[2] @@ -24,27 +25,36 @@ root_check_run_with_sudo "$@" # Parameter parsing via include source ${basedir}/parameters.sh + +# Trap EXIT first +trap_exit + # Set some default params, if they didn't get set if [ -z "$DEST_IP" ]; then [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$BURST" ] && BURST=32 -[ -z "$CLONE_SKB" ] && CLONE_SKB="100000" +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely - -# Base Config -DELAY="0" # Zero means max speed +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # Base config @@ -57,7 +67,17 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do # Destination pg_set $dev "dst_mac $DST_MAC" - pg_set $dev "dst$IP6 $DEST_IP" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" # Setup burst, for easy testing -b 0 disable bursting # (internally in pktgen default and minimum burst=1) @@ -69,7 +89,7 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do done # Run if user hits control-c -function control_c() { +function print_result() { # Print results for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} @@ -78,7 +98,13 @@ function control_c() { done } # trap keyboard interrupt (Ctrl-C) -trap control_c SIGINT +trap true SIGINT -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" +if [ -z "$APPEND" ]; then + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" + + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh index 497fb7520464..65ed486ce4f1 100755 --- a/samples/pktgen/pktgen_sample04_many_flows.sh +++ b/samples/pktgen/pktgen_sample04_many_flows.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 # # Script example for many flows testing # @@ -11,11 +12,25 @@ root_check_run_with_sudo "$@" # Parameter parsing via include source ${basedir}/parameters.sh + +# Trap EXIT first +trap_exit + # Set some default params, if they didn't get set -[ -z "$DEST_IP" ] && DEST_IP="198.18.0.42" +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$CLONE_SKB" ] && CLONE_SKB="0" [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi # NOTICE: Script specific settings # ======= @@ -25,22 +40,22 @@ source ${basedir}/parameters.sh [ -z "$FLOWS" ] && FLOWS="8000" [ -z "$FLOWLEN" ] && FLOWLEN="10" -# Base Config -DELAY="0" # Zero means max speed - if [[ -n "$BURST" ]]; then err 1 "Bursting not supported for this mode" fi +# 198.18.0.0 / 198.19.255.255 +read -r SRC_MIN SRC_MAX <<< $(parse_addr 198.18.0.0/15) + # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # Base config @@ -53,12 +68,22 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do # Single destination pg_set $dev "dst_mac $DST_MAC" - pg_set $dev "dst $DEST_IP" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" # Randomize source IP-addresses pg_set $dev "flag IPSRC_RND" - pg_set $dev "src_min 198.18.0.0" - pg_set $dev "src_max 198.19.255.255" + pg_set $dev "src_min $SRC_MIN" + pg_set $dev "src_max $SRC_MAX" # Limit number of flows (max 65535) pg_set $dev "flows $FLOWS" @@ -87,7 +112,11 @@ function print_result() { # trap keyboard interrupt (Ctrl-C) trap true SIGINT -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" +if [ -z "$APPEND" ]; then + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" -print_result + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh index ac9cfd6b2c0a..bcbc386b2284 100755 --- a/samples/pktgen/pktgen_sample05_flow_per_thread.sh +++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh @@ -1,4 +1,5 @@ #!/bin/bash +# SPDX-License-Identifier: GPL-2.0 # # Script will generate one flow per thread (-t N) # - Same destination IP @@ -15,26 +16,36 @@ root_check_run_with_sudo "$@" # Parameter parsing via include source ${basedir}/parameters.sh + +# Trap EXIT first +trap_exit + # Set some default params, if they didn't get set -[ -z "$DEST_IP" ] && DEST_IP="198.18.0.42" +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$CLONE_SKB" ] && CLONE_SKB="0" [ -z "$BURST" ] && BURST=32 [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely - - -# Base Config -DELAY="0" # Zero means max speed +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi # General cleanup everything since last run -pg_ctrl "reset" +[ -z "$APPEND" ] && pg_ctrl "reset" # Threads are specified with parameter -t value in $THREADS for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do dev=${DEV}@${thread} # Add remove all other devices and add_device $dev to thread - pg_thread $thread "rem_device_all" + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" pg_thread $thread "add_device" $dev # Base config @@ -47,7 +58,17 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do # Single destination pg_set $dev "dst_mac $DST_MAC" - pg_set $dev "dst $DEST_IP" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" # Setup source IP-addresses based on thread number pg_set $dev "src_min 198.18.$((thread+1)).1" @@ -75,7 +96,11 @@ function print_result() { # trap keyboard interrupt (Ctrl-C) trap true SIGINT -echo "Running... ctrl^C to stop" >&2 -pg_ctrl "start" +if [ -z "$APPEND" ]; then + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" -print_result + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh new file mode 100755 index 000000000000..0c5409cb5bab --- /dev/null +++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# Multiqueue: Using pktgen threads for sending on multiple CPUs +# * adding devices to kernel threads which are in the same NUMA node +# * bound devices queue's irq affinity to the threads, 1:1 mapping +# * notice the naming scheme for keeping device names unique +# * nameing scheme: dev@thread_number +# * flow variation via random UDP source port +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" +# +# Required param: -i dev in $DEV +source ${basedir}/parameters.sh + +# Trap EXIT first +trap_exit + +# Base Config +[ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" + +# Flow variation random source port between min and max +UDP_SRC_MIN=9 +UDP_SRC_MAX=109 + +node=`get_iface_node $DEV` +irq_array=(`get_iface_irqs $DEV`) +cpu_array=(`get_node_cpus $node`) + +[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \ + err 1 "Thread number $THREADS exceeds: min (${#irq_array[*]},${#cpu_array[*]})" + +# (example of setting default params in your script) +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +if [ -n "$DEST_IP" ]; then + validate_addr${IP6} $DEST_IP + read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) +fi +if [ -n "$DST_PORT" ]; then + read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $UDP_DST_MIN $UDP_DST_MAX +fi + +# General cleanup everything since last run +[ -z "$APPEND" ] && pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((i = 0; i < $THREADS; i++)); do + # The device name is extended with @name, using thread number to + # make then unique, but any name will do. + # Set the queue's irq affinity to this $thread (processor) + # if '-f' is designated, offset cpu id + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list + info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`" + + # Add remove all other devices and add_device $dev to thread + [ -z "$APPEND" ] && pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # select queue and bind the queue and $dev in 1:1 relationship + queue_num=$i + info "queue number is $queue_num" + pg_set $dev "queue_map_min $queue_num" + pg_set $dev "queue_map_max $queue_num" + + # Notice config queue to map to cpu (mirrors smp_processor_id()) + # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number + pg_set $dev "flag QUEUE_MAP_CPU" + + # Base config of dev + pg_set $dev "count $COUNT" + pg_set $dev "clone_skb $CLONE_SKB" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + + # Flag example disabling timestamping + pg_set $dev "flag NO_TIMESTAMP" + + # Destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst${IP6}_min $DST_MIN" + pg_set $dev "dst${IP6}_max $DST_MAX" + + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $UDP_DST_MIN" + pg_set $dev "udp_dst_max $UDP_DST_MAX" + fi + + [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" + + # Setup random UDP port src range + pg_set $dev "flag UDPSRC_RND" + pg_set $dev "udp_src_min $UDP_SRC_MIN" + pg_set $dev "udp_src_max $UDP_SRC_MAX" +done + +# Run if user hits control-c +function print_result() { + # Print results + for ((i = 0; i < $THREADS; i++)); do + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" + done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + +# start_run +if [ -z "$APPEND" ]; then + echo "Running... ctrl^C to stop" >&2 + pg_ctrl "start" + echo "Done" >&2 + + print_result +else + echo "Append mode: config done. Do more or use 'pg_ctrl start' to run" +fi diff --git a/samples/qmi/Makefile b/samples/qmi/Makefile new file mode 100644 index 000000000000..641943d40c4a --- /dev/null +++ b/samples/qmi/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi_sample_client.o diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c new file mode 100644 index 000000000000..d1814582319b --- /dev/null +++ b/samples/qmi/qmi_sample_client.c @@ -0,0 +1,620 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Sample in-kernel QMI client driver + * + * Copyright (c) 2013-2014, The Linux Foundation. All rights reserved. + * Copyright (C) 2017 Linaro Ltd. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/platform_device.h> +#include <linux/qrtr.h> +#include <linux/net.h> +#include <linux/completion.h> +#include <linux/idr.h> +#include <linux/string.h> +#include <net/sock.h> +#include <linux/soc/qcom/qmi.h> + +#define PING_REQ1_TLV_TYPE 0x1 +#define PING_RESP1_TLV_TYPE 0x2 +#define PING_OPT1_TLV_TYPE 0x10 +#define PING_OPT2_TLV_TYPE 0x11 + +#define DATA_REQ1_TLV_TYPE 0x1 +#define DATA_RESP1_TLV_TYPE 0x2 +#define DATA_OPT1_TLV_TYPE 0x10 +#define DATA_OPT2_TLV_TYPE 0x11 + +#define TEST_MED_DATA_SIZE_V01 8192 +#define TEST_MAX_NAME_SIZE_V01 255 + +#define TEST_PING_REQ_MSG_ID_V01 0x20 +#define TEST_DATA_REQ_MSG_ID_V01 0x21 + +#define TEST_PING_REQ_MAX_MSG_LEN_V01 266 +#define TEST_DATA_REQ_MAX_MSG_LEN_V01 8456 + +struct test_name_type_v01 { + u32 name_len; + char name[TEST_MAX_NAME_SIZE_V01]; +}; + +static const struct qmi_elem_info test_name_type_v01_ei[] = { + { + .data_type = QMI_DATA_LEN, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = QMI_COMMON_TLV_TYPE, + .offset = offsetof(struct test_name_type_v01, + name_len), + }, + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = TEST_MAX_NAME_SIZE_V01, + .elem_size = sizeof(char), + .array_type = VAR_LEN_ARRAY, + .tlv_type = QMI_COMMON_TLV_TYPE, + .offset = offsetof(struct test_name_type_v01, + name), + }, + {} +}; + +struct test_ping_req_msg_v01 { + char ping[4]; + + u8 client_name_valid; + struct test_name_type_v01 client_name; +}; + +static const struct qmi_elem_info test_ping_req_msg_v01_ei[] = { + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = 4, + .elem_size = sizeof(char), + .array_type = STATIC_ARRAY, + .tlv_type = PING_REQ1_TLV_TYPE, + .offset = offsetof(struct test_ping_req_msg_v01, + ping), + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT1_TLV_TYPE, + .offset = offsetof(struct test_ping_req_msg_v01, + client_name_valid), + }, + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct test_name_type_v01), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT1_TLV_TYPE, + .offset = offsetof(struct test_ping_req_msg_v01, + client_name), + .ei_array = test_name_type_v01_ei, + }, + {} +}; + +struct test_ping_resp_msg_v01 { + struct qmi_response_type_v01 resp; + + u8 pong_valid; + char pong[4]; + + u8 service_name_valid; + struct test_name_type_v01 service_name; +}; + +static const struct qmi_elem_info test_ping_resp_msg_v01_ei[] = { + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct qmi_response_type_v01), + .array_type = NO_ARRAY, + .tlv_type = PING_RESP1_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + resp), + .ei_array = qmi_response_type_v01_ei, + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT1_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + pong_valid), + }, + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = 4, + .elem_size = sizeof(char), + .array_type = STATIC_ARRAY, + .tlv_type = PING_OPT1_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + pong), + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT2_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + service_name_valid), + }, + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct test_name_type_v01), + .array_type = NO_ARRAY, + .tlv_type = PING_OPT2_TLV_TYPE, + .offset = offsetof(struct test_ping_resp_msg_v01, + service_name), + .ei_array = test_name_type_v01_ei, + }, + {} +}; + +struct test_data_req_msg_v01 { + u32 data_len; + u8 data[TEST_MED_DATA_SIZE_V01]; + + u8 client_name_valid; + struct test_name_type_v01 client_name; +}; + +static const struct qmi_elem_info test_data_req_msg_v01_ei[] = { + { + .data_type = QMI_DATA_LEN, + .elem_len = 1, + .elem_size = sizeof(u32), + .array_type = NO_ARRAY, + .tlv_type = DATA_REQ1_TLV_TYPE, + .offset = offsetof(struct test_data_req_msg_v01, + data_len), + }, + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = TEST_MED_DATA_SIZE_V01, + .elem_size = sizeof(u8), + .array_type = VAR_LEN_ARRAY, + .tlv_type = DATA_REQ1_TLV_TYPE, + .offset = offsetof(struct test_data_req_msg_v01, + data), + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_req_msg_v01, + client_name_valid), + }, + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct test_name_type_v01), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_req_msg_v01, + client_name), + .ei_array = test_name_type_v01_ei, + }, + {} +}; + +struct test_data_resp_msg_v01 { + struct qmi_response_type_v01 resp; + + u8 data_valid; + u32 data_len; + u8 data[TEST_MED_DATA_SIZE_V01]; + + u8 service_name_valid; + struct test_name_type_v01 service_name; +}; + +static const struct qmi_elem_info test_data_resp_msg_v01_ei[] = { + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct qmi_response_type_v01), + .array_type = NO_ARRAY, + .tlv_type = DATA_RESP1_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + resp), + .ei_array = qmi_response_type_v01_ei, + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + data_valid), + }, + { + .data_type = QMI_DATA_LEN, + .elem_len = 1, + .elem_size = sizeof(u32), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + data_len), + }, + { + .data_type = QMI_UNSIGNED_1_BYTE, + .elem_len = TEST_MED_DATA_SIZE_V01, + .elem_size = sizeof(u8), + .array_type = VAR_LEN_ARRAY, + .tlv_type = DATA_OPT1_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + data), + }, + { + .data_type = QMI_OPT_FLAG, + .elem_len = 1, + .elem_size = sizeof(u8), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT2_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + service_name_valid), + }, + { + .data_type = QMI_STRUCT, + .elem_len = 1, + .elem_size = sizeof(struct test_name_type_v01), + .array_type = NO_ARRAY, + .tlv_type = DATA_OPT2_TLV_TYPE, + .offset = offsetof(struct test_data_resp_msg_v01, + service_name), + .ei_array = test_name_type_v01_ei, + }, + {} +}; + +/* + * ping_write() - ping_pong debugfs file write handler + * @file: debugfs file context + * @user_buf: reference to the user data (ignored) + * @count: number of bytes in @user_buf + * @ppos: offset in @file to write + * + * This function allows user space to send out a ping_pong QMI encoded message + * to the associated remote test service and will return with the result of the + * transaction. It serves as an example of how to provide a custom response + * handler. + * + * Return: @count, or negative errno on failure. + */ +static ssize_t ping_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct qmi_handle *qmi = file->private_data; + struct test_ping_req_msg_v01 req = {}; + struct qmi_txn txn; + int ret; + + memcpy(req.ping, "ping", sizeof(req.ping)); + + ret = qmi_txn_init(qmi, &txn, NULL, NULL); + if (ret < 0) + return ret; + + ret = qmi_send_request(qmi, NULL, &txn, + TEST_PING_REQ_MSG_ID_V01, + TEST_PING_REQ_MAX_MSG_LEN_V01, + test_ping_req_msg_v01_ei, &req); + if (ret < 0) { + qmi_txn_cancel(&txn); + return ret; + } + + ret = qmi_txn_wait(&txn, 5 * HZ); + if (ret < 0) + count = ret; + + return count; +} + +static const struct file_operations ping_fops = { + .open = simple_open, + .write = ping_write, +}; + +static void ping_pong_cb(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, + struct qmi_txn *txn, const void *data) +{ + const struct test_ping_resp_msg_v01 *resp = data; + + if (!txn) { + pr_err("spurious ping response\n"); + return; + } + + if (resp->resp.result == QMI_RESULT_FAILURE_V01) + txn->result = -ENXIO; + else if (!resp->pong_valid || memcmp(resp->pong, "pong", 4)) + txn->result = -EINVAL; + + complete(&txn->completion); +} + +/* + * data_write() - data debugfs file write handler + * @file: debugfs file context + * @user_buf: reference to the user data + * @count: number of bytes in @user_buf + * @ppos: offset in @file to write + * + * This function allows user space to send out a data QMI encoded message to + * the associated remote test service and will return with the result of the + * transaction. It serves as an example of how to have the QMI helpers decode a + * transaction response into a provided object automatically. + * + * Return: @count, or negative errno on failure. + */ +static ssize_t data_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) + +{ + struct qmi_handle *qmi = file->private_data; + struct test_data_resp_msg_v01 *resp; + struct test_data_req_msg_v01 *req; + struct qmi_txn txn; + int ret; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + resp = kzalloc(sizeof(*resp), GFP_KERNEL); + if (!resp) { + kfree(req); + return -ENOMEM; + } + + req->data_len = min_t(size_t, sizeof(req->data), count); + if (copy_from_user(req->data, user_buf, req->data_len)) { + ret = -EFAULT; + goto out; + } + + ret = qmi_txn_init(qmi, &txn, test_data_resp_msg_v01_ei, resp); + if (ret < 0) + goto out; + + ret = qmi_send_request(qmi, NULL, &txn, + TEST_DATA_REQ_MSG_ID_V01, + TEST_DATA_REQ_MAX_MSG_LEN_V01, + test_data_req_msg_v01_ei, req); + if (ret < 0) { + qmi_txn_cancel(&txn); + goto out; + } + + ret = qmi_txn_wait(&txn, 5 * HZ); + if (ret < 0) { + goto out; + } else if (!resp->data_valid || + resp->data_len != req->data_len || + memcmp(resp->data, req->data, req->data_len)) { + pr_err("response data doesn't match expectation\n"); + ret = -EINVAL; + goto out; + } + + ret = count; + +out: + kfree(resp); + kfree(req); + + return ret; +} + +static const struct file_operations data_fops = { + .open = simple_open, + .write = data_write, +}; + +static const struct qmi_msg_handler qmi_sample_handlers[] = { + { + .type = QMI_RESPONSE, + .msg_id = TEST_PING_REQ_MSG_ID_V01, + .ei = test_ping_resp_msg_v01_ei, + .decoded_size = sizeof(struct test_ping_req_msg_v01), + .fn = ping_pong_cb + }, + {} +}; + +struct qmi_sample { + struct qmi_handle qmi; + + struct dentry *de_dir; + struct dentry *de_data; + struct dentry *de_ping; +}; + +static struct dentry *qmi_debug_dir; + +static int qmi_sample_probe(struct platform_device *pdev) +{ + struct sockaddr_qrtr *sq; + struct qmi_sample *sample; + char path[20]; + int ret; + + sample = devm_kzalloc(&pdev->dev, sizeof(*sample), GFP_KERNEL); + if (!sample) + return -ENOMEM; + + ret = qmi_handle_init(&sample->qmi, TEST_DATA_REQ_MAX_MSG_LEN_V01, + NULL, + qmi_sample_handlers); + if (ret < 0) + return ret; + + sq = dev_get_platdata(&pdev->dev); + ret = kernel_connect(sample->qmi.sock, (struct sockaddr_unsized *)sq, + sizeof(*sq), 0); + if (ret < 0) { + pr_err("failed to connect to remote service port\n"); + goto err_release_qmi_handle; + } + + snprintf(path, sizeof(path), "%d:%d", sq->sq_node, sq->sq_port); + + sample->de_dir = debugfs_create_dir(path, qmi_debug_dir); + if (IS_ERR(sample->de_dir)) { + ret = PTR_ERR(sample->de_dir); + goto err_release_qmi_handle; + } + + sample->de_data = debugfs_create_file("data", 0600, sample->de_dir, + sample, &data_fops); + if (IS_ERR(sample->de_data)) { + ret = PTR_ERR(sample->de_data); + goto err_remove_de_dir; + } + + sample->de_ping = debugfs_create_file("ping", 0600, sample->de_dir, + sample, &ping_fops); + if (IS_ERR(sample->de_ping)) { + ret = PTR_ERR(sample->de_ping); + goto err_remove_de_data; + } + + platform_set_drvdata(pdev, sample); + + return 0; + +err_remove_de_data: + debugfs_remove(sample->de_data); +err_remove_de_dir: + debugfs_remove(sample->de_dir); +err_release_qmi_handle: + qmi_handle_release(&sample->qmi); + + return ret; +} + +static void qmi_sample_remove(struct platform_device *pdev) +{ + struct qmi_sample *sample = platform_get_drvdata(pdev); + + debugfs_remove(sample->de_ping); + debugfs_remove(sample->de_data); + debugfs_remove(sample->de_dir); + + qmi_handle_release(&sample->qmi); +} + +static struct platform_driver qmi_sample_driver = { + .probe = qmi_sample_probe, + .remove = qmi_sample_remove, + .driver = { + .name = "qmi_sample_client", + }, +}; + +static int qmi_sample_new_server(struct qmi_handle *qmi, + struct qmi_service *service) +{ + struct platform_device *pdev; + struct sockaddr_qrtr sq = { AF_QIPCRTR, service->node, service->port }; + int ret; + + pdev = platform_device_alloc("qmi_sample_client", PLATFORM_DEVID_AUTO); + if (!pdev) + return -ENOMEM; + + ret = platform_device_add_data(pdev, &sq, sizeof(sq)); + if (ret) + goto err_put_device; + + ret = platform_device_add(pdev); + if (ret) + goto err_put_device; + + service->priv = pdev; + + return 0; + +err_put_device: + platform_device_put(pdev); + + return ret; +} + +static void qmi_sample_del_server(struct qmi_handle *qmi, + struct qmi_service *service) +{ + struct platform_device *pdev = service->priv; + + platform_device_unregister(pdev); +} + +static struct qmi_handle lookup_client; + +static const struct qmi_ops lookup_ops = { + .new_server = qmi_sample_new_server, + .del_server = qmi_sample_del_server, +}; + +static int qmi_sample_init(void) +{ + int ret; + + qmi_debug_dir = debugfs_create_dir("qmi_sample", NULL); + if (IS_ERR(qmi_debug_dir)) { + pr_err("failed to create qmi_sample dir\n"); + return PTR_ERR(qmi_debug_dir); + } + + ret = platform_driver_register(&qmi_sample_driver); + if (ret) + goto err_remove_debug_dir; + + ret = qmi_handle_init(&lookup_client, 0, &lookup_ops, NULL); + if (ret < 0) + goto err_unregister_driver; + + qmi_add_lookup(&lookup_client, 15, 0, 0); + + return 0; + +err_unregister_driver: + platform_driver_unregister(&qmi_sample_driver); +err_remove_debug_dir: + debugfs_remove(qmi_debug_dir); + + return ret; +} + +static void qmi_sample_exit(void) +{ + qmi_handle_release(&lookup_client); + + platform_driver_unregister(&qmi_sample_driver); + + debugfs_remove(qmi_debug_dir); +} + +module_init(qmi_sample_init); +module_exit(qmi_sample_exit); + +MODULE_DESCRIPTION("Sample QMI client driver"); +MODULE_LICENSE("GPL v2"); diff --git a/samples/rpmsg/Makefile b/samples/rpmsg/Makefile index 2d4973c69663..ddf9a5d13cad 100644 --- a/samples/rpmsg/Makefile +++ b/samples/rpmsg/Makefile @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg_client_sample.o diff --git a/samples/rpmsg/rpmsg_client_sample.c b/samples/rpmsg/rpmsg_client_sample.c index f161dfd3e70a..ae5081662283 100644 --- a/samples/rpmsg/rpmsg_client_sample.c +++ b/samples/rpmsg/rpmsg_client_sample.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Remote processor messaging - sample client driver * @@ -6,15 +7,6 @@ * * Ohad Ben-Cohen <ohad@wizery.com> * Brian Swetland <swetland@google.com> - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include <linux/kernel.h> @@ -22,7 +14,9 @@ #include <linux/rpmsg.h> #define MSG "hello world!" -#define MSG_LIMIT 100 + +static int count = 100; +module_param(count, int, 0644); struct instance_data { int rx_count; @@ -37,11 +31,11 @@ static int rpmsg_sample_cb(struct rpmsg_device *rpdev, void *data, int len, dev_info(&rpdev->dev, "incoming msg %d (src: 0x%x)\n", ++idata->rx_count, src); - print_hex_dump(KERN_DEBUG, __func__, DUMP_PREFIX_NONE, 16, 1, - data, len, true); + print_hex_dump_debug(__func__, DUMP_PREFIX_NONE, 16, 1, data, len, + true); /* samples should not live forever */ - if (idata->rx_count >= MSG_LIMIT) { + if (idata->rx_count >= count) { dev_info(&rpdev->dev, "goodbye!\n"); return 0; } diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig new file mode 100644 index 000000000000..3efa51bfc8ef --- /dev/null +++ b/samples/rust/Kconfig @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: GPL-2.0 + +menuconfig SAMPLES_RUST + bool "Rust samples" + depends on RUST + help + You can build sample Rust kernel code here. + + If unsure, say N. + +if SAMPLES_RUST + +config SAMPLE_RUST_CONFIGFS + tristate "Configfs sample" + depends on CONFIGFS_FS + help + This option builds the Rust configfs sample. + + To compile this as a module, choose M here: + the module will be called rust_configfs. + + If unsure, say N. + +config SAMPLE_RUST_MINIMAL + tristate "Minimal" + help + This option builds the Rust minimal module sample. + + To compile this as a module, choose M here: + the module will be called rust_minimal. + + If unsure, say N. + +config SAMPLE_RUST_MISC_DEVICE + tristate "Misc device" + help + This option builds the Rust misc device. + + To compile this as a module, choose M here: + the module will be called rust_misc_device. + + If unsure, say N. + +config SAMPLE_RUST_PRINT + tristate "Printing macros" + help + This option builds the Rust printing macros sample. + + To compile this as a module, choose M here: + the module will be called rust_print. + + If unsure, say N. + +config SAMPLE_RUST_DMA + tristate "DMA Test Driver" + depends on PCI + help + This option builds the Rust DMA Test driver sample. + + To compile this as a module, choose M here: + the module will be called rust_dma. + + If unsure, say N. + +config SAMPLE_RUST_DEBUGFS + tristate "DebugFS Test Module" + depends on DEBUG_FS + help + This option builds the Rust DebugFS Test module sample. + + To compile this as a module, choose M here: + the module will be called rust_debugfs. + + If unsure, say N. + +config SAMPLE_RUST_DEBUGFS_SCOPED + tristate "Scoped DebugFS Test Module" + depends on DEBUG_FS + help + This option builds the Rust Scoped DebugFS Test module sample. + + To compile this as a module, choose M here: + the module will be called rust_debugfs_scoped. + + If unsure, say N. + +config SAMPLE_RUST_DRIVER_I2C + tristate "I2C Driver" + depends on I2C=y + help + This option builds the Rust I2C driver sample. + + To compile this as a module, choose M here: + the module will be called rust_driver_i2c. + + If unsure, say N. + +config SAMPLE_RUST_I2C_CLIENT + tristate "I2C Client Registration" + depends on I2C=y + help + This option builds the Rust I2C client manual creation + sample. + + To compile this as a module, choose M here: + the module will be called rust_i2c_client. + + If unsure, say N. + +config SAMPLE_RUST_DRIVER_PCI + tristate "PCI Driver" + depends on PCI + help + This option builds the Rust PCI driver sample. + + To compile this as a module, choose M here: + the module will be called rust_driver_pci. + + If unsure, say N. + +config SAMPLE_RUST_DRIVER_PLATFORM + tristate "Platform Driver" + help + This option builds the Rust Platform driver sample. + + To compile this as a module, choose M here: + the module will be called rust_driver_platform. + + If unsure, say N. + +config SAMPLE_RUST_DRIVER_USB + tristate "USB Driver" + depends on USB = y + help + This option builds the Rust USB driver sample. + + To compile this as a module, choose M here: + the module will be called rust_driver_usb. + + If unsure, say N. + +config SAMPLE_RUST_DRIVER_FAUX + tristate "Faux Driver" + help + This option builds the Rust Faux driver sample. + + To compile this as a module, choose M here: + the module will be called rust_driver_faux. + + If unsure, say N. + +config SAMPLE_RUST_DRIVER_AUXILIARY + tristate "Auxiliary Driver" + depends on PCI + select AUXILIARY_BUS + help + This option builds the Rust auxiliary driver sample. + + To compile this as a module, choose M here: + the module will be called rust_driver_auxiliary. + + If unsure, say N. + +config SAMPLE_RUST_HOSTPROGS + bool "Host programs" + help + This option builds the Rust host program samples. + + If unsure, say N. + +endif # SAMPLES_RUST diff --git a/samples/rust/Makefile b/samples/rust/Makefile new file mode 100644 index 000000000000..f65885d1d62b --- /dev/null +++ b/samples/rust/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I$(src) # needed for trace events + +obj-$(CONFIG_SAMPLE_RUST_MINIMAL) += rust_minimal.o +obj-$(CONFIG_SAMPLE_RUST_MISC_DEVICE) += rust_misc_device.o +obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o +obj-$(CONFIG_SAMPLE_RUST_DEBUGFS) += rust_debugfs.o +obj-$(CONFIG_SAMPLE_RUST_DEBUGFS_SCOPED) += rust_debugfs_scoped.o +obj-$(CONFIG_SAMPLE_RUST_DMA) += rust_dma.o +obj-$(CONFIG_SAMPLE_RUST_DRIVER_I2C) += rust_driver_i2c.o +obj-$(CONFIG_SAMPLE_RUST_I2C_CLIENT) += rust_i2c_client.o +obj-$(CONFIG_SAMPLE_RUST_DRIVER_PCI) += rust_driver_pci.o +obj-$(CONFIG_SAMPLE_RUST_DRIVER_PLATFORM) += rust_driver_platform.o +obj-$(CONFIG_SAMPLE_RUST_DRIVER_USB) += rust_driver_usb.o +obj-$(CONFIG_SAMPLE_RUST_DRIVER_FAUX) += rust_driver_faux.o +obj-$(CONFIG_SAMPLE_RUST_DRIVER_AUXILIARY) += rust_driver_auxiliary.o +obj-$(CONFIG_SAMPLE_RUST_CONFIGFS) += rust_configfs.o + +rust_print-y := rust_print_main.o rust_print_events.o + +subdir-$(CONFIG_SAMPLE_RUST_HOSTPROGS) += hostprogs diff --git a/samples/rust/hostprogs/.gitignore b/samples/rust/hostprogs/.gitignore new file mode 100644 index 000000000000..a6c173da5048 --- /dev/null +++ b/samples/rust/hostprogs/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +single diff --git a/samples/rust/hostprogs/Makefile b/samples/rust/hostprogs/Makefile new file mode 100644 index 000000000000..8ddcbd7416db --- /dev/null +++ b/samples/rust/hostprogs/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +hostprogs-always-y := single + +single-rust := y diff --git a/samples/rust/hostprogs/a.rs b/samples/rust/hostprogs/a.rs new file mode 100644 index 000000000000..f7a4a3d0f4e0 --- /dev/null +++ b/samples/rust/hostprogs/a.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample: module `a`. + +pub(crate) fn f(x: i32) { + println!("The number is {}.", x); +} diff --git a/samples/rust/hostprogs/b.rs b/samples/rust/hostprogs/b.rs new file mode 100644 index 000000000000..c1675890648f --- /dev/null +++ b/samples/rust/hostprogs/b.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample: module `b`. + +pub(crate) const CONSTANT: i32 = 42; diff --git a/samples/rust/hostprogs/single.rs b/samples/rust/hostprogs/single.rs new file mode 100644 index 000000000000..8c48a119339a --- /dev/null +++ b/samples/rust/hostprogs/single.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample. + +mod a; +mod b; + +fn main() { + println!("Hello world!"); + + a::f(b::CONSTANT); +} diff --git a/samples/rust/rust_configfs.rs b/samples/rust/rust_configfs.rs new file mode 100644 index 000000000000..0ccc7553ef39 --- /dev/null +++ b/samples/rust/rust_configfs.rs @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust configfs sample. + +use kernel::alloc::flags; +use kernel::c_str; +use kernel::configfs; +use kernel::configfs::configfs_attrs; +use kernel::new_mutex; +use kernel::page::PAGE_SIZE; +use kernel::prelude::*; +use kernel::sync::Mutex; + +module! { + type: RustConfigfs, + name: "rust_configfs", + authors: ["Rust for Linux Contributors"], + description: "Rust configfs sample", + license: "GPL", +} + +#[pin_data] +struct RustConfigfs { + #[pin] + config: configfs::Subsystem<Configuration>, +} + +#[pin_data] +struct Configuration { + message: &'static CStr, + #[pin] + bar: Mutex<(KBox<[u8; PAGE_SIZE]>, usize)>, +} + +impl Configuration { + fn new() -> impl PinInit<Self, Error> { + try_pin_init!(Self { + message: c_str!("Hello World\n"), + bar <- new_mutex!((KBox::new([0; PAGE_SIZE], flags::GFP_KERNEL)?, 0)), + }) + } +} + +impl kernel::InPlaceModule for RustConfigfs { + fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> { + pr_info!("Rust configfs sample (init)\n"); + + // Define a subsystem with the data type `Configuration`, two + // attributes, `message` and `bar` and child group type `Child`. `mkdir` + // in the directory representing this subsystem will create directories + // backed by the `Child` type. + let item_type = configfs_attrs! { + container: configfs::Subsystem<Configuration>, + data: Configuration, + child: Child, + attributes: [ + message: 0, + bar: 1, + ], + }; + + try_pin_init!(Self { + config <- configfs::Subsystem::new( + c_str!("rust_configfs"), item_type, Configuration::new() + ), + }) + } +} + +#[vtable] +impl configfs::GroupOperations for Configuration { + type Child = Child; + + fn make_group(&self, name: &CStr) -> Result<impl PinInit<configfs::Group<Child>, Error>> { + // Define a group with data type `Child`, one attribute `baz` and child + // group type `GrandChild`. `mkdir` in the directory representing this + // group will create directories backed by the `GrandChild` type. + let tpe = configfs_attrs! { + container: configfs::Group<Child>, + data: Child, + child: GrandChild, + attributes: [ + baz: 0, + ], + }; + + Ok(configfs::Group::new(name.try_into()?, tpe, Child::new())) + } +} + +#[vtable] +impl configfs::AttributeOperations<0> for Configuration { + type Data = Configuration; + + fn show(container: &Configuration, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + pr_info!("Show message\n"); + let data = container.message.to_bytes(); + page[0..data.len()].copy_from_slice(data); + Ok(data.len()) + } +} + +#[vtable] +impl configfs::AttributeOperations<1> for Configuration { + type Data = Configuration; + + fn show(container: &Configuration, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + pr_info!("Show bar\n"); + let guard = container.bar.lock(); + let data = guard.0.as_slice(); + let len = guard.1; + page[0..len].copy_from_slice(&data[0..len]); + Ok(len) + } + + fn store(container: &Configuration, page: &[u8]) -> Result { + pr_info!("Store bar\n"); + let mut guard = container.bar.lock(); + guard.0[0..page.len()].copy_from_slice(page); + guard.1 = page.len(); + Ok(()) + } +} + +// `pin_data` cannot handle structs without braces. +#[pin_data] +struct Child {} + +impl Child { + fn new() -> impl PinInit<Self, Error> { + try_pin_init!(Self {}) + } +} + +#[vtable] +impl configfs::GroupOperations for Child { + type Child = GrandChild; + + fn make_group(&self, name: &CStr) -> Result<impl PinInit<configfs::Group<GrandChild>, Error>> { + // Define a group with data type `GrandChild`, one attribute `gc`. As no + // child type is specified, it will not be possible to create subgroups + // in this group, and `mkdir`in the directory representing this group + // will return an error. + let tpe = configfs_attrs! { + container: configfs::Group<GrandChild>, + data: GrandChild, + attributes: [ + gc: 0, + ], + }; + + Ok(configfs::Group::new( + name.try_into()?, + tpe, + GrandChild::new(), + )) + } +} + +#[vtable] +impl configfs::AttributeOperations<0> for Child { + type Data = Child; + + fn show(_container: &Child, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + pr_info!("Show baz\n"); + let data = c"Hello Baz\n".to_bytes(); + page[0..data.len()].copy_from_slice(data); + Ok(data.len()) + } +} + +// `pin_data` cannot handle structs without braces. +#[pin_data] +struct GrandChild {} + +impl GrandChild { + fn new() -> impl PinInit<Self, Error> { + try_pin_init!(Self {}) + } +} + +#[vtable] +impl configfs::AttributeOperations<0> for GrandChild { + type Data = GrandChild; + + fn show(_container: &GrandChild, page: &mut [u8; PAGE_SIZE]) -> Result<usize> { + pr_info!("Show grand child\n"); + let data = c"Hello GC\n".to_bytes(); + page[0..data.len()].copy_from_slice(data); + Ok(data.len()) + } +} diff --git a/samples/rust/rust_debugfs.rs b/samples/rust/rust_debugfs.rs new file mode 100644 index 000000000000..025e8f9d12de --- /dev/null +++ b/samples/rust/rust_debugfs.rs @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Sample DebugFS exporting platform driver +//! +//! To successfully probe this driver with ACPI, use an ssdt that looks like +//! +//! ```dsl +//! DefinitionBlock ("", "SSDT", 2, "TEST", "VIRTACPI", 0x00000001) +//! { +//! Scope (\_SB) +//! { +//! Device (T432) +//! { +//! Name (_HID, "LNUXBEEF") // ACPI hardware ID to match +//! Name (_UID, 1) +//! Name (_STA, 0x0F) // Device present, enabled +//! Name (_DSD, Package () { // Sample attribute +//! ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), +//! Package() { +//! Package(2) {"compatible", "sample-debugfs"} +//! } +//! }) +//! Name (_CRS, ResourceTemplate () +//! { +//! Memory32Fixed (ReadWrite, 0xFED00000, 0x1000) +//! }) +//! } +//! } +//! } +//! ``` + +use core::str::FromStr; +use kernel::c_str; +use kernel::debugfs::{Dir, File}; +use kernel::new_mutex; +use kernel::prelude::*; +use kernel::sizes::*; +use kernel::sync::atomic::{Atomic, Relaxed}; +use kernel::sync::Mutex; +use kernel::{acpi, device::Core, of, platform, str::CString, types::ARef}; + +kernel::module_platform_driver! { + type: RustDebugFs, + name: "rust_debugfs", + authors: ["Matthew Maurer"], + description: "Rust DebugFS usage sample", + license: "GPL", +} + +#[pin_data] +struct RustDebugFs { + pdev: ARef<platform::Device>, + // As we only hold these for drop effect (to remove the directory/files) we have a leading + // underscore to indicate to the compiler that we don't expect to use this field directly. + _debugfs: Dir, + #[pin] + _compatible: File<CString>, + #[pin] + counter: File<Atomic<usize>>, + #[pin] + inner: File<Mutex<Inner>>, + #[pin] + array_blob: File<Mutex<[u8; 4]>>, + #[pin] + vector_blob: File<Mutex<KVec<u8>>>, +} + +#[derive(Debug)] +struct Inner { + x: u32, + y: u32, +} + +impl FromStr for Inner { + type Err = Error; + fn from_str(s: &str) -> Result<Self> { + let mut parts = s.split_whitespace(); + let x = parts + .next() + .ok_or(EINVAL)? + .parse::<u32>() + .map_err(|_| EINVAL)?; + let y = parts + .next() + .ok_or(EINVAL)? + .parse::<u32>() + .map_err(|_| EINVAL)?; + if parts.next().is_some() { + return Err(EINVAL); + } + Ok(Inner { x, y }) + } +} + +kernel::acpi_device_table!( + ACPI_TABLE, + MODULE_ACPI_TABLE, + <RustDebugFs as platform::Driver>::IdInfo, + [(acpi::DeviceId::new(c_str!("LNUXBEEF")), ())] +); + +impl platform::Driver for RustDebugFs { + type IdInfo = (); + const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = None; + const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE); + + fn probe( + pdev: &platform::Device<Core>, + _info: Option<&Self::IdInfo>, + ) -> impl PinInit<Self, Error> { + RustDebugFs::new(pdev).pin_chain(|this| { + this.counter.store(91, Relaxed); + { + let mut guard = this.inner.lock(); + guard.x = guard.y; + guard.y = 42; + } + + Ok(()) + }) + } +} + +impl RustDebugFs { + fn build_counter(dir: &Dir) -> impl PinInit<File<Atomic<usize>>> + '_ { + dir.read_write_file(c_str!("counter"), Atomic::<usize>::new(0)) + } + + fn build_inner(dir: &Dir) -> impl PinInit<File<Mutex<Inner>>> + '_ { + dir.read_write_file(c_str!("pair"), new_mutex!(Inner { x: 3, y: 10 })) + } + + fn new(pdev: &platform::Device<Core>) -> impl PinInit<Self, Error> + '_ { + let debugfs = Dir::new(c_str!("sample_debugfs")); + let dev = pdev.as_ref(); + + try_pin_init! { + Self { + _compatible <- debugfs.read_only_file( + c_str!("compatible"), + dev.fwnode() + .ok_or(ENOENT)? + .property_read::<CString>(c_str!("compatible")) + .required_by(dev)?, + ), + counter <- Self::build_counter(&debugfs), + inner <- Self::build_inner(&debugfs), + array_blob <- debugfs.read_write_binary_file( + c_str!("array_blob"), + new_mutex!([0x62, 0x6c, 0x6f, 0x62]), + ), + vector_blob <- debugfs.read_write_binary_file( + c_str!("vector_blob"), + new_mutex!(kernel::kvec!(0x42; SZ_4K)?), + ), + _debugfs: debugfs, + pdev: pdev.into(), + } + } + } +} diff --git a/samples/rust/rust_debugfs_scoped.rs b/samples/rust/rust_debugfs_scoped.rs new file mode 100644 index 000000000000..702a6546d3fb --- /dev/null +++ b/samples/rust/rust_debugfs_scoped.rs @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Sample DebugFS exporting platform driver that demonstrates the use of +//! `Scope::dir` to create a variety of files without the need to separately +//! track them all. + +use kernel::debugfs::{Dir, Scope}; +use kernel::prelude::*; +use kernel::sizes::*; +use kernel::sync::atomic::Atomic; +use kernel::sync::Mutex; +use kernel::{c_str, new_mutex, str::CString}; + +module! { + type: RustScopedDebugFs, + name: "rust_debugfs_scoped", + authors: ["Matthew Maurer"], + description: "Rust Scoped DebugFS usage sample", + license: "GPL", +} + +fn remove_file_write( + mod_data: &ModuleData, + reader: &mut kernel::uaccess::UserSliceReader, +) -> Result { + let mut buf = [0u8; 128]; + if reader.len() >= buf.len() { + return Err(EINVAL); + } + let n = reader.len(); + reader.read_slice(&mut buf[..n])?; + + let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?.trim(); + let nul_idx = s.len(); + buf[nul_idx] = 0; + let to_remove = CStr::from_bytes_with_nul(&buf[..nul_idx + 1]).map_err(|_| EINVAL)?; + mod_data + .devices + .lock() + .retain(|device| device.name.to_bytes() != to_remove.to_bytes()); + Ok(()) +} + +fn create_file_write( + mod_data: &ModuleData, + reader: &mut kernel::uaccess::UserSliceReader, +) -> Result { + let mut buf = [0u8; 128]; + if reader.len() > buf.len() { + return Err(EINVAL); + } + let n = reader.len(); + reader.read_slice(&mut buf[..n])?; + + let mut nums = KVec::new(); + + let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?.trim(); + let mut items = s.split_whitespace(); + let name_str = items.next().ok_or(EINVAL)?; + let name = CString::try_from_fmt(fmt!("{name_str}"))?; + let file_name = CString::try_from_fmt(fmt!("{name_str}"))?; + for sub in items { + nums.push( + Atomic::<usize>::new(sub.parse().map_err(|_| EINVAL)?), + GFP_KERNEL, + )?; + } + let blob = KBox::pin_init(new_mutex!([0x42; SZ_4K]), GFP_KERNEL)?; + + let scope = KBox::pin_init( + mod_data.device_dir.scope( + DeviceData { name, nums, blob }, + &file_name, + |dev_data, dir| { + for (idx, val) in dev_data.nums.iter().enumerate() { + let Ok(name) = CString::try_from_fmt(fmt!("{idx}")) else { + return; + }; + dir.read_write_file(&name, val); + } + dir.read_write_binary_file(c_str!("blob"), &dev_data.blob); + }, + ), + GFP_KERNEL, + )?; + (*mod_data.devices.lock()).push(scope, GFP_KERNEL)?; + + Ok(()) +} + +struct RustScopedDebugFs { + _data: Pin<KBox<Scope<ModuleData>>>, +} + +#[pin_data] +struct ModuleData { + device_dir: Dir, + #[pin] + devices: Mutex<KVec<Pin<KBox<Scope<DeviceData>>>>>, +} + +impl ModuleData { + fn init(device_dir: Dir) -> impl PinInit<Self> { + pin_init! { + Self { + device_dir: device_dir, + devices <- new_mutex!(KVec::new()) + } + } + } +} + +struct DeviceData { + name: CString, + nums: KVec<Atomic<usize>>, + blob: Pin<KBox<Mutex<[u8; SZ_4K]>>>, +} + +fn init_control(base_dir: &Dir, dyn_dirs: Dir) -> impl PinInit<Scope<ModuleData>> + '_ { + base_dir.scope( + ModuleData::init(dyn_dirs), + c_str!("control"), + |data, dir| { + dir.write_only_callback_file(c_str!("create"), data, &create_file_write); + dir.write_only_callback_file(c_str!("remove"), data, &remove_file_write); + }, + ) +} + +impl kernel::Module for RustScopedDebugFs { + fn init(_module: &'static kernel::ThisModule) -> Result<Self> { + let base_dir = Dir::new(c_str!("rust_scoped_debugfs")); + let dyn_dirs = base_dir.subdir(c_str!("dynamic")); + Ok(Self { + _data: KBox::pin_init(init_control(&base_dir, dyn_dirs), GFP_KERNEL)?, + }) + } +} diff --git a/samples/rust/rust_dma.rs b/samples/rust/rust_dma.rs new file mode 100644 index 000000000000..f53bce2a73e3 --- /dev/null +++ b/samples/rust/rust_dma.rs @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust DMA api test (based on QEMU's `pci-testdev`). +//! +//! To make this driver probe, QEMU must be run with `-device pci-testdev`. + +use kernel::{ + device::Core, + dma::{CoherentAllocation, DataDirection, Device, DmaMask}, + page, pci, + prelude::*, + scatterlist::{Owned, SGTable}, + sync::aref::ARef, +}; + +#[pin_data(PinnedDrop)] +struct DmaSampleDriver { + pdev: ARef<pci::Device>, + ca: CoherentAllocation<MyStruct>, + #[pin] + sgt: SGTable<Owned<VVec<u8>>>, +} + +const TEST_VALUES: [(u32, u32); 5] = [ + (0xa, 0xb), + (0xc, 0xd), + (0xe, 0xf), + (0xab, 0xba), + (0xcd, 0xef), +]; + +struct MyStruct { + h: u32, + b: u32, +} + +impl MyStruct { + fn new(h: u32, b: u32) -> Self { + Self { h, b } + } +} +// SAFETY: All bit patterns are acceptable values for `MyStruct`. +unsafe impl kernel::transmute::AsBytes for MyStruct {} +// SAFETY: Instances of `MyStruct` have no uninitialized portions. +unsafe impl kernel::transmute::FromBytes for MyStruct {} + +kernel::pci_device_table!( + PCI_TABLE, + MODULE_PCI_TABLE, + <DmaSampleDriver as pci::Driver>::IdInfo, + [(pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), ())] +); + +impl pci::Driver for DmaSampleDriver { + type IdInfo = (); + const ID_TABLE: pci::IdTable<Self::IdInfo> = &PCI_TABLE; + + fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, Error> { + pin_init::pin_init_scope(move || { + dev_info!(pdev.as_ref(), "Probe DMA test driver.\n"); + + let mask = DmaMask::new::<64>(); + + // SAFETY: There are no concurrent calls to DMA allocation and mapping primitives. + unsafe { pdev.dma_set_mask_and_coherent(mask)? }; + + let ca: CoherentAllocation<MyStruct> = + CoherentAllocation::alloc_coherent(pdev.as_ref(), TEST_VALUES.len(), GFP_KERNEL)?; + + for (i, value) in TEST_VALUES.into_iter().enumerate() { + kernel::dma_write!(ca[i] = MyStruct::new(value.0, value.1))?; + } + + let size = 4 * page::PAGE_SIZE; + let pages = VVec::with_capacity(size, GFP_KERNEL)?; + + let sgt = SGTable::new(pdev.as_ref(), pages, DataDirection::ToDevice, GFP_KERNEL); + + Ok(try_pin_init!(Self { + pdev: pdev.into(), + ca, + sgt <- sgt, + })) + }) + } +} + +#[pinned_drop] +impl PinnedDrop for DmaSampleDriver { + fn drop(self: Pin<&mut Self>) { + let dev = self.pdev.as_ref(); + + dev_info!(dev, "Unload DMA test driver.\n"); + + for (i, value) in TEST_VALUES.into_iter().enumerate() { + let val0 = kernel::dma_read!(self.ca[i].h); + let val1 = kernel::dma_read!(self.ca[i].b); + assert!(val0.is_ok()); + assert!(val1.is_ok()); + + if let Ok(val0) = val0 { + assert_eq!(val0, value.0); + } + if let Ok(val1) = val1 { + assert_eq!(val1, value.1); + } + } + + for (i, entry) in self.sgt.iter().enumerate() { + dev_info!(dev, "Entry[{}]: DMA address: {:#x}", i, entry.dma_address()); + } + } +} + +kernel::module_pci_driver! { + type: DmaSampleDriver, + name: "rust_dma", + authors: ["Abdiel Janulgue"], + description: "Rust DMA test", + license: "GPL v2", +} diff --git a/samples/rust/rust_driver_auxiliary.rs b/samples/rust/rust_driver_auxiliary.rs new file mode 100644 index 000000000000..5761ea314f44 --- /dev/null +++ b/samples/rust/rust_driver_auxiliary.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust auxiliary driver sample (based on a PCI driver for QEMU's `pci-testdev`). +//! +//! To make this driver probe, QEMU must be run with `-device pci-testdev`. + +use kernel::{ + auxiliary, c_str, + device::{Bound, Core}, + devres::Devres, + driver, + error::Error, + pci, + prelude::*, + InPlaceModule, +}; + +use core::any::TypeId; +use pin_init::PinInit; + +const MODULE_NAME: &CStr = <LocalModule as kernel::ModuleMetadata>::NAME; +const AUXILIARY_NAME: &CStr = c_str!("auxiliary"); + +struct AuxiliaryDriver; + +kernel::auxiliary_device_table!( + AUX_TABLE, + MODULE_AUX_TABLE, + <AuxiliaryDriver as auxiliary::Driver>::IdInfo, + [(auxiliary::DeviceId::new(MODULE_NAME, AUXILIARY_NAME), ())] +); + +impl auxiliary::Driver for AuxiliaryDriver { + type IdInfo = (); + + const ID_TABLE: auxiliary::IdTable<Self::IdInfo> = &AUX_TABLE; + + fn probe(adev: &auxiliary::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, Error> { + dev_info!( + adev.as_ref(), + "Probing auxiliary driver for auxiliary device with id={}\n", + adev.id() + ); + + ParentDriver::connect(adev)?; + + Ok(Self) + } +} + +#[pin_data] +struct ParentDriver { + private: TypeId, + #[pin] + _reg0: Devres<auxiliary::Registration>, + #[pin] + _reg1: Devres<auxiliary::Registration>, +} + +kernel::pci_device_table!( + PCI_TABLE, + MODULE_PCI_TABLE, + <ParentDriver as pci::Driver>::IdInfo, + [(pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), ())] +); + +impl pci::Driver for ParentDriver { + type IdInfo = (); + + const ID_TABLE: pci::IdTable<Self::IdInfo> = &PCI_TABLE; + + fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, Error> { + try_pin_init!(Self { + private: TypeId::of::<Self>(), + _reg0 <- auxiliary::Registration::new(pdev.as_ref(), AUXILIARY_NAME, 0, MODULE_NAME), + _reg1 <- auxiliary::Registration::new(pdev.as_ref(), AUXILIARY_NAME, 1, MODULE_NAME), + }) + } +} + +impl ParentDriver { + fn connect(adev: &auxiliary::Device<Bound>) -> Result { + let dev = adev.parent(); + let pdev: &pci::Device<Bound> = dev.try_into()?; + let drvdata = dev.drvdata::<Self>()?; + + dev_info!( + dev, + "Connect auxiliary {} with parent: VendorID={}, DeviceID={:#x}\n", + adev.id(), + pdev.vendor_id(), + pdev.device_id() + ); + + dev_info!( + dev, + "We have access to the private data of {:?}.\n", + drvdata.private + ); + + Ok(()) + } +} + +#[pin_data] +struct SampleModule { + #[pin] + _pci_driver: driver::Registration<pci::Adapter<ParentDriver>>, + #[pin] + _aux_driver: driver::Registration<auxiliary::Adapter<AuxiliaryDriver>>, +} + +impl InPlaceModule for SampleModule { + fn init(module: &'static kernel::ThisModule) -> impl PinInit<Self, Error> { + try_pin_init!(Self { + _pci_driver <- driver::Registration::new(MODULE_NAME, module), + _aux_driver <- driver::Registration::new(MODULE_NAME, module), + }) + } +} + +module! { + type: SampleModule, + name: "rust_driver_auxiliary", + authors: ["Danilo Krummrich"], + description: "Rust auxiliary driver", + license: "GPL v2", +} diff --git a/samples/rust/rust_driver_faux.rs b/samples/rust/rust_driver_faux.rs new file mode 100644 index 000000000000..ecc9fd378cbd --- /dev/null +++ b/samples/rust/rust_driver_faux.rs @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only + +//! Rust faux device sample. + +use kernel::{c_str, faux, prelude::*, Module}; + +module! { + type: SampleModule, + name: "rust_faux_driver", + authors: ["Lyude Paul"], + description: "Rust faux device sample", + license: "GPL", +} + +struct SampleModule { + _reg: faux::Registration, +} + +impl Module for SampleModule { + fn init(_module: &'static ThisModule) -> Result<Self> { + pr_info!("Initialising Rust Faux Device Sample\n"); + + let reg = faux::Registration::new(c_str!("rust-faux-sample-device"), None)?; + + dev_info!(reg.as_ref(), "Hello from faux device!\n"); + + Ok(Self { _reg: reg }) + } +} diff --git a/samples/rust/rust_driver_i2c.rs b/samples/rust/rust_driver_i2c.rs new file mode 100644 index 000000000000..ecefeca3e22f --- /dev/null +++ b/samples/rust/rust_driver_i2c.rs @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust I2C driver sample. + +use kernel::{ + acpi, + c_str, + device::Core, + i2c, + of, + prelude::*, // +}; + +struct SampleDriver; + +kernel::acpi_device_table! { + ACPI_TABLE, + MODULE_ACPI_TABLE, + <SampleDriver as i2c::Driver>::IdInfo, + [(acpi::DeviceId::new(c_str!("LNUXBEEF")), 0)] +} + +kernel::i2c_device_table! { + I2C_TABLE, + MODULE_I2C_TABLE, + <SampleDriver as i2c::Driver>::IdInfo, + [(i2c::DeviceId::new(c_str!("rust_driver_i2c")), 0)] +} + +kernel::of_device_table! { + OF_TABLE, + MODULE_OF_TABLE, + <SampleDriver as i2c::Driver>::IdInfo, + [(of::DeviceId::new(c_str!("test,rust_driver_i2c")), 0)] +} + +impl i2c::Driver for SampleDriver { + type IdInfo = u32; + + const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE); + const I2C_ID_TABLE: Option<i2c::IdTable<Self::IdInfo>> = Some(&I2C_TABLE); + const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = Some(&OF_TABLE); + + fn probe( + idev: &i2c::I2cClient<Core>, + info: Option<&Self::IdInfo>, + ) -> impl PinInit<Self, Error> { + let dev = idev.as_ref(); + + dev_info!(dev, "Probe Rust I2C driver sample.\n"); + + if let Some(info) = info { + dev_info!(dev, "Probed with info: '{}'.\n", info); + } + + Ok(Self) + } + + fn shutdown(idev: &i2c::I2cClient<Core>, _this: Pin<&Self>) { + dev_info!(idev.as_ref(), "Shutdown Rust I2C driver sample.\n"); + } + + fn unbind(idev: &i2c::I2cClient<Core>, _this: Pin<&Self>) { + dev_info!(idev.as_ref(), "Unbind Rust I2C driver sample.\n"); + } +} + +kernel::module_i2c_driver! { + type: SampleDriver, + name: "rust_driver_i2c", + authors: ["Igor Korotin"], + description: "Rust I2C driver", + license: "GPL v2", +} diff --git a/samples/rust/rust_driver_pci.rs b/samples/rust/rust_driver_pci.rs new file mode 100644 index 000000000000..5823787bea8e --- /dev/null +++ b/samples/rust/rust_driver_pci.rs @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust PCI driver sample (based on QEMU's `pci-testdev`). +//! +//! To make this driver probe, QEMU must be run with `-device pci-testdev`. + +use kernel::{c_str, device::Core, devres::Devres, pci, prelude::*, sync::aref::ARef}; + +struct Regs; + +impl Regs { + const TEST: usize = 0x0; + const OFFSET: usize = 0x4; + const DATA: usize = 0x8; + const COUNT: usize = 0xC; + const END: usize = 0x10; +} + +type Bar0 = pci::Bar<{ Regs::END }>; + +#[derive(Copy, Clone, Debug)] +struct TestIndex(u8); + +impl TestIndex { + const NO_EVENTFD: Self = Self(0); +} + +#[pin_data(PinnedDrop)] +struct SampleDriver { + pdev: ARef<pci::Device>, + #[pin] + bar: Devres<Bar0>, + index: TestIndex, +} + +kernel::pci_device_table!( + PCI_TABLE, + MODULE_PCI_TABLE, + <SampleDriver as pci::Driver>::IdInfo, + [( + pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), + TestIndex::NO_EVENTFD + )] +); + +impl SampleDriver { + fn testdev(index: &TestIndex, bar: &Bar0) -> Result<u32> { + // Select the test. + bar.write8(index.0, Regs::TEST); + + let offset = u32::from_le(bar.read32(Regs::OFFSET)) as usize; + let data = bar.read8(Regs::DATA); + + // Write `data` to `offset` to increase `count` by one. + // + // Note that we need `try_write8`, since `offset` can't be checked at compile-time. + bar.try_write8(data, offset)?; + + Ok(bar.read32(Regs::COUNT)) + } +} + +impl pci::Driver for SampleDriver { + type IdInfo = TestIndex; + + const ID_TABLE: pci::IdTable<Self::IdInfo> = &PCI_TABLE; + + fn probe(pdev: &pci::Device<Core>, info: &Self::IdInfo) -> impl PinInit<Self, Error> { + pin_init::pin_init_scope(move || { + let vendor = pdev.vendor_id(); + dev_dbg!( + pdev.as_ref(), + "Probe Rust PCI driver sample (PCI ID: {}, 0x{:x}).\n", + vendor, + pdev.device_id() + ); + + pdev.enable_device_mem()?; + pdev.set_master(); + + Ok(try_pin_init!(Self { + bar <- pdev.iomap_region_sized::<{ Regs::END }>(0, c_str!("rust_driver_pci")), + index: *info, + _: { + let bar = bar.access(pdev.as_ref())?; + + dev_info!( + pdev.as_ref(), + "pci-testdev data-match count: {}\n", + Self::testdev(info, bar)? + ); + }, + pdev: pdev.into(), + })) + }) + } + + fn unbind(pdev: &pci::Device<Core>, this: Pin<&Self>) { + if let Ok(bar) = this.bar.access(pdev.as_ref()) { + // Reset pci-testdev by writing a new test index. + bar.write8(this.index.0, Regs::TEST); + } + } +} + +#[pinned_drop] +impl PinnedDrop for SampleDriver { + fn drop(self: Pin<&mut Self>) { + dev_dbg!(self.pdev.as_ref(), "Remove Rust PCI driver sample.\n"); + } +} + +kernel::module_pci_driver! { + type: SampleDriver, + name: "rust_driver_pci", + authors: ["Danilo Krummrich"], + description: "Rust PCI driver", + license: "GPL v2", +} diff --git a/samples/rust/rust_driver_platform.rs b/samples/rust/rust_driver_platform.rs new file mode 100644 index 000000000000..6bf4f0c9633d --- /dev/null +++ b/samples/rust/rust_driver_platform.rs @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust Platform driver sample. + +//! ACPI match table test +//! +//! This demonstrates how to test an ACPI-based Rust platform driver using QEMU +//! with a custom SSDT. +//! +//! Steps: +//! +//! 1. **Create an SSDT source file** (`ssdt.dsl`) with the following content: +//! +//! ```asl +//! DefinitionBlock ("", "SSDT", 2, "TEST", "VIRTACPI", 0x00000001) +//! { +//! Scope (\_SB) +//! { +//! Device (T432) +//! { +//! Name (_HID, "LNUXBEEF") // ACPI hardware ID to match +//! Name (_UID, 1) +//! Name (_STA, 0x0F) // Device present, enabled +//! Name (_CRS, ResourceTemplate () +//! { +//! Memory32Fixed (ReadWrite, 0xFED00000, 0x1000) +//! }) +//! } +//! } +//! } +//! ``` +//! +//! 2. **Compile the table**: +//! +//! ```sh +//! iasl -tc ssdt.dsl +//! ``` +//! +//! This generates `ssdt.aml` +//! +//! 3. **Run QEMU** with the compiled AML file: +//! +//! ```sh +//! qemu-system-x86_64 -m 512M \ +//! -enable-kvm \ +//! -kernel path/to/bzImage \ +//! -append "root=/dev/sda console=ttyS0" \ +//! -hda rootfs.img \ +//! -serial stdio \ +//! -acpitable file=ssdt.aml +//! ``` +//! +//! Requirements: +//! - The `rust_driver_platform` must be present either: +//! - built directly into the kernel (`bzImage`), or +//! - available as a `.ko` file and loadable from `rootfs.img` +//! +//! 4. **Verify it worked** by checking `dmesg`: +//! +//! ``` +//! rust_driver_platform LNUXBEEF:00: Probed with info: '0'. +//! ``` +//! + +use kernel::{ + acpi, c_str, + device::{ + self, + property::{FwNodeReferenceArgs, NArgs}, + Core, + }, + of, platform, + prelude::*, + str::CString, + sync::aref::ARef, +}; + +struct SampleDriver { + pdev: ARef<platform::Device>, +} + +struct Info(u32); + +kernel::of_device_table!( + OF_TABLE, + MODULE_OF_TABLE, + <SampleDriver as platform::Driver>::IdInfo, + [(of::DeviceId::new(c_str!("test,rust-device")), Info(42))] +); + +kernel::acpi_device_table!( + ACPI_TABLE, + MODULE_ACPI_TABLE, + <SampleDriver as platform::Driver>::IdInfo, + [(acpi::DeviceId::new(c_str!("LNUXBEEF")), Info(0))] +); + +impl platform::Driver for SampleDriver { + type IdInfo = Info; + const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = Some(&OF_TABLE); + const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE); + + fn probe( + pdev: &platform::Device<Core>, + info: Option<&Self::IdInfo>, + ) -> impl PinInit<Self, Error> { + let dev = pdev.as_ref(); + + dev_dbg!(dev, "Probe Rust Platform driver sample.\n"); + + if let Some(info) = info { + dev_info!(dev, "Probed with info: '{}'.\n", info.0); + } + + if dev.fwnode().is_some_and(|node| node.is_of_node()) { + Self::properties_parse(dev)?; + } + + Ok(Self { pdev: pdev.into() }) + } +} + +impl SampleDriver { + fn properties_parse(dev: &device::Device) -> Result { + let fwnode = dev.fwnode().ok_or(ENOENT)?; + + if let Ok(idx) = + fwnode.property_match_string(c_str!("compatible"), c_str!("test,rust-device")) + { + dev_info!(dev, "matched compatible string idx = {}\n", idx); + } + + let name = c_str!("compatible"); + let prop = fwnode.property_read::<CString>(name).required_by(dev)?; + dev_info!(dev, "'{name}'='{prop:?}'\n"); + + let name = c_str!("test,bool-prop"); + let prop = fwnode.property_read_bool(c_str!("test,bool-prop")); + dev_info!(dev, "'{name}'='{prop}'\n"); + + if fwnode.property_present(c_str!("test,u32-prop")) { + dev_info!(dev, "'test,u32-prop' is present\n"); + } + + let name = c_str!("test,u32-optional-prop"); + let prop = fwnode.property_read::<u32>(name).or(0x12); + dev_info!(dev, "'{name}'='{prop:#x}' (default = 0x12)\n"); + + // A missing required property will print an error. Discard the error to + // prevent properties_parse from failing in that case. + let name = c_str!("test,u32-required-prop"); + let _ = fwnode.property_read::<u32>(name).required_by(dev); + + let name = c_str!("test,u32-prop"); + let prop: u32 = fwnode.property_read(name).required_by(dev)?; + dev_info!(dev, "'{name}'='{prop:#x}'\n"); + + let name = c_str!("test,i16-array"); + let prop: [i16; 4] = fwnode.property_read(name).required_by(dev)?; + dev_info!(dev, "'{name}'='{prop:?}'\n"); + let len = fwnode.property_count_elem::<u16>(name)?; + dev_info!(dev, "'{name}' length is {len}\n"); + + let name = c_str!("test,i16-array"); + let prop: KVec<i16> = fwnode.property_read_array_vec(name, 4)?.required_by(dev)?; + dev_info!(dev, "'{name}'='{prop:?}' (KVec)\n"); + + for child in fwnode.children() { + let name = c_str!("test,ref-arg"); + let nargs = NArgs::N(2); + let prop: FwNodeReferenceArgs = child.property_get_reference_args(name, nargs, 0)?; + dev_info!(dev, "'{name}'='{prop:?}'\n"); + } + + Ok(()) + } +} + +impl Drop for SampleDriver { + fn drop(&mut self) { + dev_dbg!(self.pdev.as_ref(), "Remove Rust Platform driver sample.\n"); + } +} + +kernel::module_platform_driver! { + type: SampleDriver, + name: "rust_driver_platform", + authors: ["Danilo Krummrich"], + description: "Rust Platform driver", + license: "GPL v2", +} diff --git a/samples/rust/rust_driver_usb.rs b/samples/rust/rust_driver_usb.rs new file mode 100644 index 000000000000..4eaad14867b2 --- /dev/null +++ b/samples/rust/rust_driver_usb.rs @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +// SPDX-FileCopyrightText: Copyright (C) 2025 Collabora Ltd. + +//! Rust USB driver sample. + +use kernel::{device, device::Core, prelude::*, sync::aref::ARef, usb}; + +struct SampleDriver { + _intf: ARef<usb::Interface>, +} + +kernel::usb_device_table!( + USB_TABLE, + MODULE_USB_TABLE, + <SampleDriver as usb::Driver>::IdInfo, + [(usb::DeviceId::from_id(0x1234, 0x5678), ()),] +); + +impl usb::Driver for SampleDriver { + type IdInfo = (); + const ID_TABLE: usb::IdTable<Self::IdInfo> = &USB_TABLE; + + fn probe( + intf: &usb::Interface<Core>, + _id: &usb::DeviceId, + _info: &Self::IdInfo, + ) -> impl PinInit<Self, Error> { + let dev: &device::Device<Core> = intf.as_ref(); + dev_info!(dev, "Rust USB driver sample probed\n"); + + Ok(Self { _intf: intf.into() }) + } + + fn disconnect(intf: &usb::Interface<Core>, _data: Pin<&Self>) { + let dev: &device::Device<Core> = intf.as_ref(); + dev_info!(dev, "Rust USB driver sample disconnected\n"); + } +} + +kernel::module_usb_driver! { + type: SampleDriver, + name: "rust_driver_usb", + authors: ["Daniel Almeida"], + description: "Rust USB driver sample", + license: "GPL v2", +} diff --git a/samples/rust/rust_i2c_client.rs b/samples/rust/rust_i2c_client.rs new file mode 100644 index 000000000000..f67938396dce --- /dev/null +++ b/samples/rust/rust_i2c_client.rs @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust I2C client registration sample. +//! +//! An I2C client in Rust cannot exist on its own. To register a new I2C client, +//! it must be bound to a parent device. In this sample driver, a platform device +//! is used as the parent. +//! + +//! ACPI match table test +//! +//! This demonstrates how to test an ACPI-based Rust I2C client registration driver +//! using QEMU with a custom SSDT. +//! +//! Steps: +//! +//! 1. **Create an SSDT source file** (`ssdt.dsl`) with the following content: +//! +//! ```asl +//! DefinitionBlock ("", "SSDT", 2, "TEST", "VIRTACPI", 0x00000001) +//! { +//! Scope (\_SB) +//! { +//! Device (T432) +//! { +//! Name (_HID, "LNUXBEEF") // ACPI hardware ID to match +//! Name (_UID, 1) +//! Name (_STA, 0x0F) // Device present, enabled +//! Name (_CRS, ResourceTemplate () +//! { +//! Memory32Fixed (ReadWrite, 0xFED00000, 0x1000) +//! }) +//! } +//! } +//! } +//! ``` +//! +//! 2. **Compile the table**: +//! +//! ```sh +//! iasl -tc ssdt.dsl +//! ``` +//! +//! This generates `ssdt.aml` +//! +//! 3. **Run QEMU** with the compiled AML file: +//! +//! ```sh +//! qemu-system-x86_64 -m 512M \ +//! -enable-kvm \ +//! -kernel path/to/bzImage \ +//! -append "root=/dev/sda console=ttyS0" \ +//! -hda rootfs.img \ +//! -serial stdio \ +//! -acpitable file=ssdt.aml +//! ``` +//! +//! Requirements: +//! - The `rust_driver_platform` must be present either: +//! - built directly into the kernel (`bzImage`), or +//! - available as a `.ko` file and loadable from `rootfs.img` +//! +//! 4. **Verify it worked** by checking `dmesg`: +//! +//! ``` +//! rust_driver_platform LNUXBEEF:00: Probed with info: '0'. +//! ``` +//! + +use kernel::{ + acpi, + c_str, + device, + devres::Devres, + i2c, + of, + platform, + prelude::*, + sync::aref::ARef, // +}; + +#[pin_data] +struct SampleDriver { + parent_dev: ARef<platform::Device>, + #[pin] + _reg: Devres<i2c::Registration>, +} + +kernel::of_device_table!( + OF_TABLE, + MODULE_OF_TABLE, + <SampleDriver as platform::Driver>::IdInfo, + [(of::DeviceId::new(c_str!("test,rust-device")), ())] +); + +kernel::acpi_device_table!( + ACPI_TABLE, + MODULE_ACPI_TABLE, + <SampleDriver as platform::Driver>::IdInfo, + [(acpi::DeviceId::new(c_str!("LNUXBEEF")), ())] +); + +const SAMPLE_I2C_CLIENT_ADDR: u16 = 0x30; +const SAMPLE_I2C_ADAPTER_INDEX: i32 = 0; +const BOARD_INFO: i2c::I2cBoardInfo = + i2c::I2cBoardInfo::new(c_str!("rust_driver_i2c"), SAMPLE_I2C_CLIENT_ADDR); + +impl platform::Driver for SampleDriver { + type IdInfo = (); + const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = Some(&OF_TABLE); + const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE); + + fn probe( + pdev: &platform::Device<device::Core>, + _info: Option<&Self::IdInfo>, + ) -> impl PinInit<Self, Error> { + dev_info!( + pdev.as_ref(), + "Probe Rust I2C Client registration sample.\n" + ); + + kernel::try_pin_init!( Self { + parent_dev: pdev.into(), + + _reg <- { + let adapter = i2c::I2cAdapter::get(SAMPLE_I2C_ADAPTER_INDEX)?; + + i2c::Registration::new(&adapter, &BOARD_INFO, pdev.as_ref()) + } + }) + } + + fn unbind(pdev: &platform::Device<device::Core>, _this: Pin<&Self>) { + dev_info!( + pdev.as_ref(), + "Unbind Rust I2C Client registration sample.\n" + ); + } +} + +kernel::module_platform_driver! { + type: SampleDriver, + name: "rust_device_i2c", + authors: ["Danilo Krummrich", "Igor Korotin"], + description: "Rust I2C client registration", + license: "GPL v2", +} diff --git a/samples/rust/rust_minimal.rs b/samples/rust/rust_minimal.rs new file mode 100644 index 000000000000..8eb9583571d7 --- /dev/null +++ b/samples/rust/rust_minimal.rs @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust minimal sample. + +use kernel::prelude::*; + +module! { + type: RustMinimal, + name: "rust_minimal", + authors: ["Rust for Linux Contributors"], + description: "Rust minimal sample", + license: "GPL", + params: { + test_parameter: i64 { + default: 1, + description: "This parameter has a default of 1", + }, + }, +} + +struct RustMinimal { + numbers: KVec<i32>, +} + +impl kernel::Module for RustMinimal { + fn init(_module: &'static ThisModule) -> Result<Self> { + pr_info!("Rust minimal sample (init)\n"); + pr_info!("Am I built-in? {}\n", !cfg!(MODULE)); + pr_info!( + "test_parameter: {}\n", + *module_parameters::test_parameter.value() + ); + + let mut numbers = KVec::new(); + numbers.push(72, GFP_KERNEL)?; + numbers.push(108, GFP_KERNEL)?; + numbers.push(200, GFP_KERNEL)?; + + Ok(RustMinimal { numbers }) + } +} + +impl Drop for RustMinimal { + fn drop(&mut self) { + pr_info!("My numbers are {:?}\n", self.numbers); + pr_info!("Rust minimal sample (exit)\n"); + } +} diff --git a/samples/rust/rust_misc_device.rs b/samples/rust/rust_misc_device.rs new file mode 100644 index 000000000000..d69bc33dbd99 --- /dev/null +++ b/samples/rust/rust_misc_device.rs @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 Google LLC. + +//! Rust misc device sample. +//! +//! Below is an example userspace C program that exercises this sample's functionality. +//! +//! ```c +//! #include <stdio.h> +//! #include <stdlib.h> +//! #include <errno.h> +//! #include <fcntl.h> +//! #include <unistd.h> +//! #include <sys/ioctl.h> +//! +//! #define RUST_MISC_DEV_FAIL _IO('|', 0) +//! #define RUST_MISC_DEV_HELLO _IO('|', 0x80) +//! #define RUST_MISC_DEV_GET_VALUE _IOR('|', 0x81, int) +//! #define RUST_MISC_DEV_SET_VALUE _IOW('|', 0x82, int) +//! +//! int main() { +//! int value, new_value; +//! int fd, ret; +//! +//! // Open the device file +//! printf("Opening /dev/rust-misc-device for reading and writing\n"); +//! fd = open("/dev/rust-misc-device", O_RDWR); +//! if (fd < 0) { +//! perror("open"); +//! return errno; +//! } +//! +//! // Make call into driver to say "hello" +//! printf("Calling Hello\n"); +//! ret = ioctl(fd, RUST_MISC_DEV_HELLO, NULL); +//! if (ret < 0) { +//! perror("ioctl: Failed to call into Hello"); +//! close(fd); +//! return errno; +//! } +//! +//! // Get initial value +//! printf("Fetching initial value\n"); +//! ret = ioctl(fd, RUST_MISC_DEV_GET_VALUE, &value); +//! if (ret < 0) { +//! perror("ioctl: Failed to fetch the initial value"); +//! close(fd); +//! return errno; +//! } +//! +//! value++; +//! +//! // Set value to something different +//! printf("Submitting new value (%d)\n", value); +//! ret = ioctl(fd, RUST_MISC_DEV_SET_VALUE, &value); +//! if (ret < 0) { +//! perror("ioctl: Failed to submit new value"); +//! close(fd); +//! return errno; +//! } +//! +//! // Ensure new value was applied +//! printf("Fetching new value\n"); +//! ret = ioctl(fd, RUST_MISC_DEV_GET_VALUE, &new_value); +//! if (ret < 0) { +//! perror("ioctl: Failed to fetch the new value"); +//! close(fd); +//! return errno; +//! } +//! +//! if (value != new_value) { +//! printf("Failed: Committed and retrieved values are different (%d - %d)\n", value, new_value); +//! close(fd); +//! return -1; +//! } +//! +//! // Call the unsuccessful ioctl +//! printf("Attempting to call in to an non-existent IOCTL\n"); +//! ret = ioctl(fd, RUST_MISC_DEV_FAIL, NULL); +//! if (ret < 0) { +//! perror("ioctl: Succeeded to fail - this was expected"); +//! } else { +//! printf("ioctl: Failed to fail\n"); +//! close(fd); +//! return -1; +//! } +//! +//! // Close the device file +//! printf("Closing /dev/rust-misc-device\n"); +//! close(fd); +//! +//! printf("Success\n"); +//! return 0; +//! } +//! ``` + +use core::pin::Pin; + +use kernel::{ + c_str, + device::Device, + fs::{File, Kiocb}, + ioctl::{_IO, _IOC_SIZE, _IOR, _IOW}, + iov::{IovIterDest, IovIterSource}, + miscdevice::{MiscDevice, MiscDeviceOptions, MiscDeviceRegistration}, + new_mutex, + prelude::*, + sync::{aref::ARef, Mutex}, + uaccess::{UserSlice, UserSliceReader, UserSliceWriter}, +}; + +const RUST_MISC_DEV_HELLO: u32 = _IO('|' as u32, 0x80); +const RUST_MISC_DEV_GET_VALUE: u32 = _IOR::<i32>('|' as u32, 0x81); +const RUST_MISC_DEV_SET_VALUE: u32 = _IOW::<i32>('|' as u32, 0x82); + +module! { + type: RustMiscDeviceModule, + name: "rust_misc_device", + authors: ["Lee Jones"], + description: "Rust misc device sample", + license: "GPL", +} + +#[pin_data] +struct RustMiscDeviceModule { + #[pin] + _miscdev: MiscDeviceRegistration<RustMiscDevice>, +} + +impl kernel::InPlaceModule for RustMiscDeviceModule { + fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> { + pr_info!("Initialising Rust Misc Device Sample\n"); + + let options = MiscDeviceOptions { + name: c_str!("rust-misc-device"), + }; + + try_pin_init!(Self { + _miscdev <- MiscDeviceRegistration::register(options), + }) + } +} + +struct Inner { + value: i32, + buffer: KVVec<u8>, +} + +#[pin_data(PinnedDrop)] +struct RustMiscDevice { + #[pin] + inner: Mutex<Inner>, + dev: ARef<Device>, +} + +#[vtable] +impl MiscDevice for RustMiscDevice { + type Ptr = Pin<KBox<Self>>; + + fn open(_file: &File, misc: &MiscDeviceRegistration<Self>) -> Result<Pin<KBox<Self>>> { + let dev = ARef::from(misc.device()); + + dev_info!(dev, "Opening Rust Misc Device Sample\n"); + + KBox::try_pin_init( + try_pin_init! { + RustMiscDevice { + inner <- new_mutex!(Inner { + value: 0_i32, + buffer: KVVec::new(), + }), + dev: dev, + } + }, + GFP_KERNEL, + ) + } + + fn read_iter(mut kiocb: Kiocb<'_, Self::Ptr>, iov: &mut IovIterDest<'_>) -> Result<usize> { + let me = kiocb.file(); + dev_info!(me.dev, "Reading from Rust Misc Device Sample\n"); + + let inner = me.inner.lock(); + // Read the buffer contents, taking the file position into account. + let read = iov.simple_read_from_buffer(kiocb.ki_pos_mut(), &inner.buffer)?; + + Ok(read) + } + + fn write_iter(mut kiocb: Kiocb<'_, Self::Ptr>, iov: &mut IovIterSource<'_>) -> Result<usize> { + let me = kiocb.file(); + dev_info!(me.dev, "Writing to Rust Misc Device Sample\n"); + + let mut inner = me.inner.lock(); + + // Replace buffer contents. + inner.buffer.clear(); + let len = iov.copy_from_iter_vec(&mut inner.buffer, GFP_KERNEL)?; + + // Set position to zero so that future `read` calls will see the new contents. + *kiocb.ki_pos_mut() = 0; + + Ok(len) + } + + fn ioctl(me: Pin<&RustMiscDevice>, _file: &File, cmd: u32, arg: usize) -> Result<isize> { + dev_info!(me.dev, "IOCTLing Rust Misc Device Sample\n"); + + // Treat the ioctl argument as a user pointer. + let arg = UserPtr::from_addr(arg); + let size = _IOC_SIZE(cmd); + + match cmd { + RUST_MISC_DEV_GET_VALUE => me.get_value(UserSlice::new(arg, size).writer())?, + RUST_MISC_DEV_SET_VALUE => me.set_value(UserSlice::new(arg, size).reader())?, + RUST_MISC_DEV_HELLO => me.hello()?, + _ => { + dev_err!(me.dev, "-> IOCTL not recognised: {}\n", cmd); + return Err(ENOTTY); + } + }; + + Ok(0) + } +} + +#[pinned_drop] +impl PinnedDrop for RustMiscDevice { + fn drop(self: Pin<&mut Self>) { + dev_info!(self.dev, "Exiting the Rust Misc Device Sample\n"); + } +} + +impl RustMiscDevice { + fn set_value(&self, mut reader: UserSliceReader) -> Result<isize> { + let new_value = reader.read::<i32>()?; + let mut guard = self.inner.lock(); + + dev_info!( + self.dev, + "-> Copying data from userspace (value: {})\n", + new_value + ); + + guard.value = new_value; + Ok(0) + } + + fn get_value(&self, mut writer: UserSliceWriter) -> Result<isize> { + let guard = self.inner.lock(); + let value = guard.value; + + // Free-up the lock and use our locally cached instance from here + drop(guard); + + dev_info!( + self.dev, + "-> Copying data to userspace (value: {})\n", + &value + ); + + writer.write::<i32>(&value)?; + Ok(0) + } + + fn hello(&self) -> Result<isize> { + dev_info!(self.dev, "-> Hello from the Rust Misc Device\n"); + + Ok(0) + } +} diff --git a/samples/rust/rust_print_events.c b/samples/rust/rust_print_events.c new file mode 100644 index 000000000000..a9169ff0edf1 --- /dev/null +++ b/samples/rust/rust_print_events.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Google LLC + */ + +#define CREATE_TRACE_POINTS +#define CREATE_RUST_TRACE_POINTS +#include <trace/events/rust_sample.h> diff --git a/samples/rust/rust_print_main.rs b/samples/rust/rust_print_main.rs new file mode 100644 index 000000000000..4095c72afeab --- /dev/null +++ b/samples/rust/rust_print_main.rs @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust printing macros sample. + +use kernel::pr_cont; +use kernel::prelude::*; + +module! { + type: RustPrint, + name: "rust_print", + authors: ["Rust for Linux Contributors"], + description: "Rust printing macros sample", + license: "GPL", +} + +struct RustPrint; + +#[expect(clippy::disallowed_macros)] +fn arc_print() -> Result { + use kernel::sync::*; + + let a = Arc::new(1, GFP_KERNEL)?; + let b = UniqueArc::new("hello, world", GFP_KERNEL)?; + + // Prints the value of data in `a`. + pr_info!("{}", a); + + // Uses ":?" to print debug fmt of `b`. + pr_info!("{:?}", b); + + let a: Arc<&str> = b.into(); + let c = a.clone(); + + // Uses `dbg` to print, will move `c` (for temporary debugging purposes). + dbg!(c); + + { + // `Arc` can be used to delegate dynamic dispatch and the following is an example. + // Both `i32` and `&str` implement `Display`. This enables us to express a unified + // behaviour, contract or protocol on both `i32` and `&str` into a single `Arc` of + // type `Arc<dyn Display>`. + + use kernel::fmt::Display; + fn arc_dyn_print(arc: &Arc<dyn Display>) { + pr_info!("Arc<dyn Display> says {arc}"); + } + + let a_i32_display: Arc<dyn Display> = Arc::new(42i32, GFP_KERNEL)?; + let a_str_display: Arc<dyn Display> = a.clone(); + + arc_dyn_print(&a_i32_display); + arc_dyn_print(&a_str_display); + } + + // Pretty-prints the debug formatting with lower-case hexadecimal integers. + pr_info!("{:#x?}", a); + + Ok(()) +} + +impl kernel::Module for RustPrint { + fn init(_module: &'static ThisModule) -> Result<Self> { + pr_info!("Rust printing macros sample (init)\n"); + + pr_emerg!("Emergency message (level 0) without args\n"); + pr_alert!("Alert message (level 1) without args\n"); + pr_crit!("Critical message (level 2) without args\n"); + pr_err!("Error message (level 3) without args\n"); + pr_warn!("Warning message (level 4) without args\n"); + pr_notice!("Notice message (level 5) without args\n"); + pr_info!("Info message (level 6) without args\n"); + + pr_info!("A line that"); + pr_cont!(" is continued"); + pr_cont!(" without args\n"); + + pr_emerg!("{} message (level {}) with args\n", "Emergency", 0); + pr_alert!("{} message (level {}) with args\n", "Alert", 1); + pr_crit!("{} message (level {}) with args\n", "Critical", 2); + pr_err!("{} message (level {}) with args\n", "Error", 3); + pr_warn!("{} message (level {}) with args\n", "Warning", 4); + pr_notice!("{} message (level {}) with args\n", "Notice", 5); + pr_info!("{} message (level {}) with args\n", "Info", 6); + + pr_info!("A {} that", "line"); + pr_cont!(" is {}", "continued"); + pr_cont!(" with {}\n", "args"); + + arc_print()?; + + trace::trace_rust_sample_loaded(42); + + Ok(RustPrint) + } +} + +impl Drop for RustPrint { + fn drop(&mut self) { + pr_info!("Rust printing macros sample (exit)\n"); + } +} + +mod trace { + use kernel::ffi::c_int; + + kernel::declare_trace! { + /// # Safety + /// + /// Always safe to call. + unsafe fn rust_sample_loaded(magic: c_int); + } + + pub(crate) fn trace_rust_sample_loaded(magic: i32) { + // SAFETY: Always safe to call. + unsafe { rust_sample_loaded(magic as c_int) } + } +} diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore index 78fb78184291..a6df0da77c5d 100644 --- a/samples/seccomp/.gitignore +++ b/samples/seccomp/.gitignore @@ -1,3 +1,5 @@ -bpf-direct -bpf-fancy -dropper +# SPDX-License-Identifier: GPL-2.0-only +/bpf-direct +/bpf-fancy +/dropper +/user-trap diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index bf7cc6b0dc19..c85ae0ed8342 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,48 +1,6 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += bpf-fancy dropper bpf-direct user-trap -hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct - -HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include -HOSTCFLAGS_bpf-helper.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-helper.o += -idirafter $(objtree)/include bpf-fancy-objs := bpf-fancy.o bpf-helper.o -HOSTCFLAGS_dropper.o += -I$(objtree)/usr/include -HOSTCFLAGS_dropper.o += -idirafter $(objtree)/include -dropper-objs := dropper.o - -HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include -HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include -bpf-direct-objs := bpf-direct.o - -# Try to match the kernel target. -ifndef CROSS_COMPILE -ifndef CONFIG_64BIT - -# s390 has -m31 flag to build 31 bit binaries -ifndef CONFIG_S390 -MFLAG = -m32 -else -MFLAG = -m31 -endif - -HOSTCFLAGS_bpf-direct.o += $(MFLAG) -HOSTCFLAGS_dropper.o += $(MFLAG) -HOSTCFLAGS_bpf-helper.o += $(MFLAG) -HOSTCFLAGS_bpf-fancy.o += $(MFLAG) -HOSTLOADLIBES_bpf-direct += $(MFLAG) -HOSTLOADLIBES_bpf-fancy += $(MFLAG) -HOSTLOADLIBES_dropper += $(MFLAG) -endif -always := $(hostprogs-m) -else -# MIPS system calls are defined based on the -mabi that is passed -# to the toolchain which may or may not be a valid option -# for the host toolchain. So disable tests if target architecture -# is MIPS but the host isn't. -ifndef CONFIG_MIPS -always := $(hostprogs-m) -endif -endif +userccflags += -I usr/include diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c index 151ec3f52189..c09e4a17ac1a 100644 --- a/samples/seccomp/bpf-direct.c +++ b/samples/seccomp/bpf-direct.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros * diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c index e8b24f443709..1ccb435025b6 100644 --- a/samples/seccomp/bpf-fancy.c +++ b/samples/seccomp/bpf-fancy.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Seccomp BPF example using a macro-based generator. * diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c index 1ef0f4d72898..ae260d77a868 100644 --- a/samples/seccomp/bpf-helper.c +++ b/samples/seccomp/bpf-helper.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Seccomp BPF helper functions * diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h index 1d8de9edd858..417e48a4c4df 100644 --- a/samples/seccomp/bpf-helper.h +++ b/samples/seccomp/bpf-helper.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Example wrapper around BPF macros. * @@ -61,9 +62,9 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #define EXPAND(...) __VA_ARGS__ /* Ensure that we load the logically correct offset. */ -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) -#elif __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32) #else #error "Unknown endianness" @@ -84,10 +85,10 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #elif __BITS_PER_LONG == 64 /* Ensure that we load the logically correct offset. */ -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define ENDIAN(_lo, _hi) _lo, _hi #define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32) -#elif __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define ENDIAN(_lo, _hi) _hi, _lo #define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) #endif diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c index 68325ca5e71c..4bca4b70f665 100644 --- a/samples/seccomp/dropper.c +++ b/samples/seccomp/dropper.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Naive system call dropper built on seccomp_filter. * @@ -24,7 +25,7 @@ #include <sys/prctl.h> #include <unistd.h> -static int install_filter(int nr, int arch, int error) +static int install_filter(int arch, int nr, int error) { struct sock_filter filter[] = { BPF_STMT(BPF_LD+BPF_W+BPF_ABS, @@ -41,6 +42,10 @@ static int install_filter(int nr, int arch, int error) .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), .filter = filter, }; + if (error == -1) { + struct sock_filter kill = BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL); + filter[4] = kill; + } if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { perror("prctl(NO_NEW_PRIVS)"); return 1; @@ -56,9 +61,10 @@ int main(int argc, char **argv) { if (argc < 5) { fprintf(stderr, "Usage:\n" - "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n" + "dropper <arch> <syscall_nr> <errno> <prog> [<args>]\n" "Hint: AUDIT_ARCH_I386: 0x%X\n" " AUDIT_ARCH_X86_64: 0x%X\n" + " errno == -1 means SECCOMP_RET_KILL\n" "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); return 1; } diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c new file mode 100644 index 000000000000..a23fec357b5d --- /dev/null +++ b/samples/seccomp/user-trap.c @@ -0,0 +1,379 @@ +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#include <stddef.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/user.h> +#include <sys/ioctl.h> +#include <sys/ptrace.h> +#include <sys/mount.h> +#include <linux/limits.h> +#include <linux/filter.h> +#include <linux/seccomp.h> + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +static int seccomp(unsigned int op, unsigned int flags, void *args) +{ + errno = 0; + return syscall(__NR_seccomp, op, flags, args); +} + +static int send_fd(int sock, int fd) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + int *fd_ptr; + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c'; + struct iovec io = { + .iov_base = &c, + .iov_len = 1, + }; + + msg.msg_iov = &io; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + fd_ptr = (int *)CMSG_DATA(cmsg); + *fd_ptr = fd; + msg.msg_controllen = cmsg->cmsg_len; + + if (sendmsg(sock, &msg, 0) < 0) { + perror("sendmsg"); + return -1; + } + + return 0; +} + +static int recv_fd(int sock) +{ + struct msghdr msg = {}; + struct cmsghdr *cmsg; + int *fd_ptr; + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c'; + struct iovec io = { + .iov_base = &c, + .iov_len = 1, + }; + + msg.msg_iov = &io; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + if (recvmsg(sock, &msg, 0) < 0) { + perror("recvmsg"); + return -1; + } + + cmsg = CMSG_FIRSTHDR(&msg); + fd_ptr = (int *)CMSG_DATA(cmsg); + + return *fd_ptr; +} + +static int user_trap_syscall(int nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); +} + +static int handle_req(struct seccomp_notif *req, + struct seccomp_notif_resp *resp, int listener) +{ + char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX]; + int ret = -1, mem; + + resp->id = req->id; + resp->error = -EPERM; + resp->val = 0; + + if (req->data.nr != __NR_mount) { + fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr); + return -1; + } + + /* Only allow bind mounts. */ + if (!(req->data.args[3] & MS_BIND)) + return 0; + + /* + * Ok, let's read the task's memory to see where they wanted their + * mount to go. + */ + snprintf(path, sizeof(path), "/proc/%d/mem", req->pid); + mem = open(path, O_RDONLY); + if (mem < 0) { + perror("open mem"); + return -1; + } + + /* + * Now we avoid a TOCTOU: we referred to a pid by its pid, but since + * the pid that made the syscall may have died, we need to confirm that + * the pid is still valid after we open its /proc/pid/mem file. We can + * ask the listener fd this as follows. + * + * Note that this check should occur *after* any task-specific + * resources are opened, to make sure that the task has not died and + * we're not wrongly reading someone else's state in order to make + * decisions. + */ + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) { + fprintf(stderr, "task died before we could map its memory\n"); + goto out; + } + + /* + * Phew, we've got the right /proc/pid/mem. Now we can read it. Note + * that to avoid another TOCTOU, we should read all of the pointer args + * before we decide to allow the syscall. + */ + if (lseek(mem, req->data.args[0], SEEK_SET) < 0) { + perror("seek"); + goto out; + } + + ret = read(mem, source, sizeof(source)); + if (ret < 0) { + perror("read"); + goto out; + } + + if (lseek(mem, req->data.args[1], SEEK_SET) < 0) { + perror("seek"); + goto out; + } + + ret = read(mem, target, sizeof(target)); + if (ret < 0) { + perror("read"); + goto out; + } + + /* + * Our policy is to only allow bind mounts inside /tmp. This isn't very + * interesting, because we could do unprivlieged bind mounts with user + * namespaces already, but you get the idea. + */ + if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) { + if (mount(source, target, NULL, req->data.args[3], NULL) < 0) { + ret = -1; + perror("actual mount"); + goto out; + } + resp->error = 0; + } + + /* Even if we didn't allow it because of policy, generating the + * response was be a success, because we want to tell the worker EPERM. + */ + ret = 0; + +out: + close(mem); + return ret; +} + +int main(void) +{ + int sk_pair[2], ret = 1, status, listener; + pid_t worker = 0 , tracer = 0; + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) { + perror("socketpair"); + return 1; + } + + worker = fork(); + if (worker < 0) { + perror("fork"); + goto close_pair; + } + + if (worker == 0) { + listener = user_trap_syscall(__NR_mount, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) { + perror("seccomp"); + exit(1); + } + + /* + * Drop privileges. We definitely can't mount as uid 1000. + */ + if (setuid(1000) < 0) { + perror("setuid"); + exit(1); + } + + /* + * Send the listener to the parent; also serves as + * synchronization. + */ + if (send_fd(sk_pair[1], listener) < 0) + exit(1); + close(listener); + + if (mkdir("/tmp/foo", 0755) < 0) { + perror("mkdir"); + exit(1); + } + + /* + * Try a bad mount just for grins. + */ + if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) { + fprintf(stderr, "huh? mounted /dev/sda?\n"); + exit(1); + } + + if (errno != EPERM) { + perror("bad error from mount"); + exit(1); + } + + /* + * Ok, we expect this one to succeed. + */ + if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) { + perror("mount"); + exit(1); + } + + exit(0); + } + + /* + * Get the listener from the child. + */ + listener = recv_fd(sk_pair[0]); + if (listener < 0) + goto out_kill; + + /* + * Fork a task to handle the requests. This isn't strictly necessary, + * but it makes the particular writing of this sample easier, since we + * can just wait ofr the tracee to exit and kill the tracer. + */ + tracer = fork(); + if (tracer < 0) { + perror("fork"); + goto out_kill; + } + + if (tracer == 0) { + struct seccomp_notif *req; + struct seccomp_notif_resp *resp; + struct seccomp_notif_sizes sizes; + + if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) { + perror("seccomp(GET_NOTIF_SIZES)"); + goto out_close; + } + + req = malloc(sizes.seccomp_notif); + if (!req) + goto out_close; + + resp = malloc(sizes.seccomp_notif_resp); + if (!resp) + goto out_req; + memset(resp, 0, sizes.seccomp_notif_resp); + + while (1) { + memset(req, 0, sizes.seccomp_notif); + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) { + perror("ioctl recv"); + goto out_resp; + } + + if (handle_req(req, resp, listener) < 0) + goto out_resp; + + /* + * ENOENT here means that the task may have gotten a + * signal and restarted the syscall. It's up to the + * handler to decide what to do in this case, but for + * the sample code, we just ignore it. Probably + * something better should happen, like undoing the + * mount, or keeping track of the args to make sure we + * don't do it again. + */ + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 && + errno != ENOENT) { + perror("ioctl send"); + goto out_resp; + } + } +out_resp: + free(resp); +out_req: + free(req); +out_close: + close(listener); + exit(1); + } + + close(listener); + + if (waitpid(worker, &status, 0) != worker) { + perror("waitpid"); + goto out_kill; + } + + if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) { + perror("umount2"); + goto out_kill; + } + + if (remove("/tmp/foo") < 0 && errno != ENOENT) { + perror("remove"); + exit(1); + } + + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fprintf(stderr, "worker exited nonzero\n"); + goto out_kill; + } + + ret = 0; + +out_kill: + if (tracer > 0) + kill(tracer, SIGKILL); + if (worker > 0) + kill(worker, SIGKILL); + +close_pair: + close(sk_pair[0]); + close(sk_pair[1]); + return ret; +} diff --git a/samples/statx/Makefile b/samples/statx/Makefile deleted file mode 100644 index 1f80a3d8cf45..000000000000 --- a/samples/statx/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o - -# List of programs to build -hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx - -# Tell kbuild to always build the programs -always := $(hostprogs-y) - -HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include diff --git a/samples/timers/.gitignore b/samples/timers/.gitignore index c5c45d7ec0df..cd9ff7b95383 100644 --- a/samples/timers/.gitignore +++ b/samples/timers/.gitignore @@ -1 +1,2 @@ -hpet_example +# SPDX-License-Identifier: GPL-2.0-only +/hpet_example diff --git a/samples/timers/Makefile b/samples/timers/Makefile index a5c3c4a35ca1..e6836cdea4e2 100644 --- a/samples/timers/Makefile +++ b/samples/timers/Makefile @@ -1,15 +1,4 @@ -ifndef CROSS_COMPILE -uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += hpet_example -ifeq ($(ARCH),x86) -CC := $(CROSS_COMPILE)gcc -PROGS := hpet_example - -all: $(PROGS) - -clean: - rm -fr $(PROGS) - -endif -endif +userccflags += -I usr/include diff --git a/samples/timers/hpet_example.c b/samples/timers/hpet_example.c index 3ab4993d85e0..f1cb622f6ec0 100644 --- a/samples/timers/hpet_example.c +++ b/samples/timers/hpet_example.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <stdlib.h> #include <unistd.h> diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile index 0f8d92120c4e..b3808bb4cf8b 100644 --- a/samples/trace_events/Makefile +++ b/samples/trace_events/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # builds the trace events example kernel modules; # then to use one (as root): insmod <module_name.ko> @@ -10,5 +11,7 @@ # Here trace-events-sample.c does the CREATE_TRACE_POINTS. # CFLAGS_trace-events-sample.o := -I$(src) +CFLAGS_trace_custom_sched.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o +obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_custom_sched.o diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index bc7fcf010a5b..ecc7db237f2e 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/kthread.h> @@ -18,8 +19,10 @@ static const char *random_strings[] = { "One ring to rule them all" }; -static void simple_thread_func(int cnt) +static void do_simple_thread_func(int cnt, const char *fmt, ...) { + unsigned long bitmask[1] = {0xdeadbeefUL}; + va_list va; int array[6]; int len = cnt % 5; int i; @@ -31,9 +34,13 @@ static void simple_thread_func(int cnt) array[i] = i + 1; array[i] = 0; + va_start(va, fmt); + /* Silly tracepoints */ trace_foo_bar("hello", cnt, array, random_strings[len], - ¤t->cpus_allowed); + current->cpus_ptr, fmt, &va); + + va_end(va); trace_foo_with_template_simple("HELLO", cnt); @@ -42,6 +49,13 @@ static void simple_thread_func(int cnt) trace_foo_with_template_cond("prints other times", cnt); trace_foo_with_template_print("I have to be different", cnt); + + trace_foo_rel_loc("Hello __rel_loc", cnt, bitmask, current->cpus_ptr); +} + +static void simple_thread_func(int cnt) +{ + do_simple_thread_func(cnt, "iter=%d", cnt); } static int simple_thread(void *arg) @@ -78,29 +92,37 @@ static int simple_thread_fn(void *arg) } static DEFINE_MUTEX(thread_mutex); +static int simple_thread_cnt; int foo_bar_reg(void) { + mutex_lock(&thread_mutex); + if (simple_thread_cnt++) + goto out; + pr_info("Starting thread for foo_bar_fn\n"); /* * We shouldn't be able to start a trace when the module is * unloading (there's other locks to prevent that). But * for consistency sake, we still take the thread_mutex. */ - mutex_lock(&thread_mutex); simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn"); + out: mutex_unlock(&thread_mutex); return 0; } void foo_bar_unreg(void) { - pr_info("Killing thread for foo_bar_fn\n"); - /* protect against module unloading */ mutex_lock(&thread_mutex); + if (--simple_thread_cnt) + goto out; + + pr_info("Killing thread for foo_bar_fn\n"); if (simple_tsk_fn) kthread_stop(simple_tsk_fn); simple_tsk_fn = NULL; + out: mutex_unlock(&thread_mutex); } diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 76a75ab7a608..1a05fc153353 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * If TRACE_SYSTEM is defined, that will be the directory created * in the ftrace directory under /sys/kernel/tracing/events/<system> @@ -95,7 +96,7 @@ * __entry->bar.x = y; * __array: There are three fields (type, name, size). The type is the - * type of elements in teh array, the name is the name of the array. + * type of elements in the array, the name is the name of the array. * size is the number of items in the array (not the total size). * * __array( char, foo, 10) is the same as saying: char foo[10]; @@ -112,7 +113,7 @@ * type is the type of the element, name is the name of the array. * The size is different than __array. It is not a static number, * but the algorithm to figure out the length of the array for the - * specific instance of tracepoint. Again, size is the numebr of + * specific instance of tracepoint. Again, size is the number of * items in the array, not the total length in bytes. * * __dynamic_array( int, foo, bar) is similar to: int foo[bar]; @@ -125,9 +126,9 @@ * Notice, that "__entry" is not needed here. * * __string: This is a special kind of __dynamic_array. It expects to - * have a nul terminated character array passed to it (it allows + * have a null terminated character array passed to it (it allows * for NULL too, which would be converted into "(null)"). __string - * takes two paramenter (name, src), where name is the name of + * takes two parameter (name, src), where name is the name of * the string saved, and src is the string to copy into the * ring buffer. * @@ -135,10 +136,60 @@ * * To assign a string, use the helper macro __assign_str(). * - * __assign_str(foo, bar); + * __assign_str(foo); * - * In most cases, the __assign_str() macro will take the same - * parameters as the __string() macro had to declare the string. + * The __string() macro saves off the string that is passed into + * the second parameter, and the __assign_str() will store than + * saved string into the "foo" field. + * + * __vstring: This is similar to __string() but instead of taking a + * dynamic length, it takes a variable list va_list 'va' variable. + * Some event callers already have a message from parameters saved + * in a va_list. Passing in the format and the va_list variable + * will save just enough on the ring buffer for that string. + * Note, the va variable used is a pointer to a va_list, not + * to the va_list directly. + * + * (va_list *va) + * + * __vstring(foo, fmt, va) is similar to: vsnprintf(foo, fmt, va) + * + * To assign the string, use the helper macro __assign_vstr(). + * + * __assign_vstr(foo, fmt, va); + * + * In most cases, the __assign_vstr() macro will take the same + * parameters as the __vstring() macro had to declare the string. + * Use __get_str() to retrieve the __vstring() just like it would for + * __string(). + * + * __string_len: This is a helper to a __dynamic_array, but it understands + * that the array has characters in it, it will allocate 'len' + 1 bytes + * in the ring buffer and add a '\0' to the string. This is + * useful if the string being saved has no terminating '\0' byte. + * It requires that the length of the string is known as it acts + * like a memcpy(). + * + * Declared with: + * + * __string_len(foo, bar, len) + * + * To assign this string, use the helper macro __assign_str(). + * The length is saved via the __string_len() and is retrieved in + * __assign_str(). + * + * __assign_str(foo); + * + * Then len + 1 is allocated to the ring buffer, and a nul terminating + * byte is added. This is similar to: + * + * memcpy(__get_str(foo), bar, len); + * __get_str(foo)[len] = 0; + * + * The advantage of using this over __dynamic_array, is that it + * takes care of allocating the extra byte on the ring buffer + * for the '\0' terminating byte, and __get_str(foo) can be used + * in the TP_printk(). * * __bitmask: This is another kind of __dynamic_array, but it expects * an array of longs, and the number of bits to parse. It takes @@ -151,6 +202,16 @@ * * __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits); * + * __cpumask: This is pretty much the same as __bitmask but is specific for + * CPU masks. The type displayed to the user via the format files will + * be "cpumaks_t" such that user space may deal with them differently + * if they choose to do so, and the bits is always set to nr_cpumask_bits. + * + * __cpumask(target_cpu) + * + * To assign a cpumask, use the __assign_cpumask() helper macro. + * + * __assign_cpumask(target_cpus, cpumask_bits(bar)); * * fast_assign: This is a C like function that is used to store the items * into the ring buffer. A special variable called "__entry" will be the @@ -163,8 +224,8 @@ * This is also used to print out the data from the trace files. * Again, the __entry macro is used to access the data from the ring buffer. * - * Note, __dynamic_array, __string, and __bitmask require special helpers - * to access the data. + * Note, __dynamic_array, __string, __bitmask and __cpumask require special + * helpers to access the data. * * For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo) * Use __get_dynamic_array_len(foo) to get the length of the array @@ -177,6 +238,8 @@ * * For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus) * + * For __cpumask(target_cpus) use __get_cpumask(target_cpus) + * * * Note, that for both the assign and the printk, __entry is the handler * to the data structure in the ring buffer, and is defined by the @@ -228,9 +291,10 @@ TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO); TRACE_EVENT(foo_bar, TP_PROTO(const char *foo, int bar, const int *lst, - const char *string, const struct cpumask *mask), + const char *string, const struct cpumask *mask, + const char *fmt, va_list *va), - TP_ARGS(foo, bar, lst, string, mask), + TP_ARGS(foo, bar, lst, string, mask, fmt, va), TP_STRUCT__entry( __array( char, foo, 10 ) @@ -238,18 +302,25 @@ TRACE_EVENT(foo_bar, __dynamic_array(int, list, __length_of(lst)) __string( str, string ) __bitmask( cpus, num_possible_cpus() ) + __cpumask( cpum ) + __vstring( vstr, fmt, va ) + __string_len( lstr, foo, bar / 2 < strlen(foo) ? bar / 2 : strlen(foo) ) ), TP_fast_assign( - strlcpy(__entry->foo, foo, 10); + strscpy(__entry->foo, foo, 10); __entry->bar = bar; memcpy(__get_dynamic_array(list), lst, __length_of(lst) * sizeof(int)); - __assign_str(str, string); + __assign_str(str); + __assign_str(lstr); + __assign_vstr(vstr, fmt, va); __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); + __assign_cpumask(cpum, cpumask_bits(mask)); ), - TP_printk("foo %s %d %s %s %s %s (%s)", __entry->foo, __entry->bar, + TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s [%d] %*pbl", + __entry->foo, __entry->bar, /* * Notice here the use of some helper functions. This includes: @@ -293,7 +364,17 @@ TRACE_EVENT(foo_bar, __print_array(__get_dynamic_array(list), __get_dynamic_array_len(list) / sizeof(int), sizeof(int)), - __get_str(str), __get_bitmask(cpus)) + +/* A shortcut is to use __print_dynamic_array for dynamic arrays */ + + __print_dynamic_array(list, sizeof(int)), + + __get_str(str), __get_str(lstr), + __get_bitmask(cpus), __get_cpumask(cpum), + __get_str(vstr), + __get_dynamic_array_len(cpus), + __get_dynamic_array_len(cpus), + __get_dynamic_array(cpus)) ); /* @@ -347,7 +428,7 @@ TRACE_EVENT_CONDITION(foo_bar_with_cond, ), TP_fast_assign( - __assign_str(foo, foo); + __assign_str(foo); __entry->bar = bar; ), @@ -388,7 +469,7 @@ TRACE_EVENT_FN(foo_bar_with_fn, ), TP_fast_assign( - __assign_str(foo, foo); + __assign_str(foo); __entry->bar = bar; ), @@ -415,7 +496,7 @@ TRACE_EVENT_FN(foo_bar_with_fn, * Note, TRACE_EVENT() itself is simply defined as: * * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \ - * DEFINE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ + * DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ * DEFINE_EVENT(name, name, proto, args) * * The DEFINE_EVENT() also can be declared with conditions and reg functions: @@ -435,7 +516,7 @@ DECLARE_EVENT_CLASS(foo_template, ), TP_fast_assign( - __assign_str(foo, foo); + __assign_str(foo); __entry->bar = bar; ), @@ -444,7 +525,7 @@ DECLARE_EVENT_CLASS(foo_template, /* * Here's a better way for the previous samples (except, the first - * exmaple had more fields and could not be used here). + * example had more fields and could not be used here). */ DEFINE_EVENT(foo_template, foo_with_template_simple, TP_PROTO(const char *foo, int bar), @@ -478,6 +559,42 @@ DEFINE_EVENT_PRINT(foo_template, foo_with_template_print, TP_ARGS(foo, bar), TP_printk("bar %s %d", __get_str(foo), __entry->bar)); +/* + * There are yet another __rel_loc dynamic data attribute. If you + * use __rel_dynamic_array() and __rel_string() etc. macros, you + * can use this attribute. There is no difference from the viewpoint + * of functionality with/without 'rel' but the encoding is a bit + * different. This is expected to be used with user-space event, + * there is no reason that the kernel event use this, but only for + * testing. + */ + +TRACE_EVENT(foo_rel_loc, + + TP_PROTO(const char *foo, int bar, unsigned long *mask, const cpumask_t *cpus), + + TP_ARGS(foo, bar, mask, cpus), + + TP_STRUCT__entry( + __rel_string( foo, foo ) + __field( int, bar ) + __rel_bitmask( bitmask, + BITS_PER_BYTE * sizeof(unsigned long) ) + __rel_cpumask( cpumask ) + ), + + TP_fast_assign( + __assign_rel_str(foo); + __entry->bar = bar; + __assign_rel_bitmask(bitmask, mask, + BITS_PER_BYTE * sizeof(unsigned long)); + __assign_rel_cpumask(cpumask, cpus); + ), + + TP_printk("foo_rel_loc %s, %d, %s, %s", __get_rel_str(foo), __entry->bar, + __get_rel_bitmask(bitmask), + __get_rel_cpumask(cpumask)) +); #endif /***** NOTICE! The #if protection ends here. *****/ diff --git a/samples/trace_events/trace_custom_sched.c b/samples/trace_events/trace_custom_sched.c new file mode 100644 index 000000000000..dd409b704b35 --- /dev/null +++ b/samples/trace_events/trace_custom_sched.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * event tracer + * + * Copyright (C) 2022 Google Inc, Steven Rostedt <rostedt@goodmis.org> + */ + +#define pr_fmt(fmt) fmt + +#include <linux/trace_events.h> +#include <linux/module.h> +#include <linux/sched.h> + +/* + * Must include the event header that the custom event will attach to, + * from the C file, and not in the custom header file. + */ +#include <trace/events/sched.h> + +/* Declare CREATE_CUSTOM_TRACE_EVENTS before including custom header */ +#define CREATE_CUSTOM_TRACE_EVENTS + +#include "trace_custom_sched.h" + +/* + * As the trace events are not exported to modules, the use of + * for_each_kernel_tracepoint() is needed to find the trace event + * to attach to. The fct() function below, is a callback that + * will be called for every event. + * + * Helper functions are created by the TRACE_CUSTOM_EVENT() macro + * update the event. Those are of the form: + * + * trace_custom_event_<event>_update() + * + * Where <event> is the event to attach. + */ +static void fct(struct tracepoint *tp, void *priv) +{ + trace_custom_event_sched_switch_update(tp); + trace_custom_event_sched_waking_update(tp); +} + +static int __init trace_sched_init(void) +{ + for_each_kernel_tracepoint(fct, NULL); + return 0; +} + +static void __exit trace_sched_exit(void) +{ +} + +module_init(trace_sched_init); +module_exit(trace_sched_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("Custom scheduling events"); +MODULE_LICENSE("GPL"); diff --git a/samples/trace_events/trace_custom_sched.h b/samples/trace_events/trace_custom_sched.h new file mode 100644 index 000000000000..951388334a3f --- /dev/null +++ b/samples/trace_events/trace_custom_sched.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Like the headers that use TRACE_EVENT(), the TRACE_CUSTOM_EVENT() + * needs a header that allows for multiple inclusions. + * + * Test for a unique name (here we have _TRACE_CUSTOM_SCHED_H), + * also allowing to continue if TRACE_CUSTOM_MULTI_READ is defined. + */ +#if !defined(_TRACE_CUSTOM_SCHED_H) || defined(TRACE_CUSTOM_MULTI_READ) +#define _TRACE_CUSTOM_SCHED_H + +/* Include linux/trace_events.h for initial defines of TRACE_CUSTOM_EVENT() */ +#include <linux/trace_events.h> + +/* + * TRACE_CUSTOM_EVENT() is just like TRACE_EVENT(). The first parameter + * is the event name of an existing event where the TRACE_EVENT has been included + * in the C file before including this file. + */ +TRACE_CUSTOM_EVENT(sched_switch, + + /* + * The TP_PROTO() and TP_ARGS must match the trace event + * that the custom event is using. + */ + TP_PROTO(bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state), + + TP_ARGS(preempt, prev, next, prev_state), + + /* + * The next fields are where the customization happens. + * The TP_STRUCT__entry() defines what will be recorded + * in the ring buffer when the custom event triggers. + * + * The rest is just like the TRACE_EVENT() macro except that + * it uses the custom entry. + */ + TP_STRUCT__entry( + __field( unsigned short, prev_prio ) + __field( unsigned short, next_prio ) + __field( pid_t, next_pid ) + ), + + TP_fast_assign( + __entry->prev_prio = prev->prio; + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ), + + TP_printk("prev_prio=%d next_pid=%d next_prio=%d", + __entry->prev_prio, __entry->next_pid, __entry->next_prio) +) + + +TRACE_CUSTOM_EVENT(sched_waking, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __field( pid_t, pid ) + __field( unsigned short, prio ) + ), + + TP_fast_assign( + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("pid=%d prio=%d", __entry->pid, __entry->prio) +) +#endif +/* + * Just like the headers that create TRACE_EVENTs, the below must + * be outside the protection of the above #if block. + */ + +/* + * It is required that the Makefile includes: + * CFLAGS_<c_file>.o := -I$(src) + */ +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . + +/* + * It is requred that the TRACE_INCLUDE_FILE be the same + * as this file without the ".h". + */ +#define TRACE_INCLUDE_FILE trace_custom_sched +#include <trace/define_custom_trace.h> diff --git a/samples/trace_printk/Makefile b/samples/trace_printk/Makefile index 19900ab2b00d..c0df36167d60 100644 --- a/samples/trace_printk/Makefile +++ b/samples/trace_printk/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # builds a module that calls various trace_printk routines # then to use one (as root): insmod <module_name.ko> diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c index e9e0040ff7be..cfc159580263 100644 --- a/samples/trace_printk/trace-printk.c +++ b/samples/trace_printk/trace-printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/kthread.h> #include <linux/irq_work.h> @@ -35,6 +36,7 @@ static int __init trace_printk_init(void) /* Kick off printing in irq context */ irq_work_queue(&irqwork); + irq_work_sync(&irqwork); trace_printk("This is a %s that will use trace_bprintk()\n", "static string"); diff --git a/samples/tsm-mr/Makefile b/samples/tsm-mr/Makefile new file mode 100644 index 000000000000..587c3947b3a7 --- /dev/null +++ b/samples/tsm-mr/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_SAMPLE_TSM_MR) += tsm_mr_sample.o diff --git a/samples/tsm-mr/tsm_mr_sample.c b/samples/tsm-mr/tsm_mr_sample.c new file mode 100644 index 000000000000..a2c652148639 --- /dev/null +++ b/samples/tsm-mr/tsm_mr_sample.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024-2005 Intel Corporation. All rights reserved. */ + +#define pr_fmt(x) KBUILD_MODNAME ": " x + +#include <linux/module.h> +#include <linux/tsm-mr.h> +#include <linux/miscdevice.h> +#include <crypto/hash.h> + +static struct { + u8 static_mr[SHA384_DIGEST_SIZE]; + u8 config_mr[SHA512_DIGEST_SIZE]; + u8 rtmr0[SHA256_DIGEST_SIZE]; + u8 rtmr1[SHA384_DIGEST_SIZE]; + u8 report_digest[SHA512_DIGEST_SIZE]; +} sample_report = { + .static_mr = "static_mr", + .config_mr = "config_mr", + .rtmr0 = "rtmr0", + .rtmr1 = "rtmr1", +}; + +static int sample_report_refresh(const struct tsm_measurements *tm) +{ + struct crypto_shash *tfm; + int rc; + + tfm = crypto_alloc_shash(hash_algo_name[HASH_ALGO_SHA512], 0, 0); + if (IS_ERR(tfm)) { + pr_err("crypto_alloc_shash failed: %ld\n", PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + + rc = crypto_shash_tfm_digest(tfm, (u8 *)&sample_report, + offsetof(typeof(sample_report), + report_digest), + sample_report.report_digest); + crypto_free_shash(tfm); + if (rc) + pr_err("crypto_shash_tfm_digest failed: %d\n", rc); + return rc; +} + +static int sample_report_extend_mr(const struct tsm_measurements *tm, + const struct tsm_measurement_register *mr, + const u8 *data) +{ + SHASH_DESC_ON_STACK(desc, 0); + int rc; + + desc->tfm = crypto_alloc_shash(hash_algo_name[mr->mr_hash], 0, 0); + if (IS_ERR(desc->tfm)) { + pr_err("crypto_alloc_shash failed: %ld\n", PTR_ERR(desc->tfm)); + return PTR_ERR(desc->tfm); + } + + rc = crypto_shash_init(desc); + if (!rc) + rc = crypto_shash_update(desc, mr->mr_value, mr->mr_size); + if (!rc) + rc = crypto_shash_finup(desc, data, mr->mr_size, mr->mr_value); + crypto_free_shash(desc->tfm); + if (rc) + pr_err("SHA calculation failed: %d\n", rc); + return rc; +} + +#define MR_(mr, hash) .mr_value = &sample_report.mr, TSM_MR_(mr, hash) +static const struct tsm_measurement_register sample_mrs[] = { + /* static MR, read-only */ + { MR_(static_mr, SHA384) }, + /* config MR, read-only */ + { MR_(config_mr, SHA512) | TSM_MR_F_NOHASH }, + /* RTMR, direct extension prohibited */ + { MR_(rtmr0, SHA256) | TSM_MR_F_LIVE }, + /* RTMR, direct extension allowed */ + { MR_(rtmr1, SHA384) | TSM_MR_F_RTMR }, + /* RTMR, crypto agile, alaised to rtmr0 and rtmr1, respectively */ + { .mr_value = &sample_report.rtmr0, + TSM_MR_(rtmr_crypto_agile, SHA256) | TSM_MR_F_RTMR }, + { .mr_value = &sample_report.rtmr1, + TSM_MR_(rtmr_crypto_agile, SHA384) | TSM_MR_F_RTMR }, + /* sha512 digest of the whole structure */ + { MR_(report_digest, SHA512) | TSM_MR_F_LIVE }, +}; +#undef MR_ + +static struct tsm_measurements sample_tm = { + .mrs = sample_mrs, + .nr_mrs = ARRAY_SIZE(sample_mrs), + .refresh = sample_report_refresh, + .write = sample_report_extend_mr, +}; + +static const struct attribute_group *sample_groups[] = { + NULL, + NULL, +}; + +static struct miscdevice sample_misc_dev = { + .name = KBUILD_MODNAME, + .minor = MISC_DYNAMIC_MINOR, + .groups = sample_groups, +}; + +static int __init tsm_mr_sample_init(void) +{ + int rc; + + sample_groups[0] = tsm_mr_create_attribute_group(&sample_tm); + if (IS_ERR(sample_groups[0])) + return PTR_ERR(sample_groups[0]); + + rc = misc_register(&sample_misc_dev); + if (rc) + tsm_mr_free_attribute_group(sample_groups[0]); + return rc; +} + +static void __exit tsm_mr_sample_exit(void) +{ + misc_deregister(&sample_misc_dev); + tsm_mr_free_attribute_group(sample_groups[0]); +} + +module_init(tsm_mr_sample_init); +module_exit(tsm_mr_sample_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Sample module using tsm-mr to expose emulated MRs"); diff --git a/samples/uhid/.gitignore b/samples/uhid/.gitignore new file mode 100644 index 000000000000..0e0a5a929f5d --- /dev/null +++ b/samples/uhid/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/uhid-example diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile index c95a696560a7..0aa424ec4899 100644 --- a/samples/uhid/Makefile +++ b/samples/uhid/Makefile @@ -1,10 +1,4 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += uhid-example -# List of programs to build -hostprogs-y := uhid-example - -# Tell kbuild to always build the programs -always := $(hostprogs-y) - -HOSTCFLAGS_uhid-example.o += -I$(objtree)/usr/include +userccflags += -I usr/include diff --git a/samples/uhid/uhid-example.c b/samples/uhid/uhid-example.c index 7d58a4b8d324..015cb06a241e 100644 --- a/samples/uhid/uhid-example.c +++ b/samples/uhid/uhid-example.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * UHID Example * @@ -164,7 +165,7 @@ static int uhid_write(int fd, const struct uhid_event *ev) fprintf(stderr, "Cannot write to uhid: %m\n"); return -errno; } else if (ret != sizeof(*ev)) { - fprintf(stderr, "Wrong size written to uhid: %ld != %lu\n", + fprintf(stderr, "Wrong size written to uhid: %zd != %zu\n", ret, sizeof(ev)); return -EFAULT; } else { @@ -235,7 +236,7 @@ static int event(int fd) fprintf(stderr, "Cannot read uhid-cdev: %m\n"); return -errno; } else if (ret != sizeof(ev)) { - fprintf(stderr, "Invalid size read from uhid-dev: %ld != %lu\n", + fprintf(stderr, "Invalid size read from uhid-dev: %zd != %zu\n", ret, sizeof(ev)); return -EFAULT; } diff --git a/samples/user_events/Makefile b/samples/user_events/Makefile new file mode 100644 index 000000000000..7252b589db57 --- /dev/null +++ b/samples/user_events/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -Wl,-no-as-needed -Wall -I../../usr/include + +example: example.o +example.o: example.c diff --git a/samples/user_events/example.c b/samples/user_events/example.c new file mode 100644 index 000000000000..28165a096697 --- /dev/null +++ b/samples/user_events/example.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave <beaub@linux.microsoft.com> + */ + +#include <errno.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/uio.h> +#include <fcntl.h> +#include <stdio.h> +#include <unistd.h> +#include <linux/user_events.h> + +const char *data_file = "/sys/kernel/tracing/user_events_data"; +int enabled = 0; + +static int event_reg(int fd, const char *command, int *write, int *enabled) +{ + struct user_reg reg = {0}; + + reg.size = sizeof(reg); + reg.enable_bit = 31; + reg.enable_size = sizeof(*enabled); + reg.enable_addr = (__u64)enabled; + reg.name_args = (__u64)command; + + if (ioctl(fd, DIAG_IOCSREG, ®) == -1) + return -1; + + *write = reg.write_index; + + return 0; +} + +int main(int argc, char **argv) +{ + int data_fd, write; + struct iovec io[2]; + __u32 count = 0; + + data_fd = open(data_file, O_RDWR); + + if (event_reg(data_fd, "test u32 count", &write, &enabled) == -1) + return errno; + + /* Setup iovec */ + io[0].iov_base = &write; + io[0].iov_len = sizeof(write); + io[1].iov_base = &count; + io[1].iov_len = sizeof(count); +ask: + printf("Press enter to check status...\n"); + getchar(); + + /* Check if anyone is listening */ + if (enabled) { + /* Yep, trace out our data */ + writev(data_fd, (const struct iovec *)io, 2); + + /* Increase the count */ + count++; + + printf("Something was attached, wrote data\n"); + } + + goto ask; + + return 0; +} diff --git a/samples/v4l/Makefile b/samples/v4l/Makefile index 65a351d75c95..f86ab1245810 100644 --- a/samples/v4l/Makefile +++ b/samples/v4l/Makefile @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_VIDEO_PCI_SKELETON) := v4l2-pci-skeleton.o diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c index 93b76c3220fd..69925d30329e 100644 --- a/samples/v4l/v4l2-pci-skeleton.c +++ b/samples/v4l/v4l2-pci-skeleton.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * This is a V4L2 PCI Skeleton Driver. It gives an initial skeleton source * for use with other PCI drivers. @@ -6,19 +7,6 @@ * input 0 and an HDMI connector as input 1. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. - * - * This program is free software; you may redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. */ #include <linux/types.h> @@ -58,6 +46,7 @@ MODULE_LICENSE("GPL v2"); * @queue: vb2 video capture queue * @qlock: spinlock controlling access to buf_list and sequence * @buf_list: list of buffers queued for DMA + * @field: the field (TOP/BOTTOM/other) of the current buffer * @sequence: frame sequence counter */ struct skeleton { @@ -80,13 +69,13 @@ struct skeleton { }; struct skel_buffer { - struct vb2_buffer vb; + struct vb2_v4l2_buffer vb; struct list_head list; }; -static inline struct skel_buffer *to_skel_buffer(struct vb2_buffer *vb2) +static inline struct skel_buffer *to_skel_buffer(struct vb2_v4l2_buffer *vbuf) { - return container_of(vb2, struct skel_buffer, vb); + return container_of(vbuf, struct skel_buffer, vb); } static const struct pci_device_id skeleton_pci_tbl[] = { @@ -139,16 +128,16 @@ static irqreturn_t skeleton_irq(int irq, void *dev_id) spin_lock(&skel->qlock); list_del(&new_buf->list); spin_unlock(&skel->qlock); - v4l2_get_timestamp(&new_buf->vb.v4l2_buf.timestamp); - new_buf->vb.v4l2_buf.sequence = skel->sequence++; - new_buf->vb.v4l2_buf.field = skel->field; + new_buf->vb.vb2_buf.timestamp = ktime_get_ns(); + new_buf->vb.sequence = skel->sequence++; + new_buf->vb.field = skel->field; if (skel->format.field == V4L2_FIELD_ALTERNATE) { if (skel->field == V4L2_FIELD_BOTTOM) skel->field = V4L2_FIELD_TOP; else if (skel->field == V4L2_FIELD_TOP) skel->field = V4L2_FIELD_BOTTOM; } - vb2_buffer_done(&new_buf->vb, VB2_BUF_STATE_DONE); + vb2_buffer_done(&new_buf->vb.vb2_buf, VB2_BUF_STATE_DONE); } #endif return IRQ_HANDLED; @@ -166,6 +155,7 @@ static int queue_setup(struct vb2_queue *vq, unsigned int sizes[], struct device *alloc_devs[]) { struct skeleton *skel = vb2_get_drv_priv(vq); + unsigned int q_num_bufs = vb2_get_num_buffers(vq); skel->field = skel->format.field; if (skel->field == V4L2_FIELD_ALTERNATE) { @@ -178,8 +168,8 @@ static int queue_setup(struct vb2_queue *vq, skel->field = V4L2_FIELD_TOP; } - if (vq->num_buffers + *nbuffers < 3) - *nbuffers = 3 - vq->num_buffers; + if (q_num_bufs + *nbuffers < 3) + *nbuffers = 3 - q_num_bufs; if (*nplanes) return sizes[0] < skel->format.sizeimage ? -EINVAL : 0; @@ -212,8 +202,9 @@ static int buffer_prepare(struct vb2_buffer *vb) */ static void buffer_queue(struct vb2_buffer *vb) { + struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue); - struct skel_buffer *buf = to_skel_buffer(vb); + struct skel_buffer *buf = to_skel_buffer(vbuf); unsigned long flags; spin_lock_irqsave(&skel->qlock, flags); @@ -232,7 +223,7 @@ static void return_all_buffers(struct skeleton *skel, spin_lock_irqsave(&skel->qlock, flags); list_for_each_entry_safe(buf, node, &skel->buf_list, list) { - vb2_buffer_done(&buf->vb, state); + vb2_buffer_done(&buf->vb.vb2_buf, state); list_del(&buf->list); } spin_unlock_irqrestore(&skel->qlock, flags); @@ -278,18 +269,14 @@ static void stop_streaming(struct vb2_queue *vq) } /* - * The vb2 queue ops. Note that since q->lock is set we can use the standard - * vb2_ops_wait_prepare/finish helper functions. If q->lock would be NULL, - * then this driver would have to provide these ops. + * The vb2 queue ops. */ -static struct vb2_ops skel_qops = { +static const struct vb2_ops skel_qops = { .queue_setup = queue_setup, .buf_prepare = buffer_prepare, .buf_queue = buffer_queue, .start_streaming = start_streaming, .stop_streaming = stop_streaming, - .wait_prepare = vb2_ops_wait_prepare, - .wait_finish = vb2_ops_wait_finish, }; /* @@ -301,8 +288,8 @@ static int skeleton_querycap(struct file *file, void *priv, { struct skeleton *skel = video_drvdata(file); - strlcpy(cap->driver, KBUILD_MODNAME, sizeof(cap->driver)); - strlcpy(cap->card, "V4L2 PCI Skeleton", sizeof(cap->card)); + strscpy(cap->driver, KBUILD_MODNAME, sizeof(cap->driver)); + strscpy(cap->card, "V4L2 PCI Skeleton", sizeof(cap->card)); snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s", pci_name(skel->pdev)); return 0; @@ -483,7 +470,7 @@ static int skeleton_querystd(struct file *file, void *priv, v4l2_std_id *std) return 0; } -static int skeleton_s_dv_timings(struct file *file, void *_fh, +static int skeleton_s_dv_timings(struct file *file, void *priv, struct v4l2_dv_timings *timings) { struct skeleton *skel = video_drvdata(file); @@ -522,7 +509,7 @@ static int skeleton_s_dv_timings(struct file *file, void *_fh, return 0; } -static int skeleton_g_dv_timings(struct file *file, void *_fh, +static int skeleton_g_dv_timings(struct file *file, void *priv, struct v4l2_dv_timings *timings) { struct skeleton *skel = video_drvdata(file); @@ -535,7 +522,7 @@ static int skeleton_g_dv_timings(struct file *file, void *_fh, return 0; } -static int skeleton_enum_dv_timings(struct file *file, void *_fh, +static int skeleton_enum_dv_timings(struct file *file, void *priv, struct v4l2_enum_dv_timings *timings) { struct skeleton *skel = video_drvdata(file); @@ -557,7 +544,7 @@ static int skeleton_enum_dv_timings(struct file *file, void *_fh, * can lock but that the DMA engine it is connected to cannot handle * pixelclocks above a certain frequency), then -ERANGE is returned. */ -static int skeleton_query_dv_timings(struct file *file, void *_fh, +static int skeleton_query_dv_timings(struct file *file, void *priv, struct v4l2_dv_timings *timings) { struct skeleton *skel = video_drvdata(file); @@ -586,7 +573,7 @@ static int skeleton_query_dv_timings(struct file *file, void *_fh, return 0; } -static int skeleton_dv_timings_cap(struct file *file, void *fh, +static int skeleton_dv_timings_cap(struct file *file, void *priv, struct v4l2_dv_timings_cap *cap) { struct skeleton *skel = video_drvdata(file); @@ -607,11 +594,11 @@ static int skeleton_enum_input(struct file *file, void *priv, i->type = V4L2_INPUT_TYPE_CAMERA; if (i->index == 0) { i->std = SKEL_TVNORMS; - strlcpy(i->name, "S-Video", sizeof(i->name)); + strscpy(i->name, "S-Video", sizeof(i->name)); i->capabilities = V4L2_IN_CAP_STD; } else { i->std = 0; - strlcpy(i->name, "HDMI", sizeof(i->name)); + strscpy(i->name, "HDMI", sizeof(i->name)); i->capabilities = V4L2_IN_CAP_DV_TIMINGS; } return 0; @@ -764,7 +751,7 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ret = pci_enable_device(pdev); if (ret) return ret; - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); if (ret) { dev_err(&pdev->dev, "no suitable DMA available.\n"); goto disable_pci; @@ -772,8 +759,10 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Allocate a new instance */ skel = devm_kzalloc(&pdev->dev, sizeof(struct skeleton), GFP_KERNEL); - if (!skel) - return -ENOMEM; + if (!skel) { + ret = -ENOMEM; + goto disable_pci; + } /* Allocate the interrupt */ ret = devm_request_irq(&pdev->dev, pdev->irq, @@ -828,7 +817,7 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * available before it can be started. The start_streaming() op * won't be called until at least this many buffers are queued up. */ - q->min_buffers_needed = 2; + q->min_queued_buffers = 2; /* * The serialization lock for the streaming ioctls. This is the same * as the main serialization lock, but if some of the non-streaming @@ -853,7 +842,7 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Initialize the video_device structure */ vdev = &skel->vdev; - strlcpy(vdev->name, KBUILD_MODNAME, sizeof(vdev->name)); + strscpy(vdev->name, KBUILD_MODNAME, sizeof(vdev->name)); /* * There is nothing to clean up, so release is set to an empty release * function. The release callback must be non-NULL. @@ -875,7 +864,7 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) vdev->tvnorms = SKEL_TVNORMS; video_set_drvdata(vdev, skel); - ret = video_register_device(vdev, VFL_TYPE_GRABBER, -1); + ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); if (ret) goto free_hdl; diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile index cbbd868a50a8..10d179c4fdeb 100644 --- a/samples/vfio-mdev/Makefile +++ b/samples/vfio-mdev/Makefile @@ -1 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o +obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o diff --git a/samples/vfio-mdev/README.rst b/samples/vfio-mdev/README.rst new file mode 100644 index 000000000000..b52eb37739c0 --- /dev/null +++ b/samples/vfio-mdev/README.rst @@ -0,0 +1,100 @@ +Using the mtty vfio-mdev sample code +==================================== + +mtty is a sample vfio-mdev driver that demonstrates how to use the mediated +device framework. + +The sample driver creates an mdev device that simulates a serial port over a PCI +card. + +1. Build and load the mtty.ko module. + + This step creates a dummy device, /sys/devices/virtual/mtty/mtty/ + + Files in this device directory in sysfs are similar to the following:: + + # tree /sys/devices/virtual/mtty/mtty/ + /sys/devices/virtual/mtty/mtty/ + |-- mdev_supported_types + | |-- mtty-1 + | | |-- available_instances + | | |-- create + | | |-- device_api + | | |-- devices + | | `-- name + | `-- mtty-2 + | |-- available_instances + | |-- create + | |-- device_api + | |-- devices + | `-- name + |-- mtty_dev + | `-- sample_mtty_dev + |-- power + | |-- autosuspend_delay_ms + | |-- control + | |-- runtime_active_time + | |-- runtime_status + | `-- runtime_suspended_time + |-- subsystem -> ../../../../class/mtty + `-- uevent + +2. Create a mediated device by using the dummy device that you created in the + previous step:: + + # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \ + /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create + +3. Add parameters to qemu-kvm:: + + -device vfio-pci,\ + sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001 + +4. Boot the VM. + + In the Linux guest VM, with no hardware on the host, the device appears + as follows:: + + # lspci -s 00:05.0 -xxvv + 00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550]) + Subsystem: Device 4348:3253 + Physical Slot: 5 + Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- + Stepping- SERR- FastB2B- DisINTx- + Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort- + <TAbort- <MAbort- >SERR- <PERR- INTx- + Interrupt: pin A routed to IRQ 10 + Region 0: I/O ports at c150 [size=8] + Region 1: I/O ports at c158 [size=8] + Kernel driver in use: serial + 00: 48 43 53 32 01 00 00 02 10 02 00 07 00 00 00 00 + 10: 51 c1 00 00 59 c1 00 00 00 00 00 00 00 00 00 00 + 20: 00 00 00 00 00 00 00 00 00 00 00 00 48 43 53 32 + 30: 00 00 00 00 00 00 00 00 00 00 00 00 0a 01 00 00 + + In the Linux guest VM, dmesg output for the device is as follows: + + serial 0000:00:05.0: PCI INT A -> Link[LNKA] -> GSI 10 (level, high) -> IRQ 10 + 0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A + 0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A + + +5. In the Linux guest VM, check the serial ports:: + + # setserial -g /dev/ttyS* + /dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4 + /dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10 + /dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10 + +6. Using minicom or any terminal emulation program, open port /dev/ttyS1 or + /dev/ttyS2 with hardware flow control disabled. + +7. Type data on the minicom terminal or send data to the terminal emulation + program and read the data. + + Data is loop backed from hosts mtty driver. + +8. Destroy the mediated device that you created:: + + # echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove + diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c new file mode 100644 index 000000000000..64ea19253ee3 --- /dev/null +++ b/samples/vfio-mdev/mbochs.c @@ -0,0 +1,1451 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Mediated virtual PCI display host device driver + * + * Emulate enough of qemu stdvga to make bochs-drm.ko happy. That is + * basically the vram memory bar and the bochs dispi interface vbe + * registers in the mmio register bar. Specifically it does *not* + * include any legacy vga stuff. Device looks a lot like "qemu -device + * secondary-vga". + * + * (c) Gerd Hoffmann <kraxel@redhat.com> + * + * based on mtty driver which is: + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/cdev.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/sysfs.h> +#include <linux/mdev.h> +#include <linux/pci.h> +#include <linux/dma-buf.h> +#include <linux/highmem.h> +#include <drm/drm_fourcc.h> +#include <drm/drm_rect.h> +#include <drm/drm_modeset_lock.h> +#include <drm/drm_property.h> +#include <drm/drm_plane.h> + + +#define VBE_DISPI_INDEX_ID 0x0 +#define VBE_DISPI_INDEX_XRES 0x1 +#define VBE_DISPI_INDEX_YRES 0x2 +#define VBE_DISPI_INDEX_BPP 0x3 +#define VBE_DISPI_INDEX_ENABLE 0x4 +#define VBE_DISPI_INDEX_BANK 0x5 +#define VBE_DISPI_INDEX_VIRT_WIDTH 0x6 +#define VBE_DISPI_INDEX_VIRT_HEIGHT 0x7 +#define VBE_DISPI_INDEX_X_OFFSET 0x8 +#define VBE_DISPI_INDEX_Y_OFFSET 0x9 +#define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa +#define VBE_DISPI_INDEX_COUNT 0xb + +#define VBE_DISPI_ID0 0xB0C0 +#define VBE_DISPI_ID1 0xB0C1 +#define VBE_DISPI_ID2 0xB0C2 +#define VBE_DISPI_ID3 0xB0C3 +#define VBE_DISPI_ID4 0xB0C4 +#define VBE_DISPI_ID5 0xB0C5 + +#define VBE_DISPI_DISABLED 0x00 +#define VBE_DISPI_ENABLED 0x01 +#define VBE_DISPI_GETCAPS 0x02 +#define VBE_DISPI_8BIT_DAC 0x20 +#define VBE_DISPI_LFB_ENABLED 0x40 +#define VBE_DISPI_NOCLEARMEM 0x80 + + +#define MBOCHS_NAME "mbochs" +#define MBOCHS_CLASS_NAME "mbochs" + +#define MBOCHS_EDID_REGION_INDEX VFIO_PCI_NUM_REGIONS +#define MBOCHS_NUM_REGIONS (MBOCHS_EDID_REGION_INDEX+1) + +#define MBOCHS_CONFIG_SPACE_SIZE 0xff +#define MBOCHS_MMIO_BAR_OFFSET PAGE_SIZE +#define MBOCHS_MMIO_BAR_SIZE PAGE_SIZE +#define MBOCHS_EDID_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \ + MBOCHS_MMIO_BAR_SIZE) +#define MBOCHS_EDID_SIZE PAGE_SIZE +#define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_EDID_OFFSET + \ + MBOCHS_EDID_SIZE) + +#define MBOCHS_EDID_BLOB_OFFSET (MBOCHS_EDID_SIZE/2) + +#define STORE_LE16(addr, val) (*(u16 *)addr = val) +#define STORE_LE32(addr, val) (*(u32 *)addr = val) + + +MODULE_DESCRIPTION("Mediated virtual PCI display host device driver"); +MODULE_LICENSE("GPL v2"); + +static int max_mbytes = 256; +module_param_named(count, max_mbytes, int, 0444); +MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices"); + + +#define MBOCHS_TYPE_1 "small" +#define MBOCHS_TYPE_2 "medium" +#define MBOCHS_TYPE_3 "large" + +static struct mbochs_type { + struct mdev_type type; + u32 mbytes; + u32 max_x; + u32 max_y; +} mbochs_types[] = { + { + .type.sysfs_name = MBOCHS_TYPE_1, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, + .mbytes = 4, + .max_x = 800, + .max_y = 600, + }, { + .type.sysfs_name = MBOCHS_TYPE_2, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, + .mbytes = 16, + .max_x = 1920, + .max_y = 1440, + }, { + .type.sysfs_name = MBOCHS_TYPE_3, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, + .mbytes = 64, + .max_x = 0, + .max_y = 0, + }, +}; + +static struct mdev_type *mbochs_mdev_types[] = { + &mbochs_types[0].type, + &mbochs_types[1].type, + &mbochs_types[2].type, +}; + +static dev_t mbochs_devt; +static const struct class mbochs_class = { + .name = MBOCHS_CLASS_NAME, +}; +static struct cdev mbochs_cdev; +static struct device mbochs_dev; +static struct mdev_parent mbochs_parent; +static atomic_t mbochs_avail_mbytes; +static const struct vfio_device_ops mbochs_dev_ops; + +struct mbochs_mode { + u32 drm_format; + u32 bytepp; + u32 width; + u32 height; + u32 stride; + u32 __pad; + u64 offset; + u64 size; +}; + +struct mbochs_dmabuf { + struct mbochs_mode mode; + u32 id; + struct page **pages; + pgoff_t pagecount; + struct dma_buf *buf; + struct mdev_state *mdev_state; + struct list_head next; + bool unlinked; +}; + +/* State of each mdev device */ +struct mdev_state { + struct vfio_device vdev; + u8 *vconfig; + u64 bar_mask[3]; + u32 memory_bar_mask; + struct mutex ops_lock; + struct mdev_device *mdev; + + const struct mbochs_type *type; + u16 vbe[VBE_DISPI_INDEX_COUNT]; + u64 memsize; + struct page **pages; + pgoff_t pagecount; + struct vfio_region_gfx_edid edid_regs; + u8 edid_blob[0x400]; + + struct list_head dmabufs; + u32 active_id; + u32 next_id; +}; + +static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = { + [VBE_DISPI_INDEX_ID] = "id", + [VBE_DISPI_INDEX_XRES] = "xres", + [VBE_DISPI_INDEX_YRES] = "yres", + [VBE_DISPI_INDEX_BPP] = "bpp", + [VBE_DISPI_INDEX_ENABLE] = "enable", + [VBE_DISPI_INDEX_BANK] = "bank", + [VBE_DISPI_INDEX_VIRT_WIDTH] = "virt-width", + [VBE_DISPI_INDEX_VIRT_HEIGHT] = "virt-height", + [VBE_DISPI_INDEX_X_OFFSET] = "x-offset", + [VBE_DISPI_INDEX_Y_OFFSET] = "y-offset", + [VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = "video-mem", +}; + +static const char *vbe_name(u32 index) +{ + if (index < ARRAY_SIZE(vbe_name_list)) + return vbe_name_list[index]; + return "(invalid)"; +} + +static struct page *__mbochs_get_page(struct mdev_state *mdev_state, + pgoff_t pgoff); +static struct page *mbochs_get_page(struct mdev_state *mdev_state, + pgoff_t pgoff); + +static void mbochs_create_config_space(struct mdev_state *mdev_state) +{ + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], + 0x1234); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID], + 0x1111); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID], + PCI_SUBVENDOR_ID_REDHAT_QUMRANET); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID], + PCI_SUBDEVICE_ID_QEMU); + + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND], + PCI_COMMAND_IO | PCI_COMMAND_MEMORY); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE], + PCI_CLASS_DISPLAY_OTHER); + mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01; + + STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0], + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_32 | + PCI_BASE_ADDRESS_MEM_PREFETCH); + mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1; + + STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_2], + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_32); + mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1; +} + +static int mbochs_check_framebuffer(struct mdev_state *mdev_state, + struct mbochs_mode *mode) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + u16 *vbe = mdev_state->vbe; + u32 virt_width; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + if (!(vbe[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED)) + goto nofb; + + memset(mode, 0, sizeof(*mode)); + switch (vbe[VBE_DISPI_INDEX_BPP]) { + case 32: + mode->drm_format = DRM_FORMAT_XRGB8888; + mode->bytepp = 4; + break; + default: + dev_info_ratelimited(dev, "%s: bpp %d not supported\n", + __func__, vbe[VBE_DISPI_INDEX_BPP]); + goto nofb; + } + + mode->width = vbe[VBE_DISPI_INDEX_XRES]; + mode->height = vbe[VBE_DISPI_INDEX_YRES]; + virt_width = vbe[VBE_DISPI_INDEX_VIRT_WIDTH]; + if (virt_width < mode->width) + virt_width = mode->width; + mode->stride = virt_width * mode->bytepp; + mode->size = (u64)mode->stride * mode->height; + mode->offset = ((u64)vbe[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp + + (u64)vbe[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride); + + if (mode->width < 64 || mode->height < 64) { + dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n", + __func__, mode->width, mode->height); + goto nofb; + } + if (mode->offset + mode->size > mdev_state->memsize) { + dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n", + __func__); + goto nofb; + } + + return 0; + +nofb: + memset(mode, 0, sizeof(*mode)); + return -EINVAL; +} + +static bool mbochs_modes_equal(struct mbochs_mode *mode1, + struct mbochs_mode *mode2) +{ + return memcmp(mode1, mode2, sizeof(struct mbochs_mode)) == 0; +} + +static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + int index = (offset - PCI_BASE_ADDRESS_0) / 0x04; + u32 cfg_addr; + + switch (offset) { + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_2: + cfg_addr = *(u32 *)buf; + + if (cfg_addr == 0xffffffff) { + cfg_addr = (cfg_addr & mdev_state->bar_mask[index]); + } else { + cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK; + if (cfg_addr) + dev_info(dev, "BAR #%d @ 0x%x\n", + index, cfg_addr); + } + + cfg_addr |= (mdev_state->vconfig[offset] & + ~PCI_BASE_ADDRESS_MEM_MASK); + STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); + break; + } +} + +static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + int index; + u16 reg16; + + switch (offset) { + case 0x400 ... 0x41f: /* vga ioports remapped */ + goto unhandled; + case 0x500 ... 0x515: /* bochs dispi interface */ + if (count != 2) + goto unhandled; + index = (offset - 0x500) / 2; + reg16 = *(u16 *)buf; + if (index < ARRAY_SIZE(mdev_state->vbe)) + mdev_state->vbe[index] = reg16; + dev_dbg(dev, "%s: vbe write %d = %d (%s)\n", + __func__, index, reg16, vbe_name(index)); + break; + case 0x600 ... 0x607: /* qemu extended regs */ + goto unhandled; + default: +unhandled: + dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n", + __func__, offset, count); + break; + } +} + +static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + struct vfio_region_gfx_edid *edid; + u16 reg16 = 0; + int index; + + switch (offset) { + case 0x000 ... 0x3ff: /* edid block */ + edid = &mdev_state->edid_regs; + if (edid->link_state != VFIO_DEVICE_GFX_LINK_STATE_UP || + offset >= edid->edid_size) { + memset(buf, 0, count); + break; + } + memcpy(buf, mdev_state->edid_blob + offset, count); + break; + case 0x500 ... 0x515: /* bochs dispi interface */ + if (count != 2) + goto unhandled; + index = (offset - 0x500) / 2; + if (index < ARRAY_SIZE(mdev_state->vbe)) + reg16 = mdev_state->vbe[index]; + dev_dbg(dev, "%s: vbe read %d = %d (%s)\n", + __func__, index, reg16, vbe_name(index)); + *(u16 *)buf = reg16; + break; + default: +unhandled: + dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n", + __func__, offset, count); + memset(buf, 0, count); + break; + } +} + +static void handle_edid_regs(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count, bool is_write) +{ + char *regs = (void *)&mdev_state->edid_regs; + + if (offset + count > sizeof(mdev_state->edid_regs)) + return; + if (count != 4) + return; + if (offset % 4) + return; + + if (is_write) { + switch (offset) { + case offsetof(struct vfio_region_gfx_edid, link_state): + case offsetof(struct vfio_region_gfx_edid, edid_size): + memcpy(regs + offset, buf, count); + break; + default: + /* read-only regs */ + break; + } + } else { + memcpy(buf, regs + offset, count); + } +} + +static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count, bool is_write) +{ + if (offset + count > mdev_state->edid_regs.edid_max_size) + return; + if (is_write) + memcpy(mdev_state->edid_blob + offset, buf, count); + else + memcpy(buf, mdev_state->edid_blob + offset, count); +} + +static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf, + size_t count, loff_t pos, bool is_write) +{ + struct page *pg; + loff_t poff; + char *map; + int ret = 0; + + mutex_lock(&mdev_state->ops_lock); + + if (pos < MBOCHS_CONFIG_SPACE_SIZE) { + if (is_write) + handle_pci_cfg_write(mdev_state, pos, buf, count); + else + memcpy(buf, (mdev_state->vconfig + pos), count); + + } else if (pos >= MBOCHS_MMIO_BAR_OFFSET && + pos + count <= (MBOCHS_MMIO_BAR_OFFSET + + MBOCHS_MMIO_BAR_SIZE)) { + pos -= MBOCHS_MMIO_BAR_OFFSET; + if (is_write) + handle_mmio_write(mdev_state, pos, buf, count); + else + handle_mmio_read(mdev_state, pos, buf, count); + + } else if (pos >= MBOCHS_EDID_OFFSET && + pos + count <= (MBOCHS_EDID_OFFSET + + MBOCHS_EDID_SIZE)) { + pos -= MBOCHS_EDID_OFFSET; + if (pos < MBOCHS_EDID_BLOB_OFFSET) { + handle_edid_regs(mdev_state, pos, buf, count, is_write); + } else { + pos -= MBOCHS_EDID_BLOB_OFFSET; + handle_edid_blob(mdev_state, pos, buf, count, is_write); + } + + } else if (pos >= MBOCHS_MEMORY_BAR_OFFSET && + pos + count <= + MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) { + pos -= MBOCHS_MMIO_BAR_OFFSET; + poff = pos & ~PAGE_MASK; + pg = __mbochs_get_page(mdev_state, pos >> PAGE_SHIFT); + map = kmap(pg); + if (is_write) + memcpy(map + poff, buf, count); + else + memcpy(buf, map + poff, count); + kunmap(pg); + put_page(pg); + + } else { + dev_dbg(mdev_state->vdev.dev, "%s: %s @0x%llx (unhandled)\n", + __func__, is_write ? "WR" : "RD", pos); + ret = -1; + goto accessfailed; + } + + ret = count; + + +accessfailed: + mutex_unlock(&mdev_state->ops_lock); + + return ret; +} + +static int mbochs_reset(struct mdev_state *mdev_state) +{ + u32 size64k = mdev_state->memsize / (64 * 1024); + int i; + + for (i = 0; i < ARRAY_SIZE(mdev_state->vbe); i++) + mdev_state->vbe[i] = 0; + mdev_state->vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5; + mdev_state->vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = size64k; + return 0; +} + +static int mbochs_init_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); + struct mbochs_type *type = + container_of(mdev->type, struct mbochs_type, type); + int avail_mbytes = atomic_read(&mbochs_avail_mbytes); + int ret = -ENOMEM; + + do { + if (avail_mbytes < type->mbytes) + return -ENOSPC; + } while (!atomic_try_cmpxchg(&mbochs_avail_mbytes, &avail_mbytes, + avail_mbytes - type->mbytes)); + + mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); + if (!mdev_state->vconfig) + goto err_avail; + + mdev_state->memsize = type->mbytes * 1024 * 1024; + mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT; + mdev_state->pages = kcalloc(mdev_state->pagecount, + sizeof(struct page *), + GFP_KERNEL); + if (!mdev_state->pages) + goto err_vconfig; + + mutex_init(&mdev_state->ops_lock); + mdev_state->mdev = mdev; + INIT_LIST_HEAD(&mdev_state->dmabufs); + mdev_state->next_id = 1; + + mdev_state->type = type; + mdev_state->edid_regs.max_xres = type->max_x; + mdev_state->edid_regs.max_yres = type->max_y; + mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET; + mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob); + mbochs_create_config_space(mdev_state); + mbochs_reset(mdev_state); + + dev_info(vdev->dev, "%s: %s, %d MB, %ld pages\n", __func__, + type->type.pretty_name, type->mbytes, mdev_state->pagecount); + return 0; + +err_vconfig: + kfree(mdev_state->vconfig); +err_avail: + atomic_add(type->mbytes, &mbochs_avail_mbytes); + return ret; +} + +static int mbochs_probe(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + int ret = -ENOMEM; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mbochs_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); + + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); + if (ret) + goto err_put_vdev; + dev_set_drvdata(&mdev->dev, mdev_state); + return 0; + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); + return ret; +} + +static void mbochs_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); + kfree(mdev_state->pages); + kfree(mdev_state->vconfig); +} + +static void mbochs_remove(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + + vfio_unregister_group_dev(&mdev_state->vdev); + vfio_put_device(&mdev_state->vdev); +} + +static ssize_t mbochs_read(struct vfio_device *vdev, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; + +read_err: + return -EFAULT; +} + +static ssize_t mbochs_write(struct vfio_device *vdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 1; + } + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; +write_err: + return -EFAULT; +} + +static struct page *__mbochs_get_page(struct mdev_state *mdev_state, + pgoff_t pgoff) +{ + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + if (!mdev_state->pages[pgoff]) { + mdev_state->pages[pgoff] = + alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0); + if (!mdev_state->pages[pgoff]) + return NULL; + } + + get_page(mdev_state->pages[pgoff]); + return mdev_state->pages[pgoff]; +} + +static struct page *mbochs_get_page(struct mdev_state *mdev_state, + pgoff_t pgoff) +{ + struct page *page; + + if (WARN_ON(pgoff >= mdev_state->pagecount)) + return NULL; + + mutex_lock(&mdev_state->ops_lock); + page = __mbochs_get_page(mdev_state, pgoff); + mutex_unlock(&mdev_state->ops_lock); + + return page; +} + +static void mbochs_put_pages(struct mdev_state *mdev_state) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + int i, count = 0; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + for (i = 0; i < mdev_state->pagecount; i++) { + if (!mdev_state->pages[i]) + continue; + put_page(mdev_state->pages[i]); + mdev_state->pages[i] = NULL; + count++; + } + dev_dbg(dev, "%s: %d pages released\n", __func__, count); +} + +static vm_fault_t mbochs_region_vm_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct mdev_state *mdev_state = vma->vm_private_data; + pgoff_t page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; + + if (page_offset >= mdev_state->pagecount) + return VM_FAULT_SIGBUS; + + vmf->page = mbochs_get_page(mdev_state, page_offset); + if (!vmf->page) + return VM_FAULT_SIGBUS; + + return 0; +} + +static const struct vm_operations_struct mbochs_region_vm_ops = { + .fault = mbochs_region_vm_fault, +}; + +static int mbochs_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT) + return -EINVAL; + if (vma->vm_end < vma->vm_start) + return -EINVAL; + if (vma->vm_end - vma->vm_start > mdev_state->memsize) + return -EINVAL; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + + vma->vm_ops = &mbochs_region_vm_ops; + vma->vm_private_data = mdev_state; + return 0; +} + +static vm_fault_t mbochs_dmabuf_vm_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct mbochs_dmabuf *dmabuf = vma->vm_private_data; + + if (WARN_ON(vmf->pgoff >= dmabuf->pagecount)) + return VM_FAULT_SIGBUS; + + vmf->page = dmabuf->pages[vmf->pgoff]; + get_page(vmf->page); + return 0; +} + +static const struct vm_operations_struct mbochs_dmabuf_vm_ops = { + .fault = mbochs_dmabuf_vm_fault, +}; + +static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma) +{ + struct mbochs_dmabuf *dmabuf = buf->priv; + struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); + + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + + vma->vm_ops = &mbochs_dmabuf_vm_ops; + vma->vm_private_data = dmabuf; + return 0; +} + +static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf, + const char *prefix) +{ + struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); + u32 fourcc = dmabuf->mode.drm_format; + + dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n", + prefix, dmabuf->id, + fourcc ? ((fourcc >> 0) & 0xff) : '-', + fourcc ? ((fourcc >> 8) & 0xff) : '-', + fourcc ? ((fourcc >> 16) & 0xff) : '-', + fourcc ? ((fourcc >> 24) & 0xff) : '-', + dmabuf->mode.width, dmabuf->mode.height, dmabuf->mode.stride, + dmabuf->mode.offset, dmabuf->mode.size, dmabuf->pagecount); +} + +static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at, + enum dma_data_direction direction) +{ + struct mbochs_dmabuf *dmabuf = at->dmabuf->priv; + struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); + struct sg_table *sg; + + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + + sg = kzalloc(sizeof(*sg), GFP_KERNEL); + if (!sg) + goto err1; + if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount, + 0, dmabuf->mode.size, GFP_KERNEL) < 0) + goto err2; + if (dma_map_sgtable(at->dev, sg, direction, 0)) + goto err3; + + return sg; + +err3: + sg_free_table(sg); +err2: + kfree(sg); +err1: + return ERR_PTR(-ENOMEM); +} + +static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at, + struct sg_table *sg, + enum dma_data_direction direction) +{ + struct mbochs_dmabuf *dmabuf = at->dmabuf->priv; + struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); + + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + + dma_unmap_sgtable(at->dev, sg, direction, 0); + sg_free_table(sg); + kfree(sg); +} + +static void mbochs_release_dmabuf(struct dma_buf *buf) +{ + struct mbochs_dmabuf *dmabuf = buf->priv; + struct mdev_state *mdev_state = dmabuf->mdev_state; + struct device *dev = mdev_dev(mdev_state->mdev); + pgoff_t pg; + + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + + for (pg = 0; pg < dmabuf->pagecount; pg++) + put_page(dmabuf->pages[pg]); + + mutex_lock(&mdev_state->ops_lock); + dmabuf->buf = NULL; + if (dmabuf->unlinked) + kfree(dmabuf); + mutex_unlock(&mdev_state->ops_lock); +} + +static struct dma_buf_ops mbochs_dmabuf_ops = { + .map_dma_buf = mbochs_map_dmabuf, + .unmap_dma_buf = mbochs_unmap_dmabuf, + .release = mbochs_release_dmabuf, + .mmap = mbochs_mmap_dmabuf, +}; + +static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state, + struct mbochs_mode *mode) +{ + struct mbochs_dmabuf *dmabuf; + pgoff_t page_offset, pg; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL); + if (!dmabuf) + return NULL; + + dmabuf->mode = *mode; + dmabuf->id = mdev_state->next_id++; + dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE); + dmabuf->pages = kcalloc(dmabuf->pagecount, sizeof(struct page *), + GFP_KERNEL); + if (!dmabuf->pages) + goto err_free_dmabuf; + + page_offset = dmabuf->mode.offset >> PAGE_SHIFT; + for (pg = 0; pg < dmabuf->pagecount; pg++) { + dmabuf->pages[pg] = __mbochs_get_page(mdev_state, + page_offset + pg); + if (!dmabuf->pages[pg]) + goto err_free_pages; + } + + dmabuf->mdev_state = mdev_state; + list_add(&dmabuf->next, &mdev_state->dmabufs); + + mbochs_print_dmabuf(dmabuf, __func__); + return dmabuf; + +err_free_pages: + while (pg > 0) + put_page(dmabuf->pages[--pg]); + kfree(dmabuf->pages); +err_free_dmabuf: + kfree(dmabuf); + return NULL; +} + +static struct mbochs_dmabuf * +mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state, + struct mbochs_mode *mode) +{ + struct mbochs_dmabuf *dmabuf; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + list_for_each_entry(dmabuf, &mdev_state->dmabufs, next) + if (mbochs_modes_equal(&dmabuf->mode, mode)) + return dmabuf; + + return NULL; +} + +static struct mbochs_dmabuf * +mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id) +{ + struct mbochs_dmabuf *dmabuf; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + list_for_each_entry(dmabuf, &mdev_state->dmabufs, next) + if (dmabuf->id == id) + return dmabuf; + + return NULL; +} + +static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) +{ + struct mdev_state *mdev_state = dmabuf->mdev_state; + struct device *dev = mdev_state->vdev.dev; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct dma_buf *buf; + + WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); + + if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) { + dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n", + __func__); + return -EINVAL; + } + + exp_info.ops = &mbochs_dmabuf_ops; + exp_info.size = dmabuf->mode.size; + exp_info.priv = dmabuf; + + buf = dma_buf_export(&exp_info); + if (IS_ERR(buf)) { + dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n", + __func__, PTR_ERR(buf)); + return PTR_ERR(buf); + } + + dmabuf->buf = buf; + dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); + return 0; +} + +static int mbochs_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + if (region_info->index >= MBOCHS_NUM_REGIONS) + return -EINVAL; + + switch (region_info->index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + region_info->offset = 0; + region_info->size = MBOCHS_CONFIG_SPACE_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE); + break; + case VFIO_PCI_BAR0_REGION_INDEX: + region_info->offset = MBOCHS_MEMORY_BAR_OFFSET; + region_info->size = mdev_state->memsize; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP); + break; + case VFIO_PCI_BAR2_REGION_INDEX: + region_info->offset = MBOCHS_MMIO_BAR_OFFSET; + region_info->size = MBOCHS_MMIO_BAR_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE); + break; + case MBOCHS_EDID_REGION_INDEX: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1, + .type = VFIO_REGION_TYPE_GFX, + .subtype = VFIO_REGION_SUBTYPE_GFX_EDID, + }; + + region_info->offset = MBOCHS_EDID_OFFSET; + region_info->size = MBOCHS_EDID_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_CAPS); + + return vfio_info_add_capability(caps, &cap_type.header, + sizeof(cap_type)); + } + default: + region_info->size = 0; + region_info->offset = 0; + region_info->flags = 0; + } + + return 0; +} + +static int mbochs_get_irq_info(struct vfio_irq_info *irq_info) +{ + irq_info->count = 0; + return 0; +} + +static int mbochs_get_device_info(struct vfio_device_info *dev_info) +{ + dev_info->flags = VFIO_DEVICE_FLAGS_PCI; + dev_info->num_regions = MBOCHS_NUM_REGIONS; + dev_info->num_irqs = VFIO_PCI_NUM_IRQS; + return 0; +} + +static int mbochs_query_gfx_plane(struct mdev_state *mdev_state, + struct vfio_device_gfx_plane_info *plane) +{ + struct mbochs_dmabuf *dmabuf; + struct mbochs_mode mode; + int ret; + + if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { + if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | + VFIO_GFX_PLANE_TYPE_DMABUF)) + return 0; + return -EINVAL; + } + + if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF) + return -EINVAL; + + plane->drm_format_mod = 0; + plane->x_pos = 0; + plane->y_pos = 0; + plane->x_hot = 0; + plane->y_hot = 0; + + mutex_lock(&mdev_state->ops_lock); + + ret = -EINVAL; + if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY) + ret = mbochs_check_framebuffer(mdev_state, &mode); + if (ret < 0) { + plane->drm_format = 0; + plane->width = 0; + plane->height = 0; + plane->stride = 0; + plane->size = 0; + plane->dmabuf_id = 0; + goto done; + } + + dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode); + if (!dmabuf) + mbochs_dmabuf_alloc(mdev_state, &mode); + if (!dmabuf) { + mutex_unlock(&mdev_state->ops_lock); + return -ENOMEM; + } + + plane->drm_format = dmabuf->mode.drm_format; + plane->width = dmabuf->mode.width; + plane->height = dmabuf->mode.height; + plane->stride = dmabuf->mode.stride; + plane->size = dmabuf->mode.size; + plane->dmabuf_id = dmabuf->id; + +done: + if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY && + mdev_state->active_id != plane->dmabuf_id) { + dev_dbg(mdev_state->vdev.dev, "%s: primary: %d => %d\n", + __func__, mdev_state->active_id, plane->dmabuf_id); + mdev_state->active_id = plane->dmabuf_id; + } + mutex_unlock(&mdev_state->ops_lock); + return 0; +} + +static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id) +{ + struct mbochs_dmabuf *dmabuf; + + mutex_lock(&mdev_state->ops_lock); + + dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id); + if (!dmabuf) { + mutex_unlock(&mdev_state->ops_lock); + return -ENOENT; + } + + if (!dmabuf->buf) + mbochs_dmabuf_export(dmabuf); + + mutex_unlock(&mdev_state->ops_lock); + + if (!dmabuf->buf) + return -EINVAL; + + return dma_buf_fd(dmabuf->buf, 0); +} + +static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + int ret = 0; + unsigned long minsz; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mbochs_get_device_info(&info); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if ((info.argsz < minsz) || + (info.index >= VFIO_PCI_NUM_IRQS)) + return -EINVAL; + + ret = mbochs_get_irq_info(&info); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_QUERY_GFX_PLANE: + { + struct vfio_device_gfx_plane_info plane = {}; + + minsz = offsetofend(struct vfio_device_gfx_plane_info, + region_index); + + if (copy_from_user(&plane, (void __user *)arg, minsz)) + return -EFAULT; + + if (plane.argsz < minsz) + return -EINVAL; + + ret = mbochs_query_gfx_plane(mdev_state, &plane); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &plane, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_GET_GFX_DMABUF: + { + u32 dmabuf_id; + + if (get_user(dmabuf_id, (__u32 __user *)arg)) + return -EFAULT; + + return mbochs_get_gfx_dmabuf(mdev_state, dmabuf_id); + } + + case VFIO_DEVICE_SET_IRQS: + return -EINVAL; + + case VFIO_DEVICE_RESET: + return mbochs_reset(mdev_state); + } + return -ENOTTY; +} + +static void mbochs_close_device(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mbochs_dmabuf *dmabuf, *tmp; + + mutex_lock(&mdev_state->ops_lock); + + list_for_each_entry_safe(dmabuf, tmp, &mdev_state->dmabufs, next) { + list_del(&dmabuf->next); + if (dmabuf->buf) { + /* free in mbochs_release_dmabuf() */ + dmabuf->unlinked = true; + } else { + kfree(dmabuf); + } + } + mbochs_put_pages(mdev_state); + + mutex_unlock(&mdev_state->ops_lock); +} + +static ssize_t +memory_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mdev_state *mdev_state = dev_get_drvdata(dev); + + return sprintf(buf, "%d MB\n", mdev_state->type->mbytes); +} +static DEVICE_ATTR_RO(memory); + +static struct attribute *mdev_dev_attrs[] = { + &dev_attr_memory.attr, + NULL, +}; + +static const struct attribute_group mdev_dev_group = { + .name = "vendor", + .attrs = mdev_dev_attrs, +}; + +static const struct attribute_group *mdev_dev_groups[] = { + &mdev_dev_group, + NULL, +}; + +static ssize_t mbochs_show_description(struct mdev_type *mtype, char *buf) +{ + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); + + return sprintf(buf, "virtual display, %d MB video memory\n", + type ? type->mbytes : 0); +} + +static unsigned int mbochs_get_available(struct mdev_type *mtype) +{ + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); + + return atomic_read(&mbochs_avail_mbytes) / type->mbytes; +} + +static const struct vfio_device_ops mbochs_dev_ops = { + .close_device = mbochs_close_device, + .init = mbochs_init_dev, + .release = mbochs_release_dev, + .read = mbochs_read, + .write = mbochs_write, + .ioctl = mbochs_ioctl, + .get_region_info_caps = mbochs_ioctl_get_region_info, + .mmap = mbochs_mmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, +}; + +static struct mdev_driver mbochs_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, + .driver = { + .name = "mbochs", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mbochs_probe, + .remove = mbochs_remove, + .get_available = mbochs_get_available, + .show_description = mbochs_show_description, +}; + +static const struct file_operations vd_fops = { + .owner = THIS_MODULE, +}; + +static void mbochs_device_release(struct device *dev) +{ + /* nothing */ +} + +static int __init mbochs_dev_init(void) +{ + int ret = 0; + + atomic_set(&mbochs_avail_mbytes, max_mbytes); + + ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME); + if (ret < 0) { + pr_err("Error: failed to register mbochs_dev, err: %d\n", ret); + return ret; + } + cdev_init(&mbochs_cdev, &vd_fops); + cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK + 1); + pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt)); + + ret = mdev_register_driver(&mbochs_driver); + if (ret) + goto err_cdev; + + ret = class_register(&mbochs_class); + if (ret) + goto err_driver; + mbochs_dev.class = &mbochs_class; + mbochs_dev.release = mbochs_device_release; + dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME); + + ret = device_register(&mbochs_dev); + if (ret) + goto err_put; + + ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver, + mbochs_mdev_types, + ARRAY_SIZE(mbochs_mdev_types)); + if (ret) + goto err_device; + + return 0; + +err_device: + device_del(&mbochs_dev); +err_put: + put_device(&mbochs_dev); + class_unregister(&mbochs_class); +err_driver: + mdev_unregister_driver(&mbochs_driver); +err_cdev: + cdev_del(&mbochs_cdev); + unregister_chrdev_region(mbochs_devt, MINORMASK + 1); + return ret; +} + +static void __exit mbochs_dev_exit(void) +{ + mbochs_dev.bus = NULL; + mdev_unregister_parent(&mbochs_parent); + + device_unregister(&mbochs_dev); + mdev_unregister_driver(&mbochs_driver); + cdev_del(&mbochs_cdev); + unregister_chrdev_region(mbochs_devt, MINORMASK + 1); + class_unregister(&mbochs_class); +} + +MODULE_IMPORT_NS("DMA_BUF"); +module_init(mbochs_dev_init) +module_exit(mbochs_dev_exit) diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h new file mode 100644 index 000000000000..961c55ec3ffd --- /dev/null +++ b/samples/vfio-mdev/mdpy-defs.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Simple pci display device. + * + * Framebuffer memory is pci bar 0. + * Configuration (read-only) is in pci config space. + * Format field uses drm fourcc codes. + * ATM only DRM_FORMAT_XRGB8888 is supported. + */ + +/* pci ids */ +#define MDPY_PCI_VENDOR_ID PCI_VENDOR_ID_REDHAT +#define MDPY_PCI_DEVICE_ID 0x000f +#define MDPY_PCI_SUBVENDOR_ID PCI_SUBVENDOR_ID_REDHAT_QUMRANET +#define MDPY_PCI_SUBDEVICE_ID PCI_SUBDEVICE_ID_QEMU + +/* pci cfg space offsets for fb config (dword) */ +#define MDPY_VENDORCAP_OFFSET 0x40 +#define MDPY_VENDORCAP_SIZE 0x10 +#define MDPY_FORMAT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x04) +#define MDPY_WIDTH_OFFSET (MDPY_VENDORCAP_OFFSET + 0x08) +#define MDPY_HEIGHT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x0c) diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c new file mode 100644 index 000000000000..149af7f598f8 --- /dev/null +++ b/samples/vfio-mdev/mdpy-fb.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Framebuffer driver for mdpy (mediated virtual pci display device). + * + * See mdpy-defs.h for device specs + * + * (c) Gerd Hoffmann <kraxel@redhat.com> + * + * Using some code snippets from simplefb and cirrusfb. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/errno.h> +#include <linux/fb.h> +#include <linux/io.h> +#include <linux/pci.h> +#include <linux/module.h> +#include <drm/drm_fourcc.h> +#include "mdpy-defs.h" + +static const struct fb_fix_screeninfo mdpy_fb_fix = { + .id = "mdpy-fb", + .type = FB_TYPE_PACKED_PIXELS, + .visual = FB_VISUAL_TRUECOLOR, + .accel = FB_ACCEL_NONE, +}; + +static const struct fb_var_screeninfo mdpy_fb_var = { + .height = -1, + .width = -1, + .activate = FB_ACTIVATE_NOW, + .vmode = FB_VMODE_NONINTERLACED, + + .bits_per_pixel = 32, + .transp.offset = 24, + .red.offset = 16, + .green.offset = 8, + .blue.offset = 0, + .transp.length = 8, + .red.length = 8, + .green.length = 8, + .blue.length = 8, +}; + +#define PSEUDO_PALETTE_SIZE 16 + +struct mdpy_fb_par { + u32 palette[PSEUDO_PALETTE_SIZE]; +}; + +static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, + u_int transp, struct fb_info *info) +{ + u32 *pal = info->pseudo_palette; + u32 cr = red >> (16 - info->var.red.length); + u32 cg = green >> (16 - info->var.green.length); + u32 cb = blue >> (16 - info->var.blue.length); + u32 value, mask; + + if (regno >= PSEUDO_PALETTE_SIZE) + return -EINVAL; + + value = (cr << info->var.red.offset) | + (cg << info->var.green.offset) | + (cb << info->var.blue.offset); + if (info->var.transp.length > 0) { + mask = (1 << info->var.transp.length) - 1; + mask <<= info->var.transp.offset; + value |= mask; + } + pal[regno] = value; + + return 0; +} + +static void mdpy_fb_destroy(struct fb_info *info) +{ + if (info->screen_base) + iounmap(info->screen_base); +} + +static const struct fb_ops mdpy_fb_ops = { + .owner = THIS_MODULE, + FB_DEFAULT_IOMEM_OPS, + .fb_destroy = mdpy_fb_destroy, + .fb_setcolreg = mdpy_fb_setcolreg, +}; + +static int mdpy_fb_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct fb_info *info; + struct mdpy_fb_par *par; + u32 format, width, height; + int ret; + + ret = pci_enable_device(pdev); + if (ret < 0) + return ret; + + ret = pci_request_regions(pdev, "mdpy-fb"); + if (ret < 0) + goto err_disable_dev; + + pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format); + pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width); + pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height); + if (format != DRM_FORMAT_XRGB8888) { + pci_err(pdev, "format mismatch (0x%x != 0x%x)\n", + format, DRM_FORMAT_XRGB8888); + ret = -EINVAL; + goto err_release_regions; + } + if (width < 100 || width > 10000) { + pci_err(pdev, "width (%d) out of range\n", width); + ret = -EINVAL; + goto err_release_regions; + } + if (height < 100 || height > 10000) { + pci_err(pdev, "height (%d) out of range\n", height); + ret = -EINVAL; + goto err_release_regions; + } + pci_info(pdev, "mdpy found: %dx%d framebuffer\n", + width, height); + + info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev); + if (!info) { + ret = -ENOMEM; + goto err_release_regions; + } + pci_set_drvdata(pdev, info); + par = info->par; + + info->fix = mdpy_fb_fix; + info->fix.smem_start = pci_resource_start(pdev, 0); + info->fix.smem_len = pci_resource_len(pdev, 0); + info->fix.line_length = width * 4; + + info->var = mdpy_fb_var; + info->var.xres = width; + info->var.yres = height; + info->var.xres_virtual = width; + info->var.yres_virtual = height; + + info->screen_size = info->fix.smem_len; + info->screen_base = ioremap(info->fix.smem_start, + info->screen_size); + if (!info->screen_base) { + pci_err(pdev, "ioremap(pcibar) failed\n"); + ret = -EIO; + goto err_release_fb; + } + + info->fbops = &mdpy_fb_ops; + info->pseudo_palette = par->palette; + + ret = register_framebuffer(info); + if (ret < 0) { + pci_err(pdev, "mdpy-fb device register failed: %d\n", ret); + goto err_unmap; + } + + pci_info(pdev, "fb%d registered\n", info->node); + return 0; + +err_unmap: + iounmap(info->screen_base); + +err_release_fb: + framebuffer_release(info); + +err_release_regions: + pci_release_regions(pdev); + +err_disable_dev: + pci_disable_device(pdev); + + return ret; +} + +static void mdpy_fb_remove(struct pci_dev *pdev) +{ + struct fb_info *info = pci_get_drvdata(pdev); + + unregister_framebuffer(info); + iounmap(info->screen_base); + framebuffer_release(info); + pci_release_regions(pdev); + pci_disable_device(pdev); +} + +static struct pci_device_id mdpy_fb_pci_table[] = { + { + .vendor = MDPY_PCI_VENDOR_ID, + .device = MDPY_PCI_DEVICE_ID, + .subvendor = MDPY_PCI_SUBVENDOR_ID, + .subdevice = MDPY_PCI_SUBDEVICE_ID, + }, { + /* end of list */ + } +}; + +static struct pci_driver mdpy_fb_pci_driver = { + .name = "mdpy-fb", + .id_table = mdpy_fb_pci_table, + .probe = mdpy_fb_probe, + .remove = mdpy_fb_remove, +}; + +static int __init mdpy_fb_init(void) +{ + int ret; + + ret = pci_register_driver(&mdpy_fb_pci_driver); + if (ret) + return ret; + + return 0; +} + +module_init(mdpy_fb_init); + +MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table); +MODULE_DESCRIPTION("Framebuffer driver for mdpy (mediated virtual pci display device)"); +MODULE_LICENSE("GPL v2"); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c new file mode 100644 index 000000000000..0759bd68edca --- /dev/null +++ b/samples/vfio-mdev/mdpy.c @@ -0,0 +1,743 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Mediated virtual PCI display host device driver + * + * See mdpy-defs.h for device specs + * + * (c) Gerd Hoffmann <kraxel@redhat.com> + * + * based on mtty driver which is: + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/cdev.h> +#include <linux/vfio.h> +#include <linux/iommu.h> +#include <linux/sysfs.h> +#include <linux/mdev.h> +#include <linux/pci.h> +#include <drm/drm_fourcc.h> +#include "mdpy-defs.h" + +#define MDPY_NAME "mdpy" +#define MDPY_CLASS_NAME "mdpy" + +#define MDPY_CONFIG_SPACE_SIZE 0xff +#define MDPY_MEMORY_BAR_OFFSET PAGE_SIZE +#define MDPY_DISPLAY_REGION 16 + +#define STORE_LE16(addr, val) (*(u16 *)addr = val) +#define STORE_LE32(addr, val) (*(u32 *)addr = val) + + +MODULE_DESCRIPTION("Mediated virtual PCI display host device driver"); +MODULE_LICENSE("GPL v2"); + +#define MDPY_TYPE_1 "vga" +#define MDPY_TYPE_2 "xga" +#define MDPY_TYPE_3 "hd" + +static struct mdpy_type { + struct mdev_type type; + u32 format; + u32 bytepp; + u32 width; + u32 height; +} mdpy_types[] = { + { + .type.sysfs_name = MDPY_TYPE_1, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, + .format = DRM_FORMAT_XRGB8888, + .bytepp = 4, + .width = 640, + .height = 480, + }, { + .type.sysfs_name = MDPY_TYPE_2, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, + .format = DRM_FORMAT_XRGB8888, + .bytepp = 4, + .width = 1024, + .height = 768, + }, { + .type.sysfs_name = MDPY_TYPE_3, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, + .format = DRM_FORMAT_XRGB8888, + .bytepp = 4, + .width = 1920, + .height = 1080, + }, +}; + +static struct mdev_type *mdpy_mdev_types[] = { + &mdpy_types[0].type, + &mdpy_types[1].type, + &mdpy_types[2].type, +}; + +static dev_t mdpy_devt; +static const struct class mdpy_class = { + .name = MDPY_CLASS_NAME, +}; +static struct cdev mdpy_cdev; +static struct device mdpy_dev; +static struct mdev_parent mdpy_parent; +static const struct vfio_device_ops mdpy_dev_ops; + +/* State of each mdev device */ +struct mdev_state { + struct vfio_device vdev; + u8 *vconfig; + u32 bar_mask; + struct mutex ops_lock; + struct mdev_device *mdev; + struct vfio_device_info dev_info; + + const struct mdpy_type *type; + u32 memsize; + void *memblk; +}; + +static void mdpy_create_config_space(struct mdev_state *mdev_state) +{ + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], + MDPY_PCI_VENDOR_ID); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID], + MDPY_PCI_DEVICE_ID); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID], + MDPY_PCI_SUBVENDOR_ID); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID], + MDPY_PCI_SUBDEVICE_ID); + + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND], + PCI_COMMAND_IO | PCI_COMMAND_MEMORY); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_STATUS], + PCI_STATUS_CAP_LIST); + STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE], + PCI_CLASS_DISPLAY_OTHER); + mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01; + + STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0], + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_32 | + PCI_BASE_ADDRESS_MEM_PREFETCH); + mdev_state->bar_mask = ~(mdev_state->memsize) + 1; + + /* vendor specific capability for the config registers */ + mdev_state->vconfig[PCI_CAPABILITY_LIST] = MDPY_VENDORCAP_OFFSET; + mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 0] = 0x09; /* vendor cap */ + mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 1] = 0x00; /* next ptr */ + mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 2] = MDPY_VENDORCAP_SIZE; + STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_FORMAT_OFFSET], + mdev_state->type->format); + STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_WIDTH_OFFSET], + mdev_state->type->width); + STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_HEIGHT_OFFSET], + mdev_state->type->height); +} + +static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, + char *buf, u32 count) +{ + struct device *dev = mdev_dev(mdev_state->mdev); + u32 cfg_addr; + + switch (offset) { + case PCI_BASE_ADDRESS_0: + cfg_addr = *(u32 *)buf; + + if (cfg_addr == 0xffffffff) { + cfg_addr = (cfg_addr & mdev_state->bar_mask); + } else { + cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK; + if (cfg_addr) + dev_info(dev, "BAR0 @ 0x%x\n", cfg_addr); + } + + cfg_addr |= (mdev_state->vconfig[offset] & + ~PCI_BASE_ADDRESS_MEM_MASK); + STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); + break; + } +} + +static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf, + size_t count, loff_t pos, bool is_write) +{ + int ret = 0; + + mutex_lock(&mdev_state->ops_lock); + + if (pos < MDPY_CONFIG_SPACE_SIZE) { + if (is_write) + handle_pci_cfg_write(mdev_state, pos, buf, count); + else + memcpy(buf, (mdev_state->vconfig + pos), count); + + } else if ((pos >= MDPY_MEMORY_BAR_OFFSET) && + (pos + count <= + MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize)) { + pos -= MDPY_MEMORY_BAR_OFFSET; + if (is_write) + memcpy(mdev_state->memblk, buf, count); + else + memcpy(buf, mdev_state->memblk, count); + + } else { + dev_info(mdev_state->vdev.dev, + "%s: %s @0x%llx (unhandled)\n", __func__, + is_write ? "WR" : "RD", pos); + ret = -1; + goto accessfailed; + } + + ret = count; + + +accessfailed: + mutex_unlock(&mdev_state->ops_lock); + + return ret; +} + +static int mdpy_reset(struct mdev_state *mdev_state) +{ + u32 stride, i; + + /* initialize with gray gradient */ + stride = mdev_state->type->width * mdev_state->type->bytepp; + for (i = 0; i < mdev_state->type->height; i++) + memset(mdev_state->memblk + i * stride, + i * 255 / mdev_state->type->height, + stride); + return 0; +} + +static int mdpy_init_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); + const struct mdpy_type *type = + container_of(mdev->type, struct mdpy_type, type); + u32 fbsize; + int ret = -ENOMEM; + + mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); + if (!mdev_state->vconfig) + return ret; + + fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); + + mdev_state->memblk = vmalloc_user(fbsize); + if (!mdev_state->memblk) + goto out_vconfig; + + mutex_init(&mdev_state->ops_lock); + mdev_state->mdev = mdev; + mdev_state->type = type; + mdev_state->memsize = fbsize; + mdpy_create_config_space(mdev_state); + mdpy_reset(mdev_state); + + dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->type.pretty_name, + type->width, type->height); + return 0; + +out_vconfig: + kfree(mdev_state->vconfig); + return ret; +} + +static int mdpy_probe(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + int ret; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mdpy_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); + + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); + if (ret) + goto err_put_vdev; + dev_set_drvdata(&mdev->dev, mdev_state); + return 0; + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); + return ret; +} + +static void mdpy_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + vfree(mdev_state->memblk); + kfree(mdev_state->vconfig); +} + +static void mdpy_remove(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + + dev_info(&mdev->dev, "%s\n", __func__); + + vfio_unregister_group_dev(&mdev_state->vdev); + vfio_put_device(&mdev_state->vdev); +} + +static ssize_t mdpy_read(struct vfio_device *vdev, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 2; + } else { + u8 val; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 1; + } + + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; + +read_err: + return -EFAULT; +} + +static ssize_t mdpy_write(struct vfio_device *vdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + unsigned int done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 4 && !(*ppos % 4)) { + u32 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 4; + } else if (count >= 2 && !(*ppos % 2)) { + u16 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = mdev_access(mdev_state, (char *)&val, sizeof(val), + *ppos, true); + if (ret <= 0) + goto write_err; + + filled = 1; + } + count -= filled; + done += filled; + *ppos += filled; + buf += filled; + } + + return done; +write_err: + return -EFAULT; +} + +static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT) + return -EINVAL; + if (vma->vm_end < vma->vm_start) + return -EINVAL; + if (vma->vm_end - vma->vm_start > mdev_state->memsize) + return -EINVAL; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + + return remap_vmalloc_range(vma, mdev_state->memblk, 0); +} + +static int mdpy_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + if (region_info->index >= VFIO_PCI_NUM_REGIONS && + region_info->index != MDPY_DISPLAY_REGION) + return -EINVAL; + + switch (region_info->index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + region_info->offset = 0; + region_info->size = MDPY_CONFIG_SPACE_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE); + break; + case VFIO_PCI_BAR0_REGION_INDEX: + case MDPY_DISPLAY_REGION: + region_info->offset = MDPY_MEMORY_BAR_OFFSET; + region_info->size = mdev_state->memsize; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP); + break; + default: + region_info->size = 0; + region_info->offset = 0; + region_info->flags = 0; + } + + return 0; +} + +static int mdpy_get_irq_info(struct vfio_irq_info *irq_info) +{ + irq_info->count = 0; + return 0; +} + +static int mdpy_get_device_info(struct vfio_device_info *dev_info) +{ + dev_info->flags = VFIO_DEVICE_FLAGS_PCI; + dev_info->num_regions = VFIO_PCI_NUM_REGIONS; + dev_info->num_irqs = VFIO_PCI_NUM_IRQS; + return 0; +} + +static int mdpy_query_gfx_plane(struct mdev_state *mdev_state, + struct vfio_device_gfx_plane_info *plane) +{ + if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { + if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | + VFIO_GFX_PLANE_TYPE_REGION)) + return 0; + return -EINVAL; + } + + if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION) + return -EINVAL; + + plane->drm_format = mdev_state->type->format; + plane->width = mdev_state->type->width; + plane->height = mdev_state->type->height; + plane->stride = (mdev_state->type->width * + mdev_state->type->bytepp); + plane->size = mdev_state->memsize; + plane->region_index = MDPY_DISPLAY_REGION; + + /* unused */ + plane->drm_format_mod = 0; + plane->x_pos = 0; + plane->y_pos = 0; + plane->x_hot = 0; + plane->y_hot = 0; + + return 0; +} + +static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg) +{ + int ret = 0; + unsigned long minsz; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mdpy_get_device_info(&info); + if (ret) + return ret; + + memcpy(&mdev_state->dev_info, &info, sizeof(info)); + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_GET_IRQ_INFO: + { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if ((info.argsz < minsz) || + (info.index >= mdev_state->dev_info.num_irqs)) + return -EINVAL; + + ret = mdpy_get_irq_info(&info); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_QUERY_GFX_PLANE: + { + struct vfio_device_gfx_plane_info plane = {}; + + minsz = offsetofend(struct vfio_device_gfx_plane_info, + region_index); + + if (copy_from_user(&plane, (void __user *)arg, minsz)) + return -EFAULT; + + if (plane.argsz < minsz) + return -EINVAL; + + ret = mdpy_query_gfx_plane(mdev_state, &plane); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &plane, minsz)) + return -EFAULT; + + return 0; + } + + case VFIO_DEVICE_SET_IRQS: + return -EINVAL; + + case VFIO_DEVICE_RESET: + return mdpy_reset(mdev_state); + } + return -ENOTTY; +} + +static ssize_t +resolution_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct mdev_state *mdev_state = dev_get_drvdata(dev); + + return sprintf(buf, "%dx%d\n", + mdev_state->type->width, + mdev_state->type->height); +} +static DEVICE_ATTR_RO(resolution); + +static struct attribute *mdev_dev_attrs[] = { + &dev_attr_resolution.attr, + NULL, +}; + +static const struct attribute_group mdev_dev_group = { + .name = "vendor", + .attrs = mdev_dev_attrs, +}; + +static const struct attribute_group *mdev_dev_groups[] = { + &mdev_dev_group, + NULL, +}; + +static ssize_t mdpy_show_description(struct mdev_type *mtype, char *buf) +{ + struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); + + return sprintf(buf, "virtual display, %dx%d framebuffer\n", + type->width, type->height); +} + +static const struct vfio_device_ops mdpy_dev_ops = { + .init = mdpy_init_dev, + .release = mdpy_release_dev, + .read = mdpy_read, + .write = mdpy_write, + .ioctl = mdpy_ioctl, + .get_region_info_caps = mdpy_ioctl_get_region_info, + .mmap = mdpy_mmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, +}; + +static struct mdev_driver mdpy_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, + .max_instances = 4, + .driver = { + .name = "mdpy", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mdpy_probe, + .remove = mdpy_remove, + .show_description = mdpy_show_description, +}; + +static const struct file_operations vd_fops = { + .owner = THIS_MODULE, +}; + +static void mdpy_device_release(struct device *dev) +{ + /* nothing */ +} + +static int __init mdpy_dev_init(void) +{ + int ret = 0; + + ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK + 1, MDPY_NAME); + if (ret < 0) { + pr_err("Error: failed to register mdpy_dev, err: %d\n", ret); + return ret; + } + cdev_init(&mdpy_cdev, &vd_fops); + cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK + 1); + pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt)); + + ret = mdev_register_driver(&mdpy_driver); + if (ret) + goto err_cdev; + + ret = class_register(&mdpy_class); + if (ret) + goto err_driver; + mdpy_dev.class = &mdpy_class; + mdpy_dev.release = mdpy_device_release; + dev_set_name(&mdpy_dev, "%s", MDPY_NAME); + + ret = device_register(&mdpy_dev); + if (ret) + goto err_put; + + ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver, + mdpy_mdev_types, + ARRAY_SIZE(mdpy_mdev_types)); + if (ret) + goto err_device; + + return 0; + +err_device: + device_del(&mdpy_dev); +err_put: + put_device(&mdpy_dev); + class_unregister(&mdpy_class); +err_driver: + mdev_unregister_driver(&mdpy_driver); +err_cdev: + cdev_del(&mdpy_cdev); + unregister_chrdev_region(mdpy_devt, MINORMASK + 1); + return ret; +} + +static void __exit mdpy_dev_exit(void) +{ + mdpy_dev.bus = NULL; + mdev_unregister_parent(&mdpy_parent); + + device_unregister(&mdpy_dev); + mdev_unregister_driver(&mdpy_driver); + cdev_del(&mdpy_cdev); + unregister_chrdev_region(mdpy_devt, MINORMASK + 1); + class_unregister(&mdpy_class); +} + +module_param_named(count, mdpy_driver.max_instances, int, 0444); +MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); + +module_init(mdpy_dev_init) +module_exit(mdpy_dev_exit) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index ca495686b9c3..bd92c38379b8 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Mediated virtual PCI serial host device driver * @@ -5,18 +6,12 @@ * Author: Neo Jia <cjia@nvidia.com> * Kirti Wankhede <kwankhede@nvidia.com> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * * Sample driver that creates mdev device that simulates serial port over PCI * card. - * */ #include <linux/init.h> #include <linux/module.h> -#include <linux/device.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/poll.h> @@ -24,7 +19,6 @@ #include <linux/cdev.h> #include <linux/sched.h> #include <linux/wait.h> -#include <linux/uuid.h> #include <linux/vfio.h> #include <linux/iommu.h> #include <linux/sysfs.h> @@ -35,6 +29,8 @@ #include <linux/serial.h> #include <uapi/linux/serial_reg.h> #include <linux/eventfd.h> +#include <linux/anon_inodes.h> + /* * #defines */ @@ -72,12 +68,13 @@ * Global Structures */ -struct mtty_dev { +static struct mtty_dev { dev_t vd_devt; struct class *vd_class; struct cdev vd_cdev; struct idr vd_idr; struct device dev; + struct mdev_parent parent; } mtty_dev; struct mdev_region_info { @@ -88,7 +85,7 @@ struct mdev_region_info { }; #if defined(DEBUG_REGS) -const char *wr_reg[] = { +static const char *wr_reg[] = { "TX", "IER", "FCR", @@ -99,7 +96,7 @@ const char *wr_reg[] = { "SCR" }; -const char *rd_reg[] = { +static const char *rd_reg[] = { "RX", "IER", "IIR", @@ -129,9 +126,32 @@ struct serial_port { u8 intr_trigger_level; /* interrupt trigger level */ }; +struct mtty_data { + u64 magic; +#define MTTY_MAGIC 0x7e9d09898c3e2c4e /* Nothing clever, just random */ + u32 major_ver; +#define MTTY_MAJOR_VER 1 + u32 minor_ver; +#define MTTY_MINOR_VER 0 + u32 nr_ports; + u32 flags; + struct serial_port ports[2]; +}; + +struct mdev_state; + +struct mtty_migration_file { + struct file *filp; + struct mutex lock; + struct mdev_state *mdev_state; + struct mtty_data data; + ssize_t filled_size; + u8 disabled:1; +}; + /* State of each mdev device */ struct mdev_state { - int irq_fd; + struct vfio_device vdev; struct eventfd_ctx *intx_evtfd; struct eventfd_ctx *msi_evtfd; int irq_index; @@ -145,33 +165,41 @@ struct mdev_state { struct mutex rxtx_lock; struct vfio_device_info dev_info; int nr_ports; + enum vfio_device_mig_state state; + struct mutex state_mutex; + struct mutex reset_mutex; + struct mtty_migration_file *saving_migf; + struct mtty_migration_file *resuming_migf; + u8 deferred_reset:1; + u8 intx_mask:1; +}; + +static struct mtty_type { + struct mdev_type type; + int nr_ports; +} mtty_types[2] = { + { .nr_ports = 1, .type.sysfs_name = "1", + .type.pretty_name = "Single port serial" }, + { .nr_ports = 2, .type.sysfs_name = "2", + .type.pretty_name = "Dual port serial" }, +}; + +static struct mdev_type *mtty_mdev_types[] = { + &mtty_types[0].type, + &mtty_types[1].type, }; -struct mutex mdev_list_lock; -struct list_head mdev_devices_list; +static atomic_t mdev_avail_ports = ATOMIC_INIT(MAX_MTTYS); static const struct file_operations vd_fops = { .owner = THIS_MODULE, }; -/* function prototypes */ - -static int mtty_trigger_interrupt(uuid_le uuid); +static const struct vfio_device_ops mtty_dev_ops; /* Helper functions */ -static struct mdev_state *find_mdev_state_by_uuid(uuid_le uuid) -{ - struct mdev_state *mds; - - list_for_each_entry(mds, &mdev_devices_list, next) { - if (uuid_le_cmp(mdev_uuid(mds->mdev), uuid) == 0) - return mds; - } - - return NULL; -} -void dump_buffer(char *buf, uint32_t count) +static void dump_buffer(u8 *buf, uint32_t count) { #if defined(DEBUG) int i; @@ -185,6 +213,36 @@ void dump_buffer(char *buf, uint32_t count) #endif } +static bool is_intx(struct mdev_state *mdev_state) +{ + return mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX; +} + +static bool is_msi(struct mdev_state *mdev_state) +{ + return mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX; +} + +static bool is_noirq(struct mdev_state *mdev_state) +{ + return !is_intx(mdev_state) && !is_msi(mdev_state); +} + +static void mtty_trigger_interrupt(struct mdev_state *mdev_state) +{ + lockdep_assert_held(&mdev_state->ops_lock); + + if (is_msi(mdev_state)) { + if (mdev_state->msi_evtfd) + eventfd_signal(mdev_state->msi_evtfd); + } else if (is_intx(mdev_state)) { + if (mdev_state->intx_evtfd && !mdev_state->intx_mask) { + eventfd_signal(mdev_state->intx_evtfd); + mdev_state->intx_mask = true; + } + } +} + static void mtty_create_config_space(struct mdev_state *mdev_state) { /* PCI dev ID */ @@ -250,7 +308,7 @@ static void mtty_create_config_space(struct mdev_state *mdev_state) } static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, - char *buf, u32 count) + u8 *buf, u32 count) { u32 cfg_addr, bar_mask, bar_index = 0; @@ -304,7 +362,7 @@ static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, } static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, - u16 offset, char *buf, u32 count) + u16 offset, u8 *buf, u32 count) { u8 data = *buf; @@ -341,8 +399,7 @@ static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, pr_err("Serial port %d: Fifo level trigger\n", index); #endif - mtty_trigger_interrupt( - mdev_uuid(mdev_state->mdev)); + mtty_trigger_interrupt(mdev_state); } } else { #if defined(DEBUG_INTR) @@ -356,8 +413,7 @@ static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, */ if (mdev_state->s[index].uart_reg[UART_IER] & UART_IER_RLSI) - mtty_trigger_interrupt( - mdev_uuid(mdev_state->mdev)); + mtty_trigger_interrupt(mdev_state); } mutex_unlock(&mdev_state->rxtx_lock); break; @@ -376,8 +432,7 @@ static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, pr_err("Serial port %d: IER_THRI write\n", index); #endif - mtty_trigger_interrupt( - mdev_uuid(mdev_state->mdev)); + mtty_trigger_interrupt(mdev_state); } mutex_unlock(&mdev_state->rxtx_lock); @@ -448,7 +503,7 @@ static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, #if defined(DEBUG_INTR) pr_err("Serial port %d: MCR_OUT2 write\n", index); #endif - mtty_trigger_interrupt(mdev_uuid(mdev_state->mdev)); + mtty_trigger_interrupt(mdev_state); } if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) && @@ -456,7 +511,7 @@ static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, #if defined(DEBUG_INTR) pr_err("Serial port %d: MCR RTS/DTR write\n", index); #endif - mtty_trigger_interrupt(mdev_uuid(mdev_state->mdev)); + mtty_trigger_interrupt(mdev_state); } break; @@ -475,7 +530,7 @@ static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, } static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, - u16 offset, char *buf, u32 count) + u16 offset, u8 *buf, u32 count) { /* Handle read requests by guest */ switch (offset) { @@ -507,8 +562,7 @@ static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, #endif if (mdev_state->s[index].uart_reg[UART_IER] & UART_IER_THRI) - mtty_trigger_interrupt( - mdev_uuid(mdev_state->mdev)); + mtty_trigger_interrupt(mdev_state); } mutex_unlock(&mdev_state->rxtx_lock); @@ -534,7 +588,7 @@ static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, /* Interrupt priority 2: Fifo trigger level reached */ if ((ier & UART_IER_RDI) && - (mdev_state->s[index].rxtx.count == + (mdev_state->s[index].rxtx.count >= mdev_state->s[index].intr_trigger_level)) *buf |= UART_IIR_RDI; @@ -570,7 +624,7 @@ static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, u8 lsr = 0; mutex_lock(&mdev_state->rxtx_lock); - /* atleast one char in FIFO */ + /* at least one char in FIFO */ if (mdev_state->s[index].rxtx.head != mdev_state->s[index].rxtx.tail) lsr |= UART_LSR_DR; @@ -650,22 +704,15 @@ static void mdev_read_base(struct mdev_state *mdev_state) } } -static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, +static ssize_t mdev_access(struct mdev_state *mdev_state, u8 *buf, size_t count, loff_t pos, bool is_write) { - struct mdev_state *mdev_state; unsigned int index; loff_t offset; int ret = 0; - if (!mdev || !buf) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) { - pr_err("%s mdev_state not found\n", __func__); + if (!buf) return -EINVAL; - } mutex_lock(&mdev_state->ops_lock); @@ -698,7 +745,7 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, #if defined(DEBUG_REGS) pr_info("%s: BAR%d WR @0x%llx %s val:0x%02x dlab:%d\n", __func__, index, offset, wr_reg[offset], - (u8)*buf, mdev_state->s[index].dlab); + *buf, mdev_state->s[index].dlab); #endif handle_bar_write(index, mdev_state, offset, buf, count); } else { @@ -708,7 +755,7 @@ static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, #if defined(DEBUG_REGS) pr_info("%s: BAR%d RD @0x%llx %s val:0x%02x dlab:%d\n", __func__, index, offset, rd_reg[offset], - (u8)*buf, mdev_state->s[index].dlab); + *buf, mdev_state->s[index].dlab); #endif } break; @@ -727,97 +774,651 @@ accessfailed: return ret; } -int mtty_create(struct kobject *kobj, struct mdev_device *mdev) +static size_t mtty_data_size(struct mdev_state *mdev_state) { - struct mdev_state *mdev_state; - char name[MTTY_STRING_LEN]; - int nr_ports = 0, i; + return offsetof(struct mtty_data, ports) + + (mdev_state->nr_ports * sizeof(struct serial_port)); +} + +static void mtty_disable_file(struct mtty_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + migf->filled_size = 0; + migf->filp->f_pos = 0; + mutex_unlock(&migf->lock); +} + +static void mtty_disable_files(struct mdev_state *mdev_state) +{ + if (mdev_state->saving_migf) { + mtty_disable_file(mdev_state->saving_migf); + fput(mdev_state->saving_migf->filp); + mdev_state->saving_migf = NULL; + } - if (!mdev) + if (mdev_state->resuming_migf) { + mtty_disable_file(mdev_state->resuming_migf); + fput(mdev_state->resuming_migf->filp); + mdev_state->resuming_migf = NULL; + } +} + +static void mtty_state_mutex_unlock(struct mdev_state *mdev_state) +{ +again: + mutex_lock(&mdev_state->reset_mutex); + if (mdev_state->deferred_reset) { + mdev_state->deferred_reset = false; + mutex_unlock(&mdev_state->reset_mutex); + mdev_state->state = VFIO_DEVICE_STATE_RUNNING; + mtty_disable_files(mdev_state); + goto again; + } + mutex_unlock(&mdev_state->state_mutex); + mutex_unlock(&mdev_state->reset_mutex); +} + +static int mtty_release_migf(struct inode *inode, struct file *filp) +{ + struct mtty_migration_file *migf = filp->private_data; + + mtty_disable_file(migf); + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static long mtty_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mtty_migration_file *migf = filp->private_data; + struct mdev_state *mdev_state = migf->mdev_state; + loff_t *pos = &filp->f_pos; + struct vfio_precopy_info info = {}; + unsigned long minsz; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) return -EINVAL; - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(mdev_parent_dev(mdev)), i + 1); - if (!strcmp(kobj->name, name)) { - nr_ports = i + 1; - break; + mutex_lock(&mdev_state->state_mutex); + if (mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY && + mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto unlock; + } + + mutex_lock(&migf->lock); + + if (migf->disabled) { + mutex_unlock(&migf->lock); + ret = -ENODEV; + goto unlock; + } + + if (*pos > migf->filled_size) { + mutex_unlock(&migf->lock); + ret = -EINVAL; + goto unlock; + } + + info.dirty_bytes = 0; + info.initial_bytes = migf->filled_size - *pos; + mutex_unlock(&migf->lock); + + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +unlock: + mtty_state_mutex_unlock(mdev_state); + return ret; +} + +static ssize_t mtty_save_read(struct file *filp, char __user *buf, + size_t len, loff_t *pos) +{ + struct mtty_migration_file *migf = filp->private_data; + ssize_t ret = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + mutex_lock(&migf->lock); + + dev_dbg(migf->mdev_state->vdev.dev, "%s ask %zu\n", __func__, len); + + if (migf->disabled) { + ret = -ENODEV; + goto out_unlock; + } + + if (*pos > migf->filled_size) { + ret = -EINVAL; + goto out_unlock; + } + + len = min_t(size_t, migf->filled_size - *pos, len); + if (len) { + if (copy_to_user(buf, (void *)&migf->data + *pos, len)) { + ret = -EFAULT; + goto out_unlock; } + *pos += len; + ret = len; + } +out_unlock: + dev_dbg(migf->mdev_state->vdev.dev, "%s read %zu\n", __func__, ret); + mutex_unlock(&migf->lock); + return ret; +} + +static const struct file_operations mtty_save_fops = { + .owner = THIS_MODULE, + .read = mtty_save_read, + .unlocked_ioctl = mtty_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = mtty_release_migf, +}; + +static void mtty_save_state(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf = mdev_state->saving_migf; + int i; + + mutex_lock(&migf->lock); + for (i = 0; i < mdev_state->nr_ports; i++) { + memcpy(&migf->data.ports[i], + &mdev_state->s[i], sizeof(struct serial_port)); + migf->filled_size += sizeof(struct serial_port); + } + dev_dbg(mdev_state->vdev.dev, + "%s filled to %zu\n", __func__, migf->filled_size); + mutex_unlock(&migf->lock); +} + +static int mtty_load_state(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf = mdev_state->resuming_migf; + int i; + + mutex_lock(&migf->lock); + /* magic and version already tested by resume write fn */ + if (migf->filled_size < mtty_data_size(mdev_state)) { + dev_dbg(mdev_state->vdev.dev, "%s expected %zu, got %zu\n", + __func__, mtty_data_size(mdev_state), + migf->filled_size); + mutex_unlock(&migf->lock); + return -EINVAL; } - if (!nr_ports) + for (i = 0; i < mdev_state->nr_ports; i++) + memcpy(&mdev_state->s[i], + &migf->data.ports[i], sizeof(struct serial_port)); + + mutex_unlock(&migf->lock); + return 0; +} + +static struct mtty_migration_file * +mtty_save_device_data(struct mdev_state *mdev_state, + enum vfio_device_mig_state state) +{ + struct mtty_migration_file *migf = mdev_state->saving_migf; + struct mtty_migration_file *ret = NULL; + + if (migf) { + if (state == VFIO_DEVICE_STATE_STOP_COPY) + goto fill_data; + return ret; + } + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("mtty_mig", &mtty_save_fops, + migf, O_RDONLY); + if (IS_ERR(migf->filp)) { + int rc = PTR_ERR(migf->filp); + + kfree(migf); + return ERR_PTR(rc); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + migf->mdev_state = mdev_state; + + migf->data.magic = MTTY_MAGIC; + migf->data.major_ver = MTTY_MAJOR_VER; + migf->data.minor_ver = MTTY_MINOR_VER; + migf->data.nr_ports = mdev_state->nr_ports; + + migf->filled_size = offsetof(struct mtty_data, ports); + + dev_dbg(mdev_state->vdev.dev, "%s filled header to %zu\n", + __func__, migf->filled_size); + + ret = mdev_state->saving_migf = migf; + +fill_data: + if (state == VFIO_DEVICE_STATE_STOP_COPY) + mtty_save_state(mdev_state); + + return ret; +} + +static ssize_t mtty_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct mtty_migration_file *migf = filp->private_data; + struct mdev_state *mdev_state = migf->mdev_state; + loff_t requested_length; + ssize_t ret = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + if (*pos < 0 || + check_add_overflow((loff_t)len, *pos, &requested_length)) return -EINVAL; - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) + if (requested_length > mtty_data_size(mdev_state)) return -ENOMEM; - mdev_state->nr_ports = nr_ports; + mutex_lock(&migf->lock); + + if (migf->disabled) { + ret = -ENODEV; + goto out_unlock; + } + + if (copy_from_user((void *)&migf->data + *pos, buf, len)) { + ret = -EFAULT; + goto out_unlock; + } + + *pos += len; + ret = len; + + dev_dbg(migf->mdev_state->vdev.dev, "%s received %zu, total %zu\n", + __func__, len, migf->filled_size + len); + + if (migf->filled_size < offsetof(struct mtty_data, ports) && + migf->filled_size + len >= offsetof(struct mtty_data, ports)) { + if (migf->data.magic != MTTY_MAGIC || migf->data.flags || + migf->data.major_ver != MTTY_MAJOR_VER || + migf->data.minor_ver != MTTY_MINOR_VER || + migf->data.nr_ports != mdev_state->nr_ports) { + dev_dbg(migf->mdev_state->vdev.dev, + "%s failed validation\n", __func__); + ret = -EFAULT; + } else { + dev_dbg(migf->mdev_state->vdev.dev, + "%s header validated\n", __func__); + } + } + + migf->filled_size += len; + +out_unlock: + mutex_unlock(&migf->lock); + return ret; +} + +static const struct file_operations mtty_resume_fops = { + .owner = THIS_MODULE, + .write = mtty_resume_write, + .release = mtty_release_migf, +}; + +static struct mtty_migration_file * +mtty_resume_device_data(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("mtty_mig", &mtty_resume_fops, + migf, O_WRONLY); + if (IS_ERR(migf->filp)) { + ret = PTR_ERR(migf->filp); + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + migf->mdev_state = mdev_state; + + mdev_state->resuming_migf = migf; + + return migf; +} + +static struct file *mtty_step_state(struct mdev_state *mdev_state, + enum vfio_device_mig_state new) +{ + enum vfio_device_mig_state cur = mdev_state->state; + + dev_dbg(mdev_state->vdev.dev, "%s: %d -> %d\n", __func__, cur, new); + + /* + * The following state transitions are no-op considering + * mtty does not do DMA nor require any explicit start/stop. + * + * RUNNING -> RUNNING_P2P + * RUNNING_P2P -> RUNNING + * RUNNING_P2P -> STOP + * PRE_COPY -> PRE_COPY_P2P + * PRE_COPY_P2P -> PRE_COPY + * STOP -> RUNNING_P2P + */ + if ((cur == VFIO_DEVICE_STATE_RUNNING && + new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + (new == VFIO_DEVICE_STATE_RUNNING || + new == VFIO_DEVICE_STATE_STOP)) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + /* + * The following state transitions simply close migration files, + * with the exception of RESUMING -> STOP, which needs to load + * the state first. + * + * RESUMING -> STOP + * PRE_COPY -> RUNNING + * PRE_COPY_P2P -> RUNNING_P2P + * STOP_COPY -> STOP + */ + if (cur == VFIO_DEVICE_STATE_RESUMING && + new == VFIO_DEVICE_STATE_STOP) { + int ret; + + ret = mtty_load_state(mdev_state); + if (ret) + return ERR_PTR(ret); + mtty_disable_files(mdev_state); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_PRE_COPY && + new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_STOP_COPY && + new == VFIO_DEVICE_STATE_STOP)) { + mtty_disable_files(mdev_state); + return NULL; + } + + /* + * The following state transitions return migration files. + * + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P + * STOP -> STOP_COPY + * STOP -> RESUMING + * PRE_COPY_P2P -> STOP_COPY + */ + if ((cur == VFIO_DEVICE_STATE_RUNNING && + new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P) || + (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_STOP_COPY) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_STOP_COPY)) { + struct mtty_migration_file *migf; + + migf = mtty_save_device_data(mdev_state, new); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + if (migf) { + get_file(migf->filp); + + return migf->filp; + } + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_RESUMING) { + struct mtty_migration_file *migf; + + migf = mtty_resume_device_data(mdev_state); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + get_file(migf->filp); + + return migf->filp; + } + + /* vfio_mig_get_next_state() does not use arcs other than the above */ + WARN_ON(true); + return ERR_PTR(-EINVAL); +} + +static struct file *mtty_set_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct file *ret = NULL; + + dev_dbg(vdev->dev, "%s -> %d\n", __func__, new_state); + + mutex_lock(&mdev_state->state_mutex); + while (mdev_state->state != new_state) { + enum vfio_device_mig_state next_state; + int rc = vfio_mig_get_next_state(vdev, mdev_state->state, + new_state, &next_state); + if (rc) { + ret = ERR_PTR(rc); + break; + } + + ret = mtty_step_state(mdev_state, next_state); + if (IS_ERR(ret)) + break; + + mdev_state->state = next_state; + + if (WARN_ON(ret && new_state != next_state)) { + fput(ret); + ret = ERR_PTR(-EINVAL); + break; + } + } + mtty_state_mutex_unlock(mdev_state); + return ret; +} + +static int mtty_get_state(struct vfio_device *vdev, + enum vfio_device_mig_state *current_state) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + mutex_lock(&mdev_state->state_mutex); + *current_state = mdev_state->state; + mtty_state_mutex_unlock(mdev_state); + return 0; +} + +static int mtty_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + *stop_copy_length = mtty_data_size(mdev_state); + return 0; +} + +static const struct vfio_migration_ops mtty_migration_ops = { + .migration_set_state = mtty_set_state, + .migration_get_state = mtty_get_state, + .migration_get_data_size = mtty_get_data_size, +}; + +static int mtty_log_start(struct vfio_device *vdev, + struct rb_root_cached *ranges, + u32 nnodes, u64 *page_size) +{ + return 0; +} + +static int mtty_log_stop(struct vfio_device *vdev) +{ + return 0; +} + +static int mtty_log_read_and_clear(struct vfio_device *vdev, + unsigned long iova, unsigned long length, + struct iova_bitmap *dirty) +{ + return 0; +} + +static const struct vfio_log_ops mtty_log_ops = { + .log_start = mtty_log_start, + .log_stop = mtty_log_stop, + .log_read_and_clear = mtty_log_read_and_clear, +}; + +static int mtty_init_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); + struct mtty_type *type = + container_of(mdev->type, struct mtty_type, type); + int avail_ports = atomic_read(&mdev_avail_ports); + int ret; + + do { + if (avail_ports < type->nr_ports) + return -ENOSPC; + } while (!atomic_try_cmpxchg(&mdev_avail_ports, + &avail_ports, + avail_ports - type->nr_ports)); + + mdev_state->nr_ports = type->nr_ports; mdev_state->irq_index = -1; mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; mutex_init(&mdev_state->rxtx_lock); - mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) { - kfree(mdev_state); - return -ENOMEM; + mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); + if (!mdev_state->vconfig) { + ret = -ENOMEM; + goto err_nr_ports; } mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_set_drvdata(mdev, mdev_state); - mtty_create_config_space(mdev_state); - mutex_lock(&mdev_list_lock); - list_add(&mdev_state->next, &mdev_devices_list); - mutex_unlock(&mdev_list_lock); + mutex_init(&mdev_state->state_mutex); + mutex_init(&mdev_state->reset_mutex); + vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY; + vdev->mig_ops = &mtty_migration_ops; + vdev->log_ops = &mtty_log_ops; + mdev_state->state = VFIO_DEVICE_STATE_RUNNING; return 0; + +err_nr_ports: + atomic_add(type->nr_ports, &mdev_avail_ports); + return ret; } -int mtty_remove(struct mdev_device *mdev) +static int mtty_probe(struct mdev_device *mdev) { - struct mdev_state *mds, *tmp_mds; - struct mdev_state *mdev_state = mdev_get_drvdata(mdev); - int ret = -EINVAL; - - mutex_lock(&mdev_list_lock); - list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) { - if (mdev_state == mds) { - list_del(&mdev_state->next); - mdev_set_drvdata(mdev, NULL); - kfree(mdev_state->vconfig); - kfree(mdev_state); - ret = 0; - break; - } - } - mutex_unlock(&mdev_list_lock); + struct mdev_state *mdev_state; + int ret; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mtty_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); + if (ret) + goto err_put_vdev; + dev_set_drvdata(&mdev->dev, mdev_state); + return 0; + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); return ret; } -int mtty_reset(struct mdev_device *mdev) +static void mtty_release_dev(struct vfio_device *vdev) { - struct mdev_state *mdev_state; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); - if (!mdev) - return -EINVAL; + mutex_destroy(&mdev_state->reset_mutex); + mutex_destroy(&mdev_state->state_mutex); + atomic_add(mdev_state->nr_ports, &mdev_avail_ports); + kfree(mdev_state->vconfig); +} - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; +static void mtty_remove(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); + + vfio_unregister_group_dev(&mdev_state->vdev); + vfio_put_device(&mdev_state->vdev); +} +static int mtty_reset(struct mdev_state *mdev_state) +{ pr_info("%s: called\n", __func__); + mutex_lock(&mdev_state->reset_mutex); + mdev_state->deferred_reset = true; + if (!mutex_trylock(&mdev_state->state_mutex)) { + mutex_unlock(&mdev_state->reset_mutex); + return 0; + } + mutex_unlock(&mdev_state->reset_mutex); + mtty_state_mutex_unlock(mdev_state); + return 0; } -ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, size_t count, - loff_t *ppos) +static ssize_t mtty_read(struct vfio_device *vdev, char __user *buf, + size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -827,7 +1428,7 @@ ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, size_t count, if (count >= 4 && !(*ppos % 4)) { u32 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -839,7 +1440,7 @@ ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, size_t count, } else if (count >= 2 && !(*ppos % 2)) { u16 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -851,7 +1452,7 @@ ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, size_t count, } else { u8 val; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, false); if (ret <= 0) goto read_err; @@ -874,9 +1475,11 @@ read_err: return -EFAULT; } -ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, +static ssize_t mtty_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int done = 0; int ret; @@ -889,7 +1492,7 @@ ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -901,7 +1504,7 @@ ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -913,7 +1516,7 @@ ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, if (copy_from_user(&val, buf, sizeof(val))) goto write_err; - ret = mdev_access(mdev, (char *)&val, sizeof(val), + ret = mdev_access(mdev_state, (u8 *)&val, sizeof(val), *ppos, true); if (ret <= 0) goto write_err; @@ -931,78 +1534,143 @@ write_err: return -EFAULT; } -static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, +static void mtty_disable_intx(struct mdev_state *mdev_state) +{ + if (mdev_state->intx_evtfd) { + eventfd_ctx_put(mdev_state->intx_evtfd); + mdev_state->intx_evtfd = NULL; + mdev_state->intx_mask = false; + mdev_state->irq_index = -1; + } +} + +static void mtty_disable_msi(struct mdev_state *mdev_state) +{ + if (mdev_state->msi_evtfd) { + eventfd_ctx_put(mdev_state->msi_evtfd); + mdev_state->msi_evtfd = NULL; + mdev_state->irq_index = -1; + } +} + +static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, unsigned int index, unsigned int start, unsigned int count, void *data) { int ret = 0; - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; mutex_lock(&mdev_state->ops_lock); switch (index) { case VFIO_PCI_INTX_IRQ_INDEX: switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { case VFIO_IRQ_SET_ACTION_MASK: + if (!is_intx(mdev_state) || start != 0 || count != 1) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mdev_state->intx_mask = true; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t mask = *(uint8_t *)data; + + if (mask) + mdev_state->intx_mask = true; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + ret = -ENOTTY; /* No support for mask fd */ + } + break; case VFIO_IRQ_SET_ACTION_UNMASK: + if (!is_intx(mdev_state) || start != 0 || count != 1) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mdev_state->intx_mask = false; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t mask = *(uint8_t *)data; + + if (mask) + mdev_state->intx_mask = false; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + ret = -ENOTTY; /* No support for unmask fd */ + } break; case VFIO_IRQ_SET_ACTION_TRIGGER: - { - if (flags & VFIO_IRQ_SET_DATA_NONE) { - pr_info("%s: disable INTx\n", __func__); - if (mdev_state->intx_evtfd) - eventfd_ctx_put(mdev_state->intx_evtfd); + if (is_intx(mdev_state) && !count && + (flags & VFIO_IRQ_SET_DATA_NONE)) { + mtty_disable_intx(mdev_state); + break; + } + + if (!(is_intx(mdev_state) || is_noirq(mdev_state)) || + start != 0 || count != 1) { + ret = -EINVAL; break; } if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { int fd = *(int *)data; + struct eventfd_ctx *evt; + + mtty_disable_intx(mdev_state); + + if (fd < 0) + break; - if (fd > 0) { - struct eventfd_ctx *evt; - - evt = eventfd_ctx_fdget(fd); - if (IS_ERR(evt)) { - ret = PTR_ERR(evt); - break; - } - mdev_state->intx_evtfd = evt; - mdev_state->irq_fd = fd; - mdev_state->irq_index = index; + evt = eventfd_ctx_fdget(fd); + if (IS_ERR(evt)) { + ret = PTR_ERR(evt); break; } + mdev_state->intx_evtfd = evt; + mdev_state->irq_index = index; + break; + } + + if (!is_intx(mdev_state)) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mtty_trigger_interrupt(mdev_state); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + + if (trigger) + mtty_trigger_interrupt(mdev_state); } break; } - } break; case VFIO_PCI_MSI_IRQ_INDEX: switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { case VFIO_IRQ_SET_ACTION_MASK: case VFIO_IRQ_SET_ACTION_UNMASK: + ret = -ENOTTY; break; case VFIO_IRQ_SET_ACTION_TRIGGER: - if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (mdev_state->msi_evtfd) - eventfd_ctx_put(mdev_state->msi_evtfd); - pr_info("%s: disable MSI\n", __func__); - mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX; + if (is_msi(mdev_state) && !count && + (flags & VFIO_IRQ_SET_DATA_NONE)) { + mtty_disable_msi(mdev_state); + break; + } + + if (!(is_msi(mdev_state) || is_noirq(mdev_state)) || + start != 0 || count != 1) { + ret = -EINVAL; break; } + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { int fd = *(int *)data; struct eventfd_ctx *evt; - if (fd <= 0) - break; + mtty_disable_msi(mdev_state); - if (mdev_state->msi_evtfd) + if (fd < 0) break; evt = eventfd_ctx_fdget(fd); @@ -1011,20 +1679,37 @@ static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, break; } mdev_state->msi_evtfd = evt; - mdev_state->irq_fd = fd; mdev_state->irq_index = index; + break; + } + + if (!is_msi(mdev_state)) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mtty_trigger_interrupt(mdev_state); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + + if (trigger) + mtty_trigger_interrupt(mdev_state); } break; - } - break; + } + break; case VFIO_PCI_MSIX_IRQ_INDEX: - pr_info("%s: MSIX_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: MSIX_IRQ\n", __func__); + ret = -ENOTTY; break; case VFIO_PCI_ERR_IRQ_INDEX: - pr_info("%s: ERR_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: ERR_IRQ\n", __func__); + ret = -ENOTTY; break; case VFIO_PCI_REQ_IRQ_INDEX: - pr_info("%s: REQ_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: REQ_IRQ\n", __func__); + ret = -ENOTTY; break; } @@ -1032,56 +1717,15 @@ static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, return ret; } -static int mtty_trigger_interrupt(uuid_le uuid) -{ - int ret = -1; - struct mdev_state *mdev_state; - - mdev_state = find_mdev_state_by_uuid(uuid); - - if (!mdev_state) { - pr_info("%s: mdev not found\n", __func__); - return -EINVAL; - } - - if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) && - (!mdev_state->msi_evtfd)) - return -EINVAL; - else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) && - (!mdev_state->intx_evtfd)) { - pr_info("%s: Intr eventfd not found\n", __func__); - return -EINVAL; - } - - if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) - ret = eventfd_signal(mdev_state->msi_evtfd, 1); - else - ret = eventfd_signal(mdev_state->intx_evtfd, 1); - -#if defined(DEBUG_INTR) - pr_info("Intx triggered\n"); -#endif - if (ret != 1) - pr_err("%s: eventfd signal failed (%d)\n", __func__, ret); - - return ret; -} - -int mtty_get_region_info(struct mdev_device *mdev, - struct vfio_region_info *region_info, - u16 *cap_type_id, void **cap_type) +static int mtty_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int size = 0; - struct mdev_state *mdev_state; u32 bar_index; - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -EINVAL; - bar_index = region_info->index; if (bar_index >= VFIO_PCI_NUM_REGIONS) return -EINVAL; @@ -1116,32 +1760,25 @@ int mtty_get_region_info(struct mdev_device *mdev, return 0; } -int mtty_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info) +static int mtty_get_irq_info(struct vfio_irq_info *irq_info) { - switch (irq_info->index) { - case VFIO_PCI_INTX_IRQ_INDEX: - case VFIO_PCI_MSI_IRQ_INDEX: - case VFIO_PCI_REQ_IRQ_INDEX: - break; - - default: + if (irq_info->index != VFIO_PCI_INTX_IRQ_INDEX && + irq_info->index != VFIO_PCI_MSI_IRQ_INDEX) return -EINVAL; - } irq_info->flags = VFIO_IRQ_INFO_EVENTFD; irq_info->count = 1; if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX) - irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); + irq_info->flags |= VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED; else irq_info->flags |= VFIO_IRQ_INFO_NORESIZE; return 0; } -int mtty_get_device_info(struct mdev_device *mdev, - struct vfio_device_info *dev_info) +static int mtty_get_device_info(struct vfio_device_info *dev_info) { dev_info->flags = VFIO_DEVICE_FLAGS_PCI; dev_info->num_regions = VFIO_PCI_NUM_REGIONS; @@ -1150,19 +1787,13 @@ int mtty_get_device_info(struct mdev_device *mdev, return 0; } -static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, +static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); int ret = 0; unsigned long minsz; - struct mdev_state *mdev_state; - - if (!mdev) - return -EINVAL; - - mdev_state = mdev_get_drvdata(mdev); - if (!mdev_state) - return -ENODEV; switch (cmd) { case VFIO_DEVICE_GET_INFO: @@ -1177,7 +1808,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, if (info.argsz < minsz) return -EINVAL; - ret = mtty_get_device_info(mdev, &info); + ret = mtty_get_device_info(&info); if (ret) return ret; @@ -1188,30 +1819,6 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, return 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - u16 cap_type_id = 0; - void *cap_type = NULL; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mtty_get_region_info(mdev, &info, &cap_type_id, - &cap_type); - if (ret) - return ret; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { @@ -1226,7 +1833,7 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, (info.index >= mdev_state->dev_info.num_irqs)) return -EINVAL; - ret = mtty_get_irq_info(mdev, &info); + ret = mtty_get_irq_info(&info); if (ret) return ret; @@ -1260,61 +1867,23 @@ static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, return PTR_ERR(data); } - ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start, + ret = mtty_set_irqs(mdev_state, hdr.flags, hdr.index, hdr.start, hdr.count, data); kfree(ptr); return ret; } case VFIO_DEVICE_RESET: - return mtty_reset(mdev); + return mtty_reset(mdev_state); } return -ENOTTY; } -int mtty_open(struct mdev_device *mdev) -{ - pr_info("%s\n", __func__); - return 0; -} - -void mtty_close(struct mdev_device *mdev) -{ - pr_info("%s\n", __func__); -} - -static ssize_t -sample_mtty_dev_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "This is phy device\n"); -} - -static DEVICE_ATTR_RO(sample_mtty_dev); - -static struct attribute *mtty_dev_attrs[] = { - &dev_attr_sample_mtty_dev.attr, - NULL, -}; - -static const struct attribute_group mtty_dev_group = { - .name = "mtty_dev", - .attrs = mtty_dev_attrs, -}; - -const struct attribute_group *mtty_dev_groups[] = { - &mtty_dev_group, - NULL, -}; - static ssize_t sample_mdev_dev_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (mdev_from_dev(dev)) - return sprintf(buf, "This is MDEV %s\n", dev_name(dev)); - - return sprintf(buf, "\n"); + return sprintf(buf, "This is MDEV %s\n", dev_name(dev)); } static DEVICE_ATTR_RO(sample_mdev_dev); @@ -1329,102 +1898,54 @@ static const struct attribute_group mdev_dev_group = { .attrs = mdev_dev_attrs, }; -const struct attribute_group *mdev_dev_groups[] = { +static const struct attribute_group *mdev_dev_groups[] = { &mdev_dev_group, NULL, }; -static ssize_t -name_show(struct kobject *kobj, struct device *dev, char *buf) +static unsigned int mtty_get_available(struct mdev_type *mtype) { - char name[MTTY_STRING_LEN]; - int i; - const char *name_str[2] = {"Single port serial", "Dual port serial"}; + struct mtty_type *type = container_of(mtype, struct mtty_type, type); - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(dev), i + 1); - if (!strcmp(kobj->name, name)) - return sprintf(buf, "%s\n", name_str[i]); - } - - return -EINVAL; + return atomic_read(&mdev_avail_ports) / type->nr_ports; } -MDEV_TYPE_ATTR_RO(name); - -static ssize_t -available_instances_show(struct kobject *kobj, struct device *dev, char *buf) +static void mtty_close(struct vfio_device *vdev) { - char name[MTTY_STRING_LEN]; - int i; - struct mdev_state *mds; - int ports = 0, used = 0; - - for (i = 0; i < 2; i++) { - snprintf(name, MTTY_STRING_LEN, "%s-%d", - dev_driver_string(dev), i + 1); - if (!strcmp(kobj->name, name)) { - ports = i + 1; - break; - } - } + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); - if (!ports) - return -EINVAL; - - list_for_each_entry(mds, &mdev_devices_list, next) - used += mds->nr_ports; - - return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports); -} - -MDEV_TYPE_ATTR_RO(available_instances); - - -static ssize_t device_api_show(struct kobject *kobj, struct device *dev, - char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); + mtty_disable_files(mdev_state); + mtty_disable_intx(mdev_state); + mtty_disable_msi(mdev_state); } -MDEV_TYPE_ATTR_RO(device_api); - -static struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, - &mdev_type_attr_device_api.attr, - &mdev_type_attr_available_instances.attr, - NULL, -}; - -static struct attribute_group mdev_type_group1 = { - .name = "1", - .attrs = mdev_types_attrs, +static const struct vfio_device_ops mtty_dev_ops = { + .name = "vfio-mtty", + .init = mtty_init_dev, + .release = mtty_release_dev, + .read = mtty_read, + .write = mtty_write, + .ioctl = mtty_ioctl, + .get_region_info_caps = mtty_ioctl_get_region_info, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, + .close_device = mtty_close, }; -static struct attribute_group mdev_type_group2 = { - .name = "2", - .attrs = mdev_types_attrs, -}; - -struct attribute_group *mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - NULL, -}; - -struct mdev_parent_ops mdev_fops = { - .owner = THIS_MODULE, - .dev_attr_groups = mtty_dev_groups, - .mdev_attr_groups = mdev_dev_groups, - .supported_type_groups = mdev_type_groups, - .create = mtty_create, - .remove = mtty_remove, - .open = mtty_open, - .release = mtty_close, - .read = mtty_read, - .write = mtty_write, - .ioctl = mtty_ioctl, +static struct mdev_driver mtty_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, + .driver = { + .name = "mtty", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + .dev_groups = mdev_dev_groups, + }, + .probe = mtty_probe, + .remove = mtty_remove, + .get_available = mtty_get_available, }; static void mtty_device_release(struct device *dev) @@ -1442,7 +1963,8 @@ static int __init mtty_dev_init(void) idr_init(&mtty_dev.vd_idr); - ret = alloc_chrdev_region(&mtty_dev.vd_devt, 0, MINORMASK, MTTY_NAME); + ret = alloc_chrdev_region(&mtty_dev.vd_devt, 0, MINORMASK + 1, + MTTY_NAME); if (ret < 0) { pr_err("Error: failed to register mtty_dev, err:%d\n", ret); @@ -1450,16 +1972,20 @@ static int __init mtty_dev_init(void) } cdev_init(&mtty_dev.vd_cdev, &vd_fops); - cdev_add(&mtty_dev.vd_cdev, mtty_dev.vd_devt, MINORMASK); + cdev_add(&mtty_dev.vd_cdev, mtty_dev.vd_devt, MINORMASK + 1); pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt)); - mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME); + ret = mdev_register_driver(&mtty_driver); + if (ret) + goto err_cdev; + + mtty_dev.vd_class = class_create(MTTY_CLASS_NAME); if (IS_ERR(mtty_dev.vd_class)) { pr_err("Error: failed to register mtty_dev class\n"); ret = PTR_ERR(mtty_dev.vd_class); - goto failed1; + goto err_driver; } mtty_dev.dev.class = mtty_dev.vd_class; @@ -1468,40 +1994,38 @@ static int __init mtty_dev_init(void) ret = device_register(&mtty_dev.dev); if (ret) - goto failed2; + goto err_put; - ret = mdev_register_device(&mtty_dev.dev, &mdev_fops); + ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, + &mtty_driver, mtty_mdev_types, + ARRAY_SIZE(mtty_mdev_types)); if (ret) - goto failed3; - - mutex_init(&mdev_list_lock); - INIT_LIST_HEAD(&mdev_devices_list); - - goto all_done; - -failed3: + goto err_device; + return 0; - device_unregister(&mtty_dev.dev); -failed2: +err_device: + device_del(&mtty_dev.dev); +err_put: + put_device(&mtty_dev.dev); class_destroy(mtty_dev.vd_class); - -failed1: +err_driver: + mdev_unregister_driver(&mtty_driver); +err_cdev: cdev_del(&mtty_dev.vd_cdev); - unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK); - -all_done: + unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); return ret; } static void __exit mtty_dev_exit(void) { mtty_dev.dev.bus = NULL; - mdev_unregister_device(&mtty_dev.dev); + mdev_unregister_parent(&mtty_dev.parent); device_unregister(&mtty_dev.dev); idr_destroy(&mtty_dev.vd_idr); + mdev_unregister_driver(&mtty_driver); cdev_del(&mtty_dev.vd_cdev); - unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK); + unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); class_destroy(mtty_dev.vd_class); mtty_dev.vd_class = NULL; pr_info("mtty_dev: Unloaded!\n"); @@ -1511,6 +2035,6 @@ module_init(mtty_dev_init) module_exit(mtty_dev_exit) MODULE_LICENSE("GPL v2"); -MODULE_INFO(supported, "Test driver that simulate serial port over PCI"); +MODULE_DESCRIPTION("Test driver that simulate serial port over PCI"); MODULE_VERSION(VERSION_STRING); MODULE_AUTHOR(DRIVER_AUTHOR); diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore new file mode 100644 index 000000000000..8708341bc082 --- /dev/null +++ b/samples/vfs/.gitignore @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +/test-fsmount +/test-list-all-mounts +/test-statx +/mountinfo diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile new file mode 100644 index 000000000000..9256ca5d762b --- /dev/null +++ b/samples/vfs/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += test-fsmount test-statx mountinfo test-list-all-mounts + +userccflags += -I $(srctree)/tools/testing/selftests/ +userccflags += -I usr/include diff --git a/samples/vfs/mountinfo.c b/samples/vfs/mountinfo.c new file mode 100644 index 000000000000..bc78275cac69 --- /dev/null +++ b/samples/vfs/mountinfo.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * Use pidfds, nsfds, listmount() and statmount() mimic the + * contents of /proc/self/mountinfo. + */ +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include <stdio.h> +#include <stdint.h> +#include <unistd.h> +#include <alloca.h> +#include <getopt.h> +#include <stdlib.h> +#include <stdbool.h> +#include <errno.h> + +#include "samples-vfs.h" + +/* max mounts per listmount call */ +#define MAXMOUNTS 1024 + +/* size of struct statmount (including trailing string buffer) */ +#define STATMOUNT_BUFSIZE 4096 + +static bool ext_format; + +#ifndef __NR_pidfd_open +#define __NR_pidfd_open -1 +#endif + +/* + * There are no bindings in glibc for listmount() and statmount() (yet), + * make our own here. + */ +static int statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask, + struct statmount *buf, size_t bufsize, + unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER0, + .mnt_id = mnt_id, + .param = mask, + }; + + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } + + return syscall(__NR_statmount, &req, buf, bufsize, flags); +} + +static ssize_t listmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 last_mnt_id, + __u64 list[], size_t num, unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER0, + .mnt_id = mnt_id, + .param = last_mnt_id, + }; + + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } + + return syscall(__NR_listmount, &req, list, num, flags); +} + +static void show_mnt_attrs(__u64 flags) +{ + printf("%s", flags & MOUNT_ATTR_RDONLY ? "ro" : "rw"); + + if (flags & MOUNT_ATTR_NOSUID) + printf(",nosuid"); + if (flags & MOUNT_ATTR_NODEV) + printf(",nodev"); + if (flags & MOUNT_ATTR_NOEXEC) + printf(",noexec"); + + switch (flags & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_RELATIME: + printf(",relatime"); + break; + case MOUNT_ATTR_NOATIME: + printf(",noatime"); + break; + case MOUNT_ATTR_STRICTATIME: + /* print nothing */ + break; + } + + if (flags & MOUNT_ATTR_NODIRATIME) + printf(",nodiratime"); + if (flags & MOUNT_ATTR_NOSYMFOLLOW) + printf(",nosymfollow"); + if (flags & MOUNT_ATTR_IDMAP) + printf(",idmapped"); +} + +static void show_propagation(struct statmount *sm) +{ + if (sm->mnt_propagation & MS_SHARED) + printf(" shared:%llu", sm->mnt_peer_group); + if (sm->mnt_propagation & MS_SLAVE) { + printf(" master:%llu", sm->mnt_master); + if (sm->propagate_from && sm->propagate_from != sm->mnt_master) + printf(" propagate_from:%llu", sm->propagate_from); + } + if (sm->mnt_propagation & MS_UNBINDABLE) + printf(" unbindable"); +} + +static void show_sb_flags(__u64 flags) +{ + printf("%s", flags & MS_RDONLY ? "ro" : "rw"); + if (flags & MS_SYNCHRONOUS) + printf(",sync"); + if (flags & MS_DIRSYNC) + printf(",dirsync"); + if (flags & MS_MANDLOCK) + printf(",mand"); + if (flags & MS_LAZYTIME) + printf(",lazytime"); +} + +static int dump_mountinfo(__u64 mnt_id, __u64 mnt_ns_id) +{ + int ret; + struct statmount *buf = alloca(STATMOUNT_BUFSIZE); + const __u64 mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC | + STATMOUNT_PROPAGATE_FROM | STATMOUNT_FS_TYPE | + STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | + STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE | + STATMOUNT_SB_SOURCE; + + ret = statmount(mnt_id, mnt_ns_id, mask, buf, STATMOUNT_BUFSIZE, 0); + if (ret < 0) { + perror("statmount"); + return 1; + } + + if (ext_format) + printf("0x%llx 0x%llx 0x%llx ", mnt_ns_id, mnt_id, buf->mnt_parent_id); + + printf("%u %u %u:%u %s %s ", buf->mnt_id_old, buf->mnt_parent_id_old, + buf->sb_dev_major, buf->sb_dev_minor, + &buf->str[buf->mnt_root], + &buf->str[buf->mnt_point]); + show_mnt_attrs(buf->mnt_attr); + show_propagation(buf); + + printf(" - %s", &buf->str[buf->fs_type]); + if (buf->mask & STATMOUNT_FS_SUBTYPE) + printf(".%s", &buf->str[buf->fs_subtype]); + if (buf->mask & STATMOUNT_SB_SOURCE) + printf(" %s ", &buf->str[buf->sb_source]); + else + printf(" :none "); + + show_sb_flags(buf->sb_flags); + if (buf->mask & STATMOUNT_MNT_OPTS) + printf(",%s", &buf->str[buf->mnt_opts]); + printf("\n"); + return 0; +} + +static int dump_mounts(__u64 mnt_ns_id) +{ + __u64 mntid[MAXMOUNTS]; + __u64 last_mnt_id = 0; + ssize_t count; + int i; + + /* + * Get a list of all mntids in mnt_ns_id. If it returns MAXMOUNTS + * mounts, then go again until we get everything. + */ + do { + count = listmount(LSMT_ROOT, mnt_ns_id, last_mnt_id, mntid, MAXMOUNTS, 0); + if (count < 0 || count > MAXMOUNTS) { + errno = count < 0 ? errno : count; + perror("listmount"); + return 1; + } + + /* Walk the returned mntids and print info about each */ + for (i = 0; i < count; ++i) { + int ret = dump_mountinfo(mntid[i], mnt_ns_id); + + if (ret != 0) + return ret; + } + /* Set up last_mnt_id to pick up where we left off */ + last_mnt_id = mntid[count - 1]; + } while (count == MAXMOUNTS); + return 0; +} + +static void usage(const char * const prog) +{ + printf("Usage:\n"); + printf("%s [-e] [-p pid] [-r] [-h]\n", prog); + printf(" -e: extended format\n"); + printf(" -h: print usage message\n"); + printf(" -p: get mount namespace from given pid\n"); + printf(" -r: recursively print all mounts in all child namespaces\n"); +} + +int main(int argc, char * const *argv) +{ + struct mnt_ns_info mni = { .size = MNT_NS_INFO_SIZE_VER0 }; + int pidfd, mntns, ret, opt; + pid_t pid = getpid(); + bool recursive = false; + + while ((opt = getopt(argc, argv, "ehp:r")) != -1) { + switch (opt) { + case 'e': + ext_format = true; + break; + case 'h': + usage(argv[0]); + return 0; + case 'p': + pid = atoi(optarg); + break; + case 'r': + recursive = true; + break; + } + } + + /* Get a pidfd for pid */ + pidfd = syscall(__NR_pidfd_open, pid, 0); + if (pidfd < 0) { + perror("pidfd_open"); + return 1; + } + + /* Get the mnt namespace for pidfd */ + mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, NULL); + if (mntns < 0) { + perror("PIDFD_GET_MNT_NAMESPACE"); + return 1; + } + close(pidfd); + + /* get info about mntns. In particular, the mnt_ns_id */ + ret = ioctl(mntns, NS_MNT_GET_INFO, &mni); + if (ret < 0) { + perror("NS_MNT_GET_INFO"); + return 1; + } + + do { + int ret; + + ret = dump_mounts(mni.mnt_ns_id); + if (ret) + return ret; + + if (!recursive) + break; + + /* get the next mntns (and overwrite the old mount ns info) */ + ret = ioctl(mntns, NS_MNT_GET_NEXT, &mni); + close(mntns); + mntns = ret; + } while (mntns >= 0); + + return 0; +} diff --git a/samples/vfs/samples-vfs.h b/samples/vfs/samples-vfs.h new file mode 100644 index 000000000000..498baf581b56 --- /dev/null +++ b/samples/vfs/samples-vfs.h @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SAMPLES_VFS_H +#define __SAMPLES_VFS_H + +#include <errno.h> +#include <linux/types.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> + +#define die_errno(format, ...) \ + do { \ + fprintf(stderr, "%m | %s: %d: %s: " format "\n", __FILE__, \ + __LINE__, __func__, ##__VA_ARGS__); \ + exit(EXIT_FAILURE); \ + } while (0) + +struct statmount { + __u32 size; /* Total size, including strings */ + __u32 mnt_opts; /* [str] Options (comma separated, escaped) */ + __u64 mask; /* What results were written */ + __u32 sb_dev_major; /* Device ID */ + __u32 sb_dev_minor; + __u64 sb_magic; /* ..._SUPER_MAGIC */ + __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */ + __u32 fs_type; /* [str] Filesystem type */ + __u64 mnt_id; /* Unique ID of mount */ + __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */ + __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */ + __u32 mnt_parent_id_old; + __u64 mnt_attr; /* MOUNT_ATTR_... */ + __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */ + __u64 mnt_peer_group; /* ID of shared peer group */ + __u64 mnt_master; /* Mount receives propagation from this ID */ + __u64 propagate_from; /* Propagation from in current namespace */ + __u32 mnt_root; /* [str] Root of mount relative to root of fs */ + __u32 mnt_point; /* [str] Mountpoint relative to current root */ + __u64 mnt_ns_id; /* ID of the mount namespace */ + __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */ + __u32 sb_source; /* [str] Source string of the mount */ + __u32 opt_num; /* Number of fs options */ + __u32 opt_array; /* [str] Array of nul terminated fs options */ + __u32 opt_sec_num; /* Number of security options */ + __u32 opt_sec_array; /* [str] Array of nul terminated security options */ + __u32 mnt_uidmap_num; /* Number of uid mappings */ + __u32 mnt_uidmap; /* [str] Array of uid mappings */ + __u32 mnt_gidmap_num; /* Number of gid mappings */ + __u32 mnt_gidmap; /* [str] Array of gid mappings */ + __u64 __spare2[44]; + char str[]; /* Variable size part containing strings */ +}; + +struct mnt_id_req { + __u32 size; + __u32 spare; + __u64 mnt_id; + __u64 param; + __u64 mnt_ns_id; +}; + +#ifndef MNT_ID_REQ_SIZE_VER0 +#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ +#endif + +#ifndef MNT_ID_REQ_SIZE_VER1 +#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ +#endif + +/* Get the id for a mount namespace */ +#ifndef NS_GET_MNTNS_ID +#define NS_GET_MNTNS_ID _IO(0xb7, 0x5) +#endif + +struct mnt_ns_info { + __u32 size; + __u32 nr_mounts; + __u64 mnt_ns_id; +}; + +#ifndef MNT_NS_INFO_SIZE_VER0 +#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */ +#endif + +#ifndef NS_MNT_GET_INFO +#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info) +#endif + +#ifndef NS_MNT_GET_NEXT +#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info) +#endif + +#ifndef NS_MNT_GET_PREV +#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info) +#endif + +#ifndef PIDFD_GET_MNT_NAMESPACE +#define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3) +#endif + +#ifndef __NR_listmount +#define __NR_listmount 458 +#endif + +#ifndef __NR_statmount +#define __NR_statmount 457 +#endif + +#ifndef LSMT_ROOT +#define LSMT_ROOT 0xffffffffffffffff /* root mount */ +#endif + +/* @mask bits for statmount(2) */ +#ifndef STATMOUNT_SB_BASIC +#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ +#endif + +#ifndef STATMOUNT_MNT_BASIC +#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */ +#endif + +#ifndef STATMOUNT_PROPAGATE_FROM +#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ +#endif + +#ifndef STATMOUNT_MNT_ROOT +#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ +#endif + +#ifndef STATMOUNT_MNT_POINT +#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ +#endif + +#ifndef STATMOUNT_FS_TYPE +#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ +#endif + +#ifndef STATMOUNT_MNT_NS_ID +#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ +#endif + +#ifndef STATMOUNT_MNT_OPTS +#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ +#endif + +#ifndef STATMOUNT_FS_SUBTYPE +#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ +#endif + +#ifndef STATMOUNT_SB_SOURCE +#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ +#endif + +#ifndef STATMOUNT_OPT_ARRAY +#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ +#endif + +#ifndef STATMOUNT_OPT_SEC_ARRAY +#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */ +#endif + +#ifndef STATX_MNT_ID_UNIQUE +#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ +#endif + +#ifndef STATMOUNT_MNT_UIDMAP +#define STATMOUNT_MNT_UIDMAP 0x00002000U /* Want/got uidmap... */ +#endif + +#ifndef STATMOUNT_MNT_GIDMAP +#define STATMOUNT_MNT_GIDMAP 0x00004000U /* Want/got gidmap... */ +#endif + +#ifndef MOUNT_ATTR_RDONLY +#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ +#endif + +#ifndef MOUNT_ATTR_NOSUID +#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */ +#endif + +#ifndef MOUNT_ATTR_NODEV +#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */ +#endif + +#ifndef MOUNT_ATTR_NOEXEC +#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */ +#endif + +#ifndef MOUNT_ATTR__ATIME +#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */ +#endif + +#ifndef MOUNT_ATTR_RELATIME +#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */ +#endif + +#ifndef MOUNT_ATTR_NOATIME +#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ +#endif + +#ifndef MOUNT_ATTR_STRICTATIME +#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ +#endif + +#ifndef MOUNT_ATTR_NODIRATIME +#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ +#endif + +#ifndef MOUNT_ATTR_IDMAP +#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ +#endif + +#ifndef MOUNT_ATTR_NOSYMFOLLOW +#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */ +#endif + +#ifndef MS_RDONLY +#define MS_RDONLY 1 /* Mount read-only */ +#endif + +#ifndef MS_SYNCHRONOUS +#define MS_SYNCHRONOUS 16 /* Writes are synced at once */ +#endif + +#ifndef MS_MANDLOCK +#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ +#endif + +#ifndef MS_DIRSYNC +#define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#endif + +#ifndef MS_UNBINDABLE +#define MS_UNBINDABLE (1<<17) /* change to unbindable */ +#endif + +#ifndef MS_PRIVATE +#define MS_PRIVATE (1<<18) /* change to private */ +#endif + +#ifndef MS_SLAVE +#define MS_SLAVE (1<<19) /* change to slave */ +#endif + +#ifndef MS_SHARED +#define MS_SHARED (1<<20) /* change to shared */ +#endif + +#ifndef MS_LAZYTIME +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ +#endif + +#endif /* __SAMPLES_VFS_H */ diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c new file mode 100644 index 000000000000..50f47b72e85f --- /dev/null +++ b/samples/vfs/test-fsmount.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* fd-based mount test. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <linux/mount.h> +#include <linux/unistd.h> + +#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0) + +static void check_messages(int fd) +{ + char buf[4096]; + int err, n; + + err = errno; + + for (;;) { + n = read(fd, buf, sizeof(buf)); + if (n < 0) + break; + n -= 2; + + switch (buf[0]) { + case 'e': + fprintf(stderr, "Error: %*.*s\n", n, n, buf + 2); + break; + case 'w': + fprintf(stderr, "Warning: %*.*s\n", n, n, buf + 2); + break; + case 'i': + fprintf(stderr, "Info: %*.*s\n", n, n, buf + 2); + break; + } + } + + errno = err; +} + +static __attribute__((noreturn)) +void mount_error(int fd, const char *s) +{ + check_messages(fd); + fprintf(stderr, "%s: %m\n", s); + exit(1); +} + +/* Hope -1 isn't a syscall */ +#ifndef __NR_fsopen +#define __NR_fsopen -1 +#endif +#ifndef __NR_fsmount +#define __NR_fsmount -1 +#endif +#ifndef __NR_fsconfig +#define __NR_fsconfig -1 +#endif +#ifndef __NR_move_mount +#define __NR_move_mount -1 +#endif + + +static inline int fsopen(const char *fs_name, unsigned int flags) +{ + return syscall(__NR_fsopen, fs_name, flags); +} + +static inline int fsmount(int fsfd, unsigned int flags, unsigned int ms_flags) +{ + return syscall(__NR_fsmount, fsfd, flags, ms_flags); +} + +static inline int fsconfig(int fsfd, unsigned int cmd, + const char *key, const void *val, int aux) +{ + return syscall(__NR_fsconfig, fsfd, cmd, key, val, aux); +} + +static inline int move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, + from_dfd, from_pathname, + to_dfd, to_pathname, flags); +} + +#define E_fsconfig(fd, cmd, key, val, aux) \ + do { \ + if (fsconfig(fd, cmd, key, val, aux) == -1) \ + mount_error(fd, key ?: "create"); \ + } while (0) + +int main(int argc, char *argv[]) +{ + int fsfd, mfd; + + /* Mount a publically available AFS filesystem */ + fsfd = fsopen("afs", 0); + if (fsfd == -1) { + perror("fsopen"); + exit(1); + } + + E_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell.", 0); + E_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); + + mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY); + if (mfd < 0) + mount_error(fsfd, "fsmount"); + E(close(fsfd)); + + if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) { + perror("move_mount"); + exit(1); + } + + E(close(mfd)); + exit(0); +} diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c new file mode 100644 index 000000000000..713c174626aa --- /dev/null +++ b/samples/vfs/test-list-all-mounts.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Christian Brauner <brauner@kernel.org> + +#define _GNU_SOURCE +#include <errno.h> +#include <limits.h> +#include <linux/types.h> +#include <inttypes.h> +#include <stdio.h> + +#include "../../tools/testing/selftests/pidfd/pidfd.h" +#include "samples-vfs.h" + +static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask, + struct statmount *stmnt, size_t bufsize, + unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER1, + .mnt_id = mnt_id, + .param = mask, + .mnt_ns_id = mnt_ns_id, + }; + + return syscall(__NR_statmount, &req, stmnt, bufsize, flags); +} + +static struct statmount *sys_statmount(__u64 mnt_id, __u64 mnt_ns_id, + __u64 mask, unsigned int flags) +{ + size_t bufsize = 1 << 15; + struct statmount *stmnt = NULL, *tmp = NULL; + int ret; + + for (;;) { + tmp = realloc(stmnt, bufsize); + if (!tmp) + goto out; + + stmnt = tmp; + ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt, bufsize, flags); + if (!ret) + return stmnt; + + if (errno != EOVERFLOW) + goto out; + + bufsize <<= 1; + if (bufsize >= UINT_MAX / 2) + goto out; + } + +out: + free(stmnt); + return NULL; +} + +static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64 mnt_ns_id, + __u64 list[], size_t num, unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER1, + .mnt_id = mnt_id, + .param = last_mnt_id, + .mnt_ns_id = mnt_ns_id, + }; + + return syscall(__NR_listmount, &req, list, num, flags); +} + +int main(int argc, char *argv[]) +{ +#define LISTMNT_BUFFER 10 + __u64 list[LISTMNT_BUFFER], last_mnt_id = 0; + int ret, pidfd, fd_mntns; + struct mnt_ns_info info = {}; + + pidfd = sys_pidfd_open(getpid(), 0); + if (pidfd < 0) + die_errno("pidfd_open failed"); + + fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0); + if (fd_mntns < 0) + die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed"); + + ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info); + if (ret < 0) + die_errno("ioctl(NS_GET_MNTNS_ID) failed"); + + printf("Listing %u mounts for mount namespace %" PRIu64 "\n", + info.nr_mounts, (uint64_t)info.mnt_ns_id); + for (;;) { + ssize_t nr_mounts; +next: + nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id, + info.mnt_ns_id, list, LISTMNT_BUFFER, + 0); + if (nr_mounts <= 0) { + int fd_mntns_next; + + printf("Finished listing %u mounts for mount namespace %" PRIu64 "\n\n", + info.nr_mounts, (uint64_t)info.mnt_ns_id); + fd_mntns_next = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info); + if (fd_mntns_next < 0) { + if (errno == ENOENT) { + printf("Finished listing all mount namespaces\n"); + exit(0); + } + die_errno("ioctl(NS_MNT_GET_NEXT) failed"); + } + close(fd_mntns); + fd_mntns = fd_mntns_next; + last_mnt_id = 0; + printf("Listing %u mounts for mount namespace %" PRIu64 "\n", + info.nr_mounts, (uint64_t)info.mnt_ns_id); + goto next; + } + + for (size_t cur = 0; cur < nr_mounts; cur++) { + struct statmount *stmnt; + + last_mnt_id = list[cur]; + + stmnt = sys_statmount(last_mnt_id, info.mnt_ns_id, + STATMOUNT_SB_BASIC | + STATMOUNT_MNT_BASIC | + STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT | + STATMOUNT_MNT_NS_ID | + STATMOUNT_MNT_OPTS | + STATMOUNT_FS_TYPE | + STATMOUNT_MNT_UIDMAP | + STATMOUNT_MNT_GIDMAP, 0); + if (!stmnt) { + printf("Failed to statmount(%" PRIu64 ") in mount namespace(%" PRIu64 ")\n", + (uint64_t)last_mnt_id, (uint64_t)info.mnt_ns_id); + continue; + } + + printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n", + (uint64_t)stmnt->mnt_id, + (uint64_t)stmnt->mnt_parent_id, + (stmnt->mask & STATMOUNT_FS_TYPE) ? stmnt->str + stmnt->fs_type : "", + (stmnt->mask & STATMOUNT_MNT_ROOT) ? stmnt->str + stmnt->mnt_root : "", + (stmnt->mask & STATMOUNT_MNT_POINT) ? stmnt->str + stmnt->mnt_point : "", + (stmnt->mask & STATMOUNT_MNT_OPTS) ? stmnt->str + stmnt->mnt_opts : ""); + + if (stmnt->mask & STATMOUNT_MNT_UIDMAP) { + const char *idmap = stmnt->str + stmnt->mnt_uidmap; + + for (size_t idx = 0; idx < stmnt->mnt_uidmap_num; idx++) { + printf("mnt_uidmap[%zu]:\t%s\n", idx, idmap); + idmap += strlen(idmap) + 1; + } + } + + if (stmnt->mask & STATMOUNT_MNT_GIDMAP) { + const char *idmap = stmnt->str + stmnt->mnt_gidmap; + + for (size_t idx = 0; idx < stmnt->mnt_gidmap_num; idx++) { + printf("mnt_gidmap[%zu]:\t%s\n", idx, idmap); + idmap += strlen(idmap) + 1; + } + } + + printf("\n"); + + free(stmnt); + } + } + + exit(0); +} diff --git a/samples/statx/test-statx.c b/samples/vfs/test-statx.c index d4d77b09412c..424a6fa15723 100644 --- a/samples/statx/test-statx.c +++ b/samples/vfs/test-statx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Test the statx() system call. * * Note that the output of this program is intended to look like the output of @@ -5,11 +6,6 @@ * * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #define _GNU_SOURCE @@ -23,15 +19,31 @@ #include <time.h> #include <sys/syscall.h> #include <sys/types.h> + +// Work around glibc header silliness +#undef AT_RENAME_NOREPLACE +#undef AT_RENAME_EXCHANGE +#undef AT_RENAME_WHITEOUT + #include <linux/stat.h> #include <linux/fcntl.h> +#define statx foo +#define statx_timestamp foo_timestamp +struct statx; +struct statx_timestamp; #include <sys/stat.h> +#undef statx +#undef statx_timestamp #define AT_STATX_SYNC_TYPE 0x6000 #define AT_STATX_SYNC_AS_STAT 0x0000 #define AT_STATX_FORCE_SYNC 0x2000 #define AT_STATX_DONT_SYNC 0x4000 +#ifndef __NR_statx +#define __NR_statx -1 +#endif + static __attribute__((unused)) ssize_t statx(int dfd, const char *filename, unsigned flags, unsigned int mask, struct statx *buffer) @@ -157,7 +169,8 @@ static void dump_statx(struct statx *stx) "?dai?c??" /* 7- 0 0x00000000-000000ff */ ; - printf("Attributes: %016llx (", stx->stx_attributes); + printf("Attributes: %016llx (", + (unsigned long long)stx->stx_attributes); for (byte = 64 - 8; byte >= 0; byte -= 8) { bits = stx->stx_attributes >> byte; mbits = stx->stx_attributes_mask >> byte; @@ -211,7 +224,7 @@ int main(int argc, char **argv) struct statx stx; int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW; - unsigned int mask = STATX_ALL; + unsigned int mask = STATX_BASIC_STATS | STATX_BTIME; for (argv++; *argv; argv++) { if (strcmp(*argv, "-F") == 0) { diff --git a/samples/watch_queue/.gitignore b/samples/watch_queue/.gitignore new file mode 100644 index 000000000000..823b351d3db9 --- /dev/null +++ b/samples/watch_queue/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/watch_test diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile new file mode 100644 index 000000000000..c0db3a6bc524 --- /dev/null +++ b/samples/watch_queue/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += watch_test + +userccflags += -I usr/include diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c new file mode 100644 index 000000000000..24cf7d7a1972 --- /dev/null +++ b/samples/watch_queue/watch_test.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Use watch_queue API to watch for notifications. + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define _GNU_SOURCE +#include <stdbool.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> +#include <unistd.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <limits.h> + +// Work around glibc header silliness +#undef AT_RENAME_NOREPLACE +#undef AT_RENAME_EXCHANGE +#undef AT_RENAME_WHITEOUT + +#include <linux/watch_queue.h> +#include <linux/unistd.h> +#include <linux/keyctl.h> + +#ifndef KEYCTL_WATCH_KEY +#define KEYCTL_WATCH_KEY -1 +#endif +#ifndef __NR_keyctl +#define __NR_keyctl -1 +#endif + +#define BUF_SIZE 256 + +static long keyctl_watch_key(int key, int watch_fd, int watch_id) +{ + return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id); +} + +static const char *key_subtypes[256] = { + [NOTIFY_KEY_INSTANTIATED] = "instantiated", + [NOTIFY_KEY_UPDATED] = "updated", + [NOTIFY_KEY_LINKED] = "linked", + [NOTIFY_KEY_UNLINKED] = "unlinked", + [NOTIFY_KEY_CLEARED] = "cleared", + [NOTIFY_KEY_REVOKED] = "revoked", + [NOTIFY_KEY_INVALIDATED] = "invalidated", + [NOTIFY_KEY_SETATTR] = "setattr", +}; + +static void saw_key_change(struct watch_notification *n, size_t len) +{ + struct key_notification *k = (struct key_notification *)n; + + if (len != sizeof(struct key_notification)) { + fprintf(stderr, "Incorrect key message length\n"); + return; + } + + printf("KEY %08x change=%u[%s] aux=%u\n", + k->key_id, n->subtype, key_subtypes[n->subtype], k->aux); +} + +/* + * Consume and display events. + */ +static void consumer(int fd) +{ + unsigned char buffer[433], *p, *end; + union { + struct watch_notification n; + unsigned char buf1[128]; + } n; + ssize_t buf_len; + + for (;;) { + buf_len = read(fd, buffer, sizeof(buffer)); + if (buf_len == -1) { + perror("read"); + exit(1); + } + + if (buf_len == 0) { + printf("-- END --\n"); + return; + } + + if (buf_len > sizeof(buffer)) { + fprintf(stderr, "Read buffer overrun: %zd\n", buf_len); + return; + } + + printf("read() = %zd\n", buf_len); + + p = buffer; + end = buffer + buf_len; + while (p < end) { + size_t largest, len; + + largest = end - p; + if (largest > 128) + largest = 128; + if (largest < sizeof(struct watch_notification)) { + fprintf(stderr, "Short message header: %zu\n", largest); + return; + } + memcpy(&n, p, largest); + + printf("NOTIFY[%03zx]: ty=%06x sy=%02x i=%08x\n", + p - buffer, n.n.type, n.n.subtype, n.n.info); + + len = n.n.info & WATCH_INFO_LENGTH; + if (len < sizeof(n.n) || len > largest) { + fprintf(stderr, "Bad message length: %zu/%zu\n", len, largest); + exit(1); + } + + switch (n.n.type) { + case WATCH_TYPE_META: + switch (n.n.subtype) { + case WATCH_META_REMOVAL_NOTIFICATION: + printf("REMOVAL of watchpoint %08x\n", + (n.n.info & WATCH_INFO_ID) >> + WATCH_INFO_ID__SHIFT); + break; + case WATCH_META_LOSS_NOTIFICATION: + printf("-- LOSS --\n"); + break; + default: + printf("other meta record\n"); + break; + } + break; + case WATCH_TYPE_KEY_NOTIFY: + saw_key_change(&n.n, len); + break; + default: + printf("other type\n"); + break; + } + + p += len; + } + } +} + +static struct watch_notification_filter filter = { + .nr_filters = 1, + .filters = { + [0] = { + .type = WATCH_TYPE_KEY_NOTIFY, + .subtype_filter[0] = UINT_MAX, + }, + }, +}; + +int main(int argc, char **argv) +{ + int pipefd[2], fd; + + if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) { + perror("pipe2"); + exit(1); + } + fd = pipefd[0]; + + if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) { + perror("watch_queue(size)"); + exit(1); + } + + if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) { + perror("watch_queue(filter)"); + exit(1); + } + + if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) { + perror("keyctl"); + exit(1); + } + + if (keyctl_watch_key(KEY_SPEC_USER_KEYRING, fd, 0x02) == -1) { + perror("keyctl"); + exit(1); + } + + consumer(fd); + exit(0); +} diff --git a/samples/watchdog/.gitignore b/samples/watchdog/.gitignore index ff0ebb540333..a70a0150ed9f 100644 --- a/samples/watchdog/.gitignore +++ b/samples/watchdog/.gitignore @@ -1 +1,2 @@ -watchdog-simple +# SPDX-License-Identifier: GPL-2.0-only +/watchdog-simple diff --git a/samples/watchdog/Makefile b/samples/watchdog/Makefile index 9b53d89b1ccf..ab39d23dc96b 100644 --- a/samples/watchdog/Makefile +++ b/samples/watchdog/Makefile @@ -1,8 +1,2 @@ -CC := $(CROSS_COMPILE)gcc -PROGS := watchdog-simple - -all: $(PROGS) - -clean: - rm -fr $(PROGS) - +# SPDX-License-Identifier: GPL-2.0 +userprogs-always-y += watchdog-simple diff --git a/samples/watchdog/watchdog-simple.c b/samples/watchdog/watchdog-simple.c index ba45803a2216..9ce66d2ca2a9 100644 --- a/samples/watchdog/watchdog-simple.c +++ b/samples/watchdog/watchdog-simple.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <stdlib.h> #include <unistd.h> |
