summaryrefslogtreecommitdiff
path: root/samples
diff options
context:
space:
mode:
Diffstat (limited to 'samples')
-rw-r--r--samples/Kconfig273
-rw-r--r--samples/Makefile47
-rw-r--r--samples/acrn/Makefile12
-rw-r--r--samples/acrn/guest.ld9
-rw-r--r--samples/acrn/payload.ld9
-rw-r--r--samples/acrn/vm-sample.c132
-rw-r--r--samples/auxdisplay/.gitignore2
-rw-r--r--samples/auxdisplay/Makefile2
-rw-r--r--samples/auxdisplay/cfag12864b-example.c267
-rw-r--r--samples/binderfs/.gitignore2
-rw-r--r--samples/binderfs/Makefile4
-rw-r--r--samples/binderfs/binderfs_example.c82
-rw-r--r--samples/bpf/.gitignore51
-rw-r--r--samples/bpf/Makefile392
-rw-r--r--samples/bpf/Makefile.target75
-rw-r--r--samples/bpf/README.rst121
-rw-r--r--samples/bpf/asm_goto_workaround.h28
-rw-r--r--samples/bpf/bpf_insn.h233
-rw-r--r--samples/bpf/cookie_uid_helper_example.c332
-rw-r--r--samples/bpf/cpustat_kern.c280
-rw-r--r--samples/bpf/cpustat_user.c251
-rwxr-xr-xsamples/bpf/do_hbm_test.sh438
-rw-r--r--samples/bpf/fds_example.c195
-rw-r--r--samples/bpf/gnu/stubs.h1
-rw-r--r--samples/bpf/hash_func01.h55
-rw-r--r--samples/bpf/hbm.c515
-rw-r--r--samples/bpf/hbm.h38
-rw-r--r--samples/bpf/hbm_edt_kern.c168
-rw-r--r--samples/bpf/hbm_kern.h215
-rw-r--r--samples/bpf/hbm_out_kern.c179
-rw-r--r--samples/bpf/ibumad_kern.c138
-rw-r--r--samples/bpf/ibumad_user.c158
-rw-r--r--samples/bpf/lathist_kern.c99
-rw-r--r--samples/bpf/lathist_user.c130
-rw-r--r--samples/bpf/lwt_len_hist.bpf.c62
-rwxr-xr-xsamples/bpf/lwt_len_hist.sh40
-rw-r--r--samples/bpf/lwt_len_hist_user.c77
-rw-r--r--samples/bpf/map_perf_test.bpf.c297
-rw-r--r--samples/bpf/map_perf_test_user.c503
-rw-r--r--samples/bpf/net_shared.h34
-rw-r--r--samples/bpf/offwaketime.bpf.c141
-rw-r--r--samples/bpf/offwaketime_user.c153
-rw-r--r--samples/bpf/parse_ldabs.c43
-rw-r--r--samples/bpf/parse_simple.c49
-rw-r--r--samples/bpf/parse_varlen.c150
-rwxr-xr-xsamples/bpf/run_cookie_uid_helper_example.sh15
-rw-r--r--samples/bpf/sampleip_kern.c38
-rw-r--r--samples/bpf/sampleip_user.c234
-rw-r--r--samples/bpf/sock_example.c111
-rw-r--r--samples/bpf/sock_example.h35
-rw-r--r--samples/bpf/sockex1_kern.c30
-rw-r--r--samples/bpf/sockex1_user.c63
-rw-r--r--samples/bpf/sockex2_kern.c222
-rw-r--r--samples/bpf/sockex2_user.c62
-rw-r--r--samples/bpf/sockex3_kern.c304
-rw-r--r--samples/bpf/sockex3_user.c100
-rw-r--r--samples/bpf/spintest.bpf.c60
-rw-r--r--samples/bpf/spintest_user.c84
-rw-r--r--samples/bpf/syscall_nrs.c24
-rw-r--r--samples/bpf/syscall_tp_kern.c102
-rw-r--r--samples/bpf/syscall_tp_user.c155
-rw-r--r--samples/bpf/task_fd_query_kern.c19
-rw-r--r--samples/bpf/task_fd_query_user.c423
-rwxr-xr-xsamples/bpf/tc_l2_redirect.sh177
-rw-r--r--samples/bpf/tc_l2_redirect_kern.c231
-rw-r--r--samples/bpf/tc_l2_redirect_user.c70
-rw-r--r--samples/bpf/tcbpf1_kern.c91
-rw-r--r--samples/bpf/tcp_basertt_kern.c71
-rw-r--r--samples/bpf/tcp_bpf.readme28
-rw-r--r--samples/bpf/tcp_bufs_kern.c81
-rw-r--r--samples/bpf/tcp_clamp_kern.c97
-rw-r--r--samples/bpf/tcp_cong_kern.c78
-rw-r--r--samples/bpf/tcp_dumpstats_kern.c68
-rw-r--r--samples/bpf/tcp_iw_kern.c83
-rw-r--r--samples/bpf/tcp_rwnd_kern.c64
-rw-r--r--samples/bpf/tcp_synrto_kern.c64
-rw-r--r--samples/bpf/tcp_tos_reflect_kern.c80
-rwxr-xr-xsamples/bpf/test_cls_bpf.sh38
-rw-r--r--samples/bpf/test_lru_dist.c534
-rw-r--r--samples/bpf/test_lwt_bpf.c245
-rwxr-xr-xsamples/bpf/test_lwt_bpf.sh405
-rw-r--r--samples/bpf/test_map_in_map.bpf.c172
-rw-r--r--samples/bpf/test_map_in_map_user.c168
-rw-r--r--samples/bpf/trace_event_kern.c79
-rw-r--r--samples/bpf/trace_event_user.c352
-rw-r--r--samples/bpf/trace_output.bpf.c29
-rw-r--r--samples/bpf/trace_output_user.c105
-rw-r--r--samples/bpf/tracex1.bpf.c47
-rw-r--r--samples/bpf/tracex1_user.c50
-rw-r--r--samples/bpf/tracex3.bpf.c100
-rw-r--r--samples/bpf/tracex3_user.c183
-rw-r--r--samples/bpf/tracex4.bpf.c54
-rw-r--r--samples/bpf/tracex4_user.c96
-rw-r--r--samples/bpf/tracex5.bpf.c93
-rw-r--r--samples/bpf/tracex5_user.c98
-rw-r--r--samples/bpf/tracex6.bpf.c81
-rw-r--r--samples/bpf/tracex6_user.c222
-rwxr-xr-xsamples/bpf/xdp2skb_meta.sh220
-rw-r--r--samples/bpf/xdp2skb_meta_kern.c104
-rw-r--r--samples/bpf/xdp_adjust_tail_kern.c156
-rw-r--r--samples/bpf/xdp_adjust_tail_user.c198
-rw-r--r--samples/bpf/xdp_fwd_kern.c158
-rw-r--r--samples/bpf/xdp_fwd_user.c226
-rw-r--r--samples/bpf/xdp_router_ipv4.bpf.c189
-rw-r--r--samples/bpf/xdp_router_ipv4_user.c699
-rw-r--r--samples/bpf/xdp_sample.bpf.c266
-rw-r--r--samples/bpf/xdp_sample.bpf.h121
-rw-r--r--samples/bpf/xdp_sample_shared.h17
-rw-r--r--samples/bpf/xdp_sample_user.c1673
-rw-r--r--samples/bpf/xdp_sample_user.h110
-rw-r--r--samples/bpf/xdp_tx_iptunnel_common.h34
-rw-r--r--samples/bpf/xdp_tx_iptunnel_kern.c237
-rw-r--r--samples/bpf/xdp_tx_iptunnel_user.c310
-rw-r--r--samples/cgroup/.gitignore3
-rw-r--r--samples/cgroup/Makefile5
-rw-r--r--samples/cgroup/cgroup_event_listener.c83
-rw-r--r--samples/cgroup/memcg_event_listener.c328
-rw-r--r--samples/check-exec/.gitignore2
-rw-r--r--samples/check-exec/Makefile15
-rw-r--r--samples/check-exec/inc.c212
-rwxr-xr-xsamples/check-exec/run-script-ask.sh9
-rwxr-xr-xsamples/check-exec/script-ask.inc5
-rwxr-xr-xsamples/check-exec/script-exec.inc4
-rw-r--r--samples/check-exec/script-noexec.inc4
-rw-r--r--samples/check-exec/set-exec.c85
-rw-r--r--samples/configfs/Makefile3
-rw-r--r--samples/configfs/configfs_sample.c368
-rw-r--r--samples/connector/.gitignore2
-rw-r--r--samples/connector/Makefile6
-rw-r--r--samples/connector/cn_test.c188
-rw-r--r--samples/connector/ucon.c236
-rw-r--r--samples/coresight/Makefile4
-rw-r--r--samples/coresight/coresight-cfg-sample.c73
-rw-r--r--samples/damon/Kconfig43
-rw-r--r--samples/damon/Makefile5
-rw-r--r--samples/damon/mtier.c240
-rw-r--r--samples/damon/prcl.c169
-rw-r--r--samples/damon/wsse.c149
-rw-r--r--samples/fanotify/.gitignore1
-rw-r--r--samples/fanotify/Makefile5
-rw-r--r--samples/fanotify/fs-monitor.c149
-rw-r--r--samples/fprobe/Makefile3
-rw-r--r--samples/fprobe/fprobe_example.c154
-rw-r--r--samples/ftrace/Makefile11
-rw-r--r--samples/ftrace/ftrace-direct-modify.c339
-rw-r--r--samples/ftrace/ftrace-direct-multi-modify.c383
-rw-r--r--samples/ftrace/ftrace-direct-multi.c241
-rw-r--r--samples/ftrace/ftrace-direct-too.c256
-rw-r--r--samples/ftrace/ftrace-direct.c223
-rw-r--r--samples/ftrace/ftrace-ops.c252
-rw-r--r--samples/ftrace/sample-trace-array.c143
-rw-r--r--samples/ftrace/sample-trace-array.h84
-rw-r--r--samples/hid/.gitignore8
-rw-r--r--samples/hid/Makefile250
-rw-r--r--samples/hid/Makefile.target75
-rw-r--r--samples/hid/hid_bpf_helpers.h21
-rw-r--r--samples/hid/hid_mouse.bpf.c128
-rw-r--r--samples/hid/hid_mouse.c138
-rw-r--r--samples/hid/hid_surface_dial.bpf.c140
-rw-r--r--samples/hid/hid_surface_dial.c203
-rw-r--r--samples/hidraw/.gitignore2
-rw-r--r--samples/hidraw/Makefile12
-rw-r--r--samples/hidraw/hid-example.c11
-rw-r--r--samples/hung_task/Makefile2
-rw-r--r--samples/hung_task/hung_task_tests.c164
-rw-r--r--samples/hw_breakpoint/Makefile1
-rw-r--r--samples/hw_breakpoint/data_breakpoint.c34
-rw-r--r--samples/kdb/Makefile1
-rw-r--r--samples/kdb/kdb_hello.c20
-rw-r--r--samples/kfifo/Makefile1
-rw-r--r--samples/kfifo/bytestream-example.c40
-rw-r--r--samples/kfifo/dma-example.c19
-rw-r--r--samples/kfifo/inttype-example.c40
-rw-r--r--samples/kfifo/record-example.c36
-rw-r--r--samples/kmemleak/Makefile3
-rw-r--r--samples/kmemleak/kmemleak-test.c102
-rw-r--r--samples/kobject/Makefile1
-rw-r--r--samples/kobject/kobject-example.c36
-rw-r--r--samples/kobject/kset-example.c84
-rw-r--r--samples/kprobes/Makefile3
-rw-r--r--samples/kprobes/jprobe_example.c68
-rw-r--r--samples/kprobes/kprobe_example.c107
-rw-r--r--samples/kprobes/kretprobe_example.c31
-rw-r--r--samples/landlock/.gitignore1
-rw-r--r--samples/landlock/Makefile13
-rw-r--r--samples/landlock/sandboxer.c539
-rw-r--r--samples/livepatch/Makefile8
-rw-r--r--samples/livepatch/livepatch-callbacks-busymod.c60
-rw-r--r--samples/livepatch/livepatch-callbacks-demo.c197
-rw-r--r--samples/livepatch/livepatch-callbacks-mod.c42
-rw-r--r--samples/livepatch/livepatch-sample.c71
-rw-r--r--samples/livepatch/livepatch-shadow-fix1.c173
-rw-r--r--samples/livepatch/livepatch-shadow-fix2.c133
-rw-r--r--samples/livepatch/livepatch-shadow-mod.c212
-rw-r--r--samples/mei/.gitignore2
-rw-r--r--samples/mei/Makefile5
-rw-r--r--samples/mei/mei-amt-version.c488
-rw-r--r--samples/nitro_enclaves/.gitignore2
-rw-r--r--samples/nitro_enclaves/Makefile16
-rw-r--r--samples/nitro_enclaves/ne_ioctl_sample.c882
-rw-r--r--samples/pfsm/.gitignore2
-rw-r--r--samples/pfsm/Makefile4
-rw-r--r--samples/pfsm/pfsm-wakeup.c125
-rw-r--r--samples/pidfd/.gitignore2
-rw-r--r--samples/pidfd/Makefile4
-rw-r--r--samples/pidfd/pidfd-metadata.c120
-rw-r--r--samples/pktgen/README.rst64
-rw-r--r--samples/pktgen/functions.sh340
-rw-r--r--samples/pktgen/parameters.sh139
-rwxr-xr-xsamples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh113
-rwxr-xr-xsamples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh93
-rwxr-xr-xsamples/pktgen/pktgen_sample01_simple.sh104
-rwxr-xr-xsamples/pktgen/pktgen_sample02_multiqueue.sh110
-rwxr-xr-xsamples/pktgen/pktgen_sample03_burst_single_flow.sh110
-rwxr-xr-xsamples/pktgen/pktgen_sample04_many_flows.sh122
-rwxr-xr-xsamples/pktgen/pktgen_sample05_flow_per_thread.sh106
-rwxr-xr-xsamples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh128
-rw-r--r--samples/qmi/Makefile2
-rw-r--r--samples/qmi/qmi_sample_client.c620
-rw-r--r--samples/rpmsg/Makefile1
-rw-r--r--samples/rpmsg/rpmsg_client_sample.c64
-rw-r--r--samples/rust/Kconfig171
-rw-r--r--samples/rust/Makefile21
-rw-r--r--samples/rust/hostprogs/.gitignore3
-rw-r--r--samples/rust/hostprogs/Makefile5
-rw-r--r--samples/rust/hostprogs/a.rs7
-rw-r--r--samples/rust/hostprogs/b.rs5
-rw-r--r--samples/rust/hostprogs/single.rs12
-rw-r--r--samples/rust/rust_configfs.rs192
-rw-r--r--samples/rust/rust_debugfs.rs163
-rw-r--r--samples/rust/rust_debugfs_scoped.rs140
-rw-r--r--samples/rust/rust_dma.rs121
-rw-r--r--samples/rust/rust_driver_auxiliary.rs128
-rw-r--r--samples/rust/rust_driver_faux.rs29
-rw-r--r--samples/rust/rust_driver_i2c.rs74
-rw-r--r--samples/rust/rust_driver_pci.rs119
-rw-r--r--samples/rust/rust_driver_platform.rs191
-rw-r--r--samples/rust/rust_driver_usb.rs46
-rw-r--r--samples/rust/rust_i2c_client.rs147
-rw-r--r--samples/rust/rust_minimal.rs48
-rw-r--r--samples/rust/rust_misc_device.rs272
-rw-r--r--samples/rust/rust_print_events.c8
-rw-r--r--samples/rust/rust_print_main.rs117
-rw-r--r--samples/seccomp/.gitignore8
-rw-r--r--samples/seccomp/Makefile42
-rw-r--r--samples/seccomp/bpf-direct.c1
-rw-r--r--samples/seccomp/bpf-fancy.c5
-rw-r--r--samples/seccomp/bpf-helper.c45
-rw-r--r--samples/seccomp/bpf-helper.h134
-rw-r--r--samples/seccomp/dropper.c17
-rw-r--r--samples/seccomp/user-trap.c379
-rw-r--r--samples/timers/.gitignore2
-rw-r--r--samples/timers/Makefile4
-rw-r--r--samples/timers/hpet_example.c295
-rw-r--r--samples/trace_events/Makefile3
-rw-r--r--samples/trace_events/trace-events-sample.c105
-rw-r--r--samples/trace_events/trace-events-sample.h535
-rw-r--r--samples/trace_events/trace_custom_sched.c59
-rw-r--r--samples/trace_events/trace_custom_sched.h96
-rw-r--r--samples/trace_printk/Makefile7
-rw-r--r--samples/trace_printk/trace-printk.c58
-rw-r--r--samples/tsm-mr/Makefile2
-rw-r--r--samples/tsm-mr/tsm_mr_sample.c131
-rw-r--r--samples/uhid/.gitignore2
-rw-r--r--samples/uhid/Makefile12
-rw-r--r--samples/uhid/uhid-example.c128
-rw-r--r--samples/user_events/Makefile5
-rw-r--r--samples/user_events/example.c73
-rw-r--r--samples/v4l/Makefile2
-rw-r--r--samples/v4l/v4l2-pci-skeleton.c900
-rw-r--r--samples/vfio-mdev/Makefile5
-rw-r--r--samples/vfio-mdev/README.rst100
-rw-r--r--samples/vfio-mdev/mbochs.c1451
-rw-r--r--samples/vfio-mdev/mdpy-defs.h22
-rw-r--r--samples/vfio-mdev/mdpy-fb.c233
-rw-r--r--samples/vfio-mdev/mdpy.c743
-rw-r--r--samples/vfio-mdev/mtty.c2040
-rw-r--r--samples/vfs/.gitignore5
-rw-r--r--samples/vfs/Makefile5
-rw-r--r--samples/vfs/mountinfo.c274
-rw-r--r--samples/vfs/samples-vfs.h253
-rw-r--r--samples/vfs/test-fsmount.c129
-rw-r--r--samples/vfs/test-list-all-mounts.c173
-rw-r--r--samples/vfs/test-statx.c271
-rw-r--r--samples/watch_queue/.gitignore2
-rw-r--r--samples/watch_queue/Makefile4
-rw-r--r--samples/watch_queue/watch_test.c192
-rw-r--r--samples/watchdog/.gitignore2
-rw-r--r--samples/watchdog/Makefile2
-rw-r--r--samples/watchdog/watchdog-simple.c25
290 files changed, 39714 insertions, 484 deletions
diff --git a/samples/Kconfig b/samples/Kconfig
index 6181c2cc9ca0..5bc7c9e5a59e 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menuconfig SAMPLES
bool "Sample kernel code"
help
@@ -5,15 +6,62 @@ menuconfig SAMPLES
if SAMPLES
+config SAMPLE_AUXDISPLAY
+ bool "auxdisplay sample"
+ depends on CC_CAN_LINK
+
config SAMPLE_TRACE_EVENTS
tristate "Build trace_events examples -- loadable modules only"
depends on EVENT_TRACING && m
help
- This build trace event example modules.
+ This builds the trace event example module.
+
+config SAMPLE_TRACE_CUSTOM_EVENTS
+ tristate "Build custom trace event example -- loadable modules only"
+ depends on EVENT_TRACING && m
+ help
+ This builds the custom trace event example module.
+
+config SAMPLE_TRACE_PRINTK
+ tristate "Build trace_printk module - tests various trace_printk formats"
+ depends on EVENT_TRACING && m
+ help
+ This builds a module that calls trace_printk() and can be used to
+ test various trace_printk() calls from a module.
+
+config SAMPLE_FTRACE_DIRECT
+ tristate "Build register_ftrace_direct() example"
+ depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m
+ depends on HAVE_SAMPLE_FTRACE_DIRECT
+ help
+ This builds an ftrace direct function example
+ that hooks to wake_up_process and prints the parameters.
+
+config SAMPLE_FTRACE_DIRECT_MULTI
+ tristate "Build register_ftrace_direct() on multiple ips example"
+ depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m
+ depends on HAVE_SAMPLE_FTRACE_DIRECT_MULTI
+ help
+ This builds an ftrace direct function example
+ that hooks to wake_up_process and schedule, and prints
+ the function addresses.
+
+config SAMPLE_FTRACE_OPS
+ tristate "Build custom ftrace ops example"
+ depends on FUNCTION_TRACER
+ help
+ This builds an ftrace ops example that hooks two functions and
+ measures the time taken to invoke one function a number of times.
+
+config SAMPLE_TRACE_ARRAY
+ tristate "Build sample module for kernel access to Ftrace instances"
+ depends on EVENT_TRACING && m
+ help
+ This builds a module that demonstrates the use of various APIs to
+ access Ftrace instances from within the kernel.
config SAMPLE_KOBJECT
- tristate "Build kobject examples -- loadable modules only"
- depends on m
+ tristate "Build kobject examples"
help
This config option will allow you to build a number of
different kobject sample modules showing how to use kobjects,
@@ -38,6 +86,13 @@ config SAMPLE_HW_BREAKPOINT
help
This builds kernel hardware breakpoint example modules.
+config SAMPLE_FPROBE
+ tristate "Build fprobe examples -- loadable modules only"
+ depends on FPROBE && m
+ help
+ This builds a fprobe example module. This module has an option 'symbol'.
+ You can specify a probed symbol or symbols separated with ','.
+
config SAMPLE_KFIFO
tristate "Build kfifo examples -- loadable modules only"
depends on m
@@ -55,6 +110,16 @@ config SAMPLE_KDB
Build an example of how to dynamically add the hello
command to the kdb shell.
+config SAMPLE_QMI_CLIENT
+ tristate "Build qmi client sample -- loadable modules only"
+ depends on m
+ depends on ARCH_QCOM
+ depends on NET
+ select QCOM_QMI_HELPERS
+ help
+ Build a QMI client sample driver, which demonstrates how to
+ communicate with a remote QRTR service, using QMI encoded messages.
+
config SAMPLE_RPMSG_CLIENT
tristate "Build rpmsg client sample -- loadable modules only"
depends on RPMSG && m
@@ -63,4 +128,206 @@ config SAMPLE_RPMSG_CLIENT
to communicate with an AMP-configured remote processor over
the rpmsg bus.
+config SAMPLE_LIVEPATCH
+ tristate "Build live patching samples -- loadable modules only"
+ depends on LIVEPATCH && m
+ help
+ Build sample live patch demonstrations.
+
+config SAMPLE_CONFIGFS
+ tristate "Build configfs patching sample -- loadable modules only"
+ depends on CONFIGFS_FS && m
+ help
+ Builds a sample configfs interface.
+
+config SAMPLE_CONNECTOR
+ tristate "Build connector sample -- loadable modules only"
+ depends on CONNECTOR && HEADERS_INSTALL && m
+ help
+ When enabled, this builds both a sample kernel module for
+ the connector interface and a user space tool to communicate
+ with it.
+ See also Documentation/driver-api/connector.rst
+
+config SAMPLE_FANOTIFY_ERROR
+ bool "Build fanotify error monitoring sample"
+ depends on FANOTIFY && CC_CAN_LINK && HEADERS_INSTALL
+ help
+ When enabled, this builds an example code that uses the
+ FAN_FS_ERROR fanotify mechanism to monitor filesystem
+ errors.
+ See also Documentation/admin-guide/filesystem-monitoring.rst.
+
+config SAMPLE_HIDRAW
+ bool "hidraw sample"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+
+config SAMPLE_LANDLOCK
+ bool "Landlock example"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build a simple Landlock sandbox manager able to start a process
+ restricted by a user-defined filesystem access control policy.
+
+config SAMPLE_PIDFD
+ bool "pidfd sample"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+
+config SAMPLE_SECCOMP
+ bool "Build seccomp sample code"
+ depends on SECCOMP_FILTER && CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build samples of seccomp filters using various methods of
+ BPF filter construction.
+
+config SAMPLE_TIMER
+ bool "Timer sample"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+
+config SAMPLE_TSM_MR
+ tristate "TSM measurement sample"
+ select TSM_MEASUREMENTS
+ select VIRT_DRIVERS
+ help
+ Build a sample module that emulates MRs (Measurement Registers) and
+ exposes them to user mode applications through the TSM sysfs
+ interface (/sys/class/misc/tsm_mr_sample/emulated_mr/).
+
+ The module name will be tsm-mr-sample when built as a module.
+
+config SAMPLE_UHID
+ bool "UHID sample"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build UHID sample program.
+
+config SAMPLE_VFIO_MDEV_MTTY
+ tristate "Build VFIO mtty example mediated device sample code"
+ depends on VFIO
+ select VFIO_MDEV
+ help
+ Build a virtual tty sample driver for use as a VFIO
+ mediated device
+
+config SAMPLE_VFIO_MDEV_MDPY
+ tristate "Build VFIO mdpy example mediated device sample code"
+ depends on VFIO
+ select VFIO_MDEV
+ help
+ Build a virtual display sample driver for use as a VFIO
+ mediated device. It is a simple framebuffer and supports
+ the region display interface (VFIO_GFX_PLANE_TYPE_REGION).
+
+config SAMPLE_VFIO_MDEV_MDPY_FB
+ tristate "Build VFIO mdpy example guest fbdev driver"
+ depends on FB
+ select FB_IOMEM_HELPERS
+ help
+ Guest fbdev driver for the virtual display sample driver.
+
+config SAMPLE_VFIO_MDEV_MBOCHS
+ tristate "Build VFIO mbochs example mediated device sample code"
+ depends on VFIO
+ select VFIO_MDEV
+ select DMA_SHARED_BUFFER
+ help
+ Build a virtual display sample driver for use as a VFIO
+ mediated device. It supports the region display interface
+ (VFIO_GFX_PLANE_TYPE_DMABUF).
+ Emulate enough of qemu stdvga to make bochs-drm.ko happy.
+ That is basically the vram memory bar and the bochs dispi
+ interface vbe registers in the mmio register bar.
+ Specifically it does *not* include any legacy vga stuff.
+ Device looks a lot like "qemu -device secondary-vga".
+
+config SAMPLE_ANDROID_BINDERFS
+ bool "Build Android binderfs example"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Builds a sample program to illustrate the use of the Android binderfs
+ filesystem.
+
+config SAMPLE_VFS
+ bool "Build example programs that use new VFS system calls"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build example userspace programs that use new VFS system calls such
+ as mount API and statx(). Note that this is restricted to the x86
+ arch whilst it accesses system calls that aren't yet in all arches.
+
+config SAMPLE_INTEL_MEI
+ bool "Build example program working with intel mei driver"
+ depends on INTEL_MEI
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build a sample program to work with mei device.
+
+config SAMPLE_TPS6594_PFSM
+ bool "Build example program working with TPS6594 PFSM driver"
+ depends on HEADERS_INSTALL
+ depends on CC_CAN_LINK
+ help
+ Build a sample program to work with PFSM devices.
+
+config SAMPLE_WATCHDOG
+ bool "watchdog sample"
+ depends on CC_CAN_LINK
+
+config SAMPLE_WATCH_QUEUE
+ bool "Build example watch_queue notification API consumer"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build example userspace program to use the new mount_notify(),
+ sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function.
+
+config SAMPLE_CORESIGHT_SYSCFG
+ tristate "Build example loadable module for CoreSight config"
+ depends on CORESIGHT && m
+ help
+ Build an example loadable module that adds new CoreSight features
+ and configuration using the CoreSight system configuration API.
+ This demonstrates how a user may create their own CoreSight
+ configurations and easily load them into the system at runtime.
+
+config SAMPLE_KMEMLEAK
+ tristate "Simple test for the kernel memory leak detector"
+ depends on DEBUG_KMEMLEAK && m
+ help
+ Build a sample program which explicitly leaks memory to test
+ kmemleak.
+
+config SAMPLE_CGROUP
+ bool "Build cgroup sample code"
+ depends on CGROUPS && CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build samples that demonstrate the usage of the cgroup API.
+
+config SAMPLE_CHECK_EXEC
+ bool "Exec secure bits examples"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build a tool to easily configure SECBIT_EXEC_RESTRICT_FILE and
+ SECBIT_EXEC_DENY_INTERACTIVE, and a simple script interpreter to
+ demonstrate how they should be used with execveat(2) +
+ AT_EXECVE_CHECK.
+
+config SAMPLE_HUNG_TASK
+ tristate "Hung task detector test code"
+ depends on DETECT_HUNG_TASK && DEBUG_FS
+ help
+ Build a module that provides debugfs files (e.g., mutex, semaphore,
+ rw_semaphore_read, rw_semaphore_write) under <debugfs>/hung_task.
+ Reading these files with multiple processes triggers hung task
+ detection by holding locks for a long time (256 seconds).
+
+source "samples/rust/Kconfig"
+
+source "samples/damon/Kconfig"
+
endif # SAMPLES
+
+config HAVE_SAMPLE_FTRACE_DIRECT
+ bool
+
+config HAVE_SAMPLE_FTRACE_DIRECT_MULTI
+ bool
diff --git a/samples/Makefile b/samples/Makefile
index 1a60c62e2045..07641e177bd8 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,4 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0
# Makefile for Linux samples code
-obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ \
- hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/
+subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay
+subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs
+subdir-$(CONFIG_SAMPLE_CHECK_EXEC) += check-exec
+subdir-$(CONFIG_SAMPLE_CGROUP) += cgroup
+obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/
+obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/
+obj-$(CONFIG_SAMPLE_FANOTIFY_ERROR) += fanotify/
+subdir-$(CONFIG_SAMPLE_HIDRAW) += hidraw
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += hw_breakpoint/
+obj-$(CONFIG_SAMPLE_KDB) += kdb/
+obj-$(CONFIG_SAMPLE_KFIFO) += kfifo/
+obj-$(CONFIG_SAMPLE_KOBJECT) += kobject/
+obj-$(CONFIG_SAMPLE_KPROBES) += kprobes/
+subdir-$(CONFIG_SAMPLE_LANDLOCK) += landlock
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch/
+subdir-$(CONFIG_SAMPLE_PIDFD) += pidfd
+obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi/
+obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/
+subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp
+subdir-$(CONFIG_SAMPLE_TIMER) += timers
+obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/
+obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_events/
+obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace/
+obj-$(CONFIG_SAMPLE_FTRACE_OPS) += ftrace/
+obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/
+subdir-$(CONFIG_SAMPLE_UHID) += uhid
+obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/
+obj-y += vfio-mdev/
+subdir-$(CONFIG_SAMPLE_VFS) += vfs
+obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/
+obj-$(CONFIG_SAMPLE_TPS6594_PFSM) += pfsm/
+subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog
+subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue
+obj-$(CONFIG_SAMPLE_KMEMLEAK) += kmemleak/
+obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight/
+obj-$(CONFIG_SAMPLE_FPROBE) += fprobe/
+obj-$(CONFIG_SAMPLES_RUST) += rust/
+obj-$(CONFIG_SAMPLE_DAMON_WSSE) += damon/
+obj-$(CONFIG_SAMPLE_DAMON_PRCL) += damon/
+obj-$(CONFIG_SAMPLE_DAMON_MTIER) += damon/
+obj-$(CONFIG_SAMPLE_HUNG_TASK) += hung_task/
+obj-$(CONFIG_SAMPLE_TSM_MR) += tsm-mr/
diff --git a/samples/acrn/Makefile b/samples/acrn/Makefile
new file mode 100644
index 000000000000..c8e3ed9785e9
--- /dev/null
+++ b/samples/acrn/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# vm-sample stays phony (as before) so it is relinked on every invocation;
+# clean is phony so a stray file named "clean" cannot mask the rule.
+.PHONY: vm-sample clean
+
+# Link the host-side launcher together with the wrapped 16-bit guest payload.
+vm-sample: vm-sample.o payload.o
+	$(CC) $^ -o $@
+
+# Wrap guest16.o using the payload.ld linker script (the first prerequisite).
+payload.o: payload.ld guest16.o
+	$(LD) -T $< -o $@
+
+# -f: do not fail when there is nothing to remove.
+clean:
+	rm -f *.o vm-sample
diff --git a/samples/acrn/guest.ld b/samples/acrn/guest.ld
new file mode 100644
index 000000000000..5127c682bd22
--- /dev/null
+++ b/samples/acrn/guest.ld
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+OUTPUT_FORMAT(binary)
+SECTIONS
+{
+ .start : { *(.start) }
+ .text : { *(.text*) }
+ .rodata : { *(.rodata) }
+ .data : { *(.data) }
+}
diff --git a/samples/acrn/payload.ld b/samples/acrn/payload.ld
new file mode 100644
index 000000000000..e8d9a498ad62
--- /dev/null
+++ b/samples/acrn/payload.ld
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+SECTIONS
+{
+ .payload16 0 : {
+ guest16 = .;
+ guest16.o(.text)
+ guest16_end = .;
+ }
+}
diff --git a/samples/acrn/vm-sample.c b/samples/acrn/vm-sample.c
new file mode 100644
index 000000000000..c61e0f91456e
--- /dev/null
+++ b/samples/acrn/vm-sample.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A sample program to run a User VM on the ACRN hypervisor
+ *
+ * This sample runs in a Service VM, which is a privileged VM of ACRN.
+ * CONFIG_ACRN_HSM needs to be enabled in the Service VM.
+ *
+ * Guest VM code in guest16.s will be executed after the VM is launched.
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <linux/acrn.h>
+
+#define GUEST_MEMORY_SIZE (1024*1024)
+void *guest_memory;
+
+extern const unsigned char guest16[], guest16_end[];
+static char io_request_page[4096] __attribute__((aligned(4096)));
+static struct acrn_io_request *io_req_buf = (struct acrn_io_request *)io_request_page;
+
+__u16 vcpu_num;
+__u16 vmid;
+
+int hsm_fd;
+int is_running = 1;
+
+/*
+ * Signal handler used to stop the sample.
+ *
+ * Clears is_running, then asks the HSM device to pause the VM identified
+ * by vmid and to destroy the I/O request client.  The ioctl results are
+ * not checked.
+ */
+void vm_exit(int sig)
+{
+	(void)sig;	/* signal number is not needed */
+
+	is_running = 0;
+	ioctl(hsm_fd, ACRN_IOCTL_PAUSE_VM, vmid);
+	ioctl(hsm_fd, ACRN_IOCTL_DESTROY_IOREQ_CLIENT, 0);
+}
+
+/*
+ * Launch a User VM that runs the guest16 payload and service its port I/O.
+ *
+ * Steps: allocate GUEST_MEMORY_SIZE bytes of 4096-byte aligned guest memory,
+ * open /dev/acrn_hsm, create the VM, map the memory at guest physical
+ * address 0, copy the payload in, program the registers of vCPU 0, create
+ * an I/O request client and start the VM.  The loop then reports every
+ * pending port I/O request and marks it finished until vm_exit() clears
+ * is_running; afterwards the VM is destroyed.
+ *
+ * Returns 0 after the VM has been destroyed, or -1 if the guest memory
+ * cannot be allocated or the HSM device cannot be opened.  Failures of the
+ * individual ioctls are only reported through the printed return codes.
+ */
+int main(int argc, char **argv)
+{
+	int vcpu_id, ret;
+	struct acrn_vm_creation create_vm = {0};
+	struct acrn_vm_memmap ram_map = {0};
+	struct acrn_vcpu_regs regs;
+	struct acrn_io_request *io_req;
+	struct acrn_ioreq_notify __attribute__((aligned(8))) notify;
+
+	/* The command line is not used. */
+	(void)argc;
+	(void)argv;
+
+	/*
+	 * posix_memalign() returns 0 on success and a positive error number
+	 * on failure, so the result must be compared against 0, not "< 0".
+	 */
+	ret = posix_memalign(&guest_memory, 4096, GUEST_MEMORY_SIZE);
+	if (ret != 0) {
+		printf("Not enough memory!\n");
+		return -1;
+	}
+
+	hsm_fd = open("/dev/acrn_hsm", O_RDWR|O_CLOEXEC);
+	if (hsm_fd < 0) {
+		/* Without the device none of the ioctls below can work. */
+		perror("Failed to open /dev/acrn_hsm");
+		free(guest_memory);
+		return -1;
+	}
+
+	create_vm.ioreq_buf = (__u64)io_req_buf;
+	ret = ioctl(hsm_fd, ACRN_IOCTL_CREATE_VM, &create_vm);
+	printf("Created VM! [%d]\n", ret);
+	vcpu_num = create_vm.vcpu_num;
+	vmid = create_vm.vmid;
+
+	/* setup guest memory */
+	ram_map.type = ACRN_MEMMAP_RAM;
+	ram_map.vma_base = (__u64)guest_memory;
+	ram_map.len = GUEST_MEMORY_SIZE;
+	ram_map.user_vm_pa = 0;
+	ram_map.attr = ACRN_MEM_ACCESS_RWX;
+	ret = ioctl(hsm_fd, ACRN_IOCTL_SET_MEMSEG, &ram_map);
+	printf("Set up VM memory! [%d]\n", ret);
+
+	/* Place the payload at the start of guest memory. */
+	memcpy(guest_memory, guest16, guest16_end-guest16);
+
+	/* setup vcpu registers */
+	memset(&regs, 0, sizeof(regs));
+	regs.vcpu_id = 0;
+	regs.vcpu_regs.rip = 0;
+
+	/* CR0_ET | CR0_NE */
+	regs.vcpu_regs.cr0 = 0x30U;
+	regs.vcpu_regs.cs_ar = 0x009FU;
+	regs.vcpu_regs.cs_sel = 0xF000U;
+	regs.vcpu_regs.cs_limit = 0xFFFFU;
+	regs.vcpu_regs.cs_base = 0 & 0xFFFF0000UL;
+	regs.vcpu_regs.rip = 0 & 0xFFFFUL;
+
+	ret = ioctl(hsm_fd, ACRN_IOCTL_SET_VCPU_REGS, &regs);
+	printf("Set up VM BSP registers! [%d]\n", ret);
+
+	/* create an ioreq client for this VM */
+	ret = ioctl(hsm_fd, ACRN_IOCTL_CREATE_IOREQ_CLIENT, 0);
+	printf("Created IO request client! [%d]\n", ret);
+
+	/* run vm */
+	ret = ioctl(hsm_fd, ACRN_IOCTL_START_VM, vmid);
+	printf("Start VM! [%d]\n", ret);
+
+	signal(SIGINT, vm_exit);
+	while (is_running) {
+		ret = ioctl(hsm_fd, ACRN_IOCTL_ATTACH_IOREQ_CLIENT, 0);
+
+		for (vcpu_id = 0; vcpu_id < vcpu_num; vcpu_id++) {
+			io_req = &io_req_buf[vcpu_id];
+
+			/* Adding 0 atomically reads the current request state. */
+			if ((__sync_add_and_fetch(&io_req->processed, 0) == ACRN_IOREQ_STATE_PROCESSING)
+			    && (!io_req->kernel_handled)) {
+				if (io_req->type == ACRN_IOREQ_TYPE_PORTIO) {
+					int bytes, port, in;
+
+					port = io_req->reqs.pio_request.address;
+					bytes = io_req->reqs.pio_request.size;
+					in = (io_req->reqs.pio_request.direction == ACRN_IOREQ_DIR_READ);
+					printf("Guest VM %s PIO[%x] with size[%x]\n",
+					       in ? "read" : "write", port, bytes);
+
+					notify.vmid = vmid;
+					notify.vcpu = vcpu_id;
+					ioctl(hsm_fd, ACRN_IOCTL_NOTIFY_REQUEST_FINISH, &notify);
+				}
+			}
+		}
+	}
+
+	ret = ioctl(hsm_fd, ACRN_IOCTL_DESTROY_VM, NULL);
+	printf("Destroy VM! [%d]\n", ret);
+	close(hsm_fd);
+	free(guest_memory);
+	return 0;
+}
diff --git a/samples/auxdisplay/.gitignore b/samples/auxdisplay/.gitignore
new file mode 100644
index 000000000000..d023816849bd
--- /dev/null
+++ b/samples/auxdisplay/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/cfag12864b-example
diff --git a/samples/auxdisplay/Makefile b/samples/auxdisplay/Makefile
new file mode 100644
index 000000000000..19d5568938c3
--- /dev/null
+++ b/samples/auxdisplay/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += cfag12864b-example
diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c
new file mode 100644
index 000000000000..2e3bb7375c99
--- /dev/null
+++ b/samples/auxdisplay/cfag12864b-example.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Filename: cfag12864b-example.c
+ * Version: 0.1.0
+ * Description: cfag12864b LCD userspace example program
+ *
+ * Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
+ * Date: 2006-10-31
+ */
+
+/*
+ * ------------------------
+ * start of cfag12864b code
+ * ------------------------
+ */
+
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+/* Display geometry, in pixels. */
+#define CFAG12864B_WIDTH (128)
+#define CFAG12864B_HEIGHT (64)
+/* Size of the packed frame in bytes: 128 * 64 pixels at one bit each. */
+#define CFAG12864B_SIZE (128 * 64 / 8)
+/* Number of pixels (bits) stored in each buffer byte. */
+#define CFAG12864B_BPB (8)
+/* Index into the packed buffer of the byte that holds pixel (x, y). */
+#define CFAG12864B_ADDRESS(x, y) ((y) * CFAG12864B_WIDTH / \
+ CFAG12864B_BPB + (x) / CFAG12864B_BPB)
+/* Mask with only bit n set. */
+#define CFAG12864B_BIT(n) (((unsigned char) 1) << (n))
+
+/*
+ * CFAG12864B_DOCHECK is explicitly undefined right here, so the #else
+ * branch is taken and CFAG12864B_CHECK() always evaluates to 1: the pixel
+ * helpers perform no coordinate validation. Define CFAG12864B_DOCHECK
+ * instead of undefining it to compile the bounds test in.
+ */
+#undef CFAG12864B_DOCHECK
+#ifdef CFAG12864B_DOCHECK
+ #define CFAG12864B_CHECK(x, y) ((x) < CFAG12864B_WIDTH && \
+ (y) < CFAG12864B_HEIGHT)
+#else
+ #define CFAG12864B_CHECK(x, y) (1)
+#endif
+
+/* Descriptor returned by open() in cfag12864b_init(). */
+int cfag12864b_fd;
+/* Device mapping returned by mmap() in cfag12864b_init(). */
+unsigned char * cfag12864b_mem;
+/* Local packed frame; cfag12864b_blit() copies it to cfag12864b_mem. */
+unsigned char cfag12864b_buffer[CFAG12864B_SIZE];
+
+/*
+ * Open the framebuffer device at @path and map it into memory.
+ *
+ * On success cfag12864b_fd holds the open descriptor and cfag12864b_mem
+ * points at a shared, read/write mapping of CFAG12864B_SIZE bytes.
+ *
+ * Returns:
+ *    0  success
+ *   -1  open() failed
+ *   -2  mmap() failed (the descriptor is closed again before returning)
+ */
+static int cfag12864b_init(char *path)
+{
+	cfag12864b_fd = open(path, O_RDWR);
+	if (cfag12864b_fd != -1) {
+		cfag12864b_mem = mmap(NULL, CFAG12864B_SIZE,
+				      PROT_READ | PROT_WRITE, MAP_SHARED,
+				      cfag12864b_fd, 0);
+		if (cfag12864b_mem != MAP_FAILED)
+			return 0;
+
+		/* Mapping failed: do not leak the descriptor. */
+		close(cfag12864b_fd);
+		return -2;
+	}
+
+	return -1;
+}
+
+/*
+ * exit a cfag12864b framebuffer device
+ *
+ * Unmaps the CFAG12864B_SIZE bytes at cfag12864b_mem and then closes
+ * cfag12864b_fd. The return values of munmap() and close() are ignored.
+ */
+static void cfag12864b_exit(void)
+{
+ munmap(cfag12864b_mem, CFAG12864B_SIZE);
+ close(cfag12864b_fd);
+}
+
+/*
+ * Turn pixel (x, y) on in the local buffer.
+ *
+ * Only cfag12864b_buffer is touched. Nothing happens when
+ * CFAG12864B_CHECK() rejects the coordinates.
+ */
+static void cfag12864b_set(unsigned char x, unsigned char y)
+{
+	if (!CFAG12864B_CHECK(x, y))
+		return;
+
+	cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] |=
+		CFAG12864B_BIT(x % CFAG12864B_BPB);
+}
+
+/*
+ * Turn pixel (x, y) off in the local buffer.
+ *
+ * Only cfag12864b_buffer is touched. Nothing happens when
+ * CFAG12864B_CHECK() rejects the coordinates.
+ */
+static void cfag12864b_unset(unsigned char x, unsigned char y)
+{
+	if (!CFAG12864B_CHECK(x, y))
+		return;
+
+	cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &=
+		~CFAG12864B_BIT(x % CFAG12864B_BPB);
+}
+
+/*
+ * Report whether pixel (x, y) is on in the local buffer.
+ *
+ * Returns 1 when the pixel's bit is set, 0 when it is clear or when
+ * CFAG12864B_CHECK() rejects the coordinates.
+ */
+static unsigned char cfag12864b_isset(unsigned char x, unsigned char y)
+{
+	if (!CFAG12864B_CHECK(x, y))
+		return 0;
+
+	return (cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &
+		CFAG12864B_BIT(x % CFAG12864B_BPB)) ? 1 : 0;
+}
+
+/*
+ * Invert pixel (x, y) in the local buffer: on becomes off, off becomes on.
+ *
+ * Nothing happens when CFAG12864B_CHECK() rejects the coordinates.
+ */
+static void cfag12864b_not(unsigned char x, unsigned char y)
+{
+	/* Flipping the pixel's bit is the same as "unset if set, else set". */
+	if (CFAG12864B_CHECK(x, y))
+		cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] ^=
+			CFAG12864B_BIT(x % CFAG12864B_BPB);
+}
+
+/*
+ * Turn every pixel on: each byte of the local buffer becomes 0xFF.
+ */
+static void cfag12864b_fill(void)
+{
+	memset(cfag12864b_buffer, 0xFF, CFAG12864B_SIZE);
+}
+
+/*
+ * Turn every pixel off: each byte of the local buffer becomes 0.
+ */
+static void cfag12864b_clear(void)
+{
+	memset(cfag12864b_buffer, 0, CFAG12864B_SIZE);
+}
+
+/*
+ * Pack a one-byte-per-pixel matrix into the local one-bit-per-pixel buffer.
+ *
+ * @matrix holds CFAG12864B_WIDTH * CFAG12864B_HEIGHT entries, row after
+ * row. An entry of 0 means "pixel off", any other value means "pixel on".
+ * Within each output byte, bit n carries the n-th pixel of its group of
+ * CFAG12864B_BPB pixels. Every byte of the buffer is overwritten.
+ */
+static void cfag12864b_format(unsigned char * matrix)
+{
+	const unsigned int bytes_per_row = CFAG12864B_WIDTH / CFAG12864B_BPB;
+	unsigned int row, col, bit;
+
+	for (row = 0; row < CFAG12864B_HEIGHT; row++) {
+		for (col = 0; col < bytes_per_row; col++) {
+			/* First of the pixels that fold into this byte. */
+			const unsigned char *src = matrix +
+				row * CFAG12864B_WIDTH + col * CFAG12864B_BPB;
+			unsigned char packed = 0;
+
+			for (bit = 0; bit < CFAG12864B_BPB; bit++) {
+				if (src[bit])
+					packed |= CFAG12864B_BIT(bit);
+			}
+
+			cfag12864b_buffer[row * bytes_per_row + col] = packed;
+		}
+	}
+}
+
+/*
+ * blit buffer to lcd
+ *
+ * Copies all CFAG12864B_SIZE bytes of cfag12864b_buffer to the memory at
+ * cfag12864b_mem, which must already point at a valid mapping (it is set
+ * up by cfag12864b_init()).
+ */
+static void cfag12864b_blit(void)
+{
+ memcpy(cfag12864b_mem, cfag12864b_buffer, CFAG12864B_SIZE);
+}
+
+/*
+ * ----------------------
+ * end of cfag12864b code
+ * ----------------------
+ */
+
+#include <stdio.h>
+
+#define EXAMPLES 6
+
+/*
+ * Draw demo number @n into the local buffer and print its description,
+ * followed by a " - [Press Enter]" prompt. Nothing is copied to the
+ * device here; the caller blits afterwards.
+ *
+ * Valid values of @n are 1..EXAMPLES. Anything else is ignored and prints
+ * nothing.
+ */
+static void example(unsigned char n)
+{
+	unsigned short i, j;
+	unsigned char matrix[CFAG12864B_WIDTH * CFAG12864B_HEIGHT];
+
+	/* Reject 0 as well as values past the last demo: no case handles them. */
+	if (n < 1 || n > EXAMPLES)
+		return;
+
+	printf("Example %i/%i - ", n, EXAMPLES);
+
+	switch (n) {
+	case 1:
+		/* Checkerboard-like grid: every other pixel of every other row. */
+		printf("Draw points setting bits");
+		cfag12864b_clear();
+		for (i = 0; i < CFAG12864B_WIDTH; i += 2)
+			for (j = 0; j < CFAG12864B_HEIGHT; j += 2)
+				cfag12864b_set(i, j);
+		break;
+
+	case 2:
+		printf("Clear the LCD");
+		cfag12864b_clear();
+		break;
+
+	case 3:
+		/* Light every even row through the byte-per-pixel matrix path. */
+		printf("Draw rows formatting a [128*64] matrix");
+		memset(matrix, 0, sizeof(matrix));
+		for (i = 0; i < CFAG12864B_WIDTH; i++)
+			for (j = 0; j < CFAG12864B_HEIGHT; j += 2)
+				matrix[j * CFAG12864B_WIDTH + i] = 1;
+		cfag12864b_format(matrix);
+		break;
+
+	case 4:
+		printf("Fill the lcd");
+		cfag12864b_fill();
+		break;
+
+	case 5:
+		/* Works on whatever the buffer currently holds. */
+		printf("Draw columns unsetting bits");
+		for (i = 0; i < CFAG12864B_WIDTH; i += 2)
+			for (j = 0; j < CFAG12864B_HEIGHT; j++)
+				cfag12864b_unset(i, j);
+		break;
+
+	case 6:
+		/* Works on whatever the buffer currently holds. */
+		printf("Do negative not-ing all bits");
+		for (i = 0; i < CFAG12864B_WIDTH; i++)
+			for (j = 0; j < CFAG12864B_HEIGHT; j++)
+				cfag12864b_not(i, j);
+		break;
+
+	default:
+		/* Unreachable: n was range-checked above. */
+		break;
+	}
+
+	puts(" - [Press Enter]");
+}
+
+/*
+ * Run every example in turn on the framebuffer device named by argv[1]:
+ * draw it, blit it to the device, then wait for the user to press Enter.
+ * If stdin reaches end-of-file (or fails) the remaining examples are
+ * skipped and the program shuts down normally.
+ *
+ * Returns 0 on success, -1 on wrong usage, -2 if the device cannot be
+ * initialised.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned char n;
+	int c;
+
+	if (argc != 2) {
+		printf(
+			"Syntax: %s fbdev\n"
+			"Usually: /dev/fb0, /dev/fb1...\n", argv[0]);
+		return -1;
+	}
+
+	if (cfag12864b_init(argv[1])) {
+		printf("Can't init %s fbdev\n", argv[1]);
+		return -2;
+	}
+
+	for (n = 1; n <= EXAMPLES; n++) {
+		example(n);
+		cfag12864b_blit();
+
+		/*
+		 * Wait for Enter. getchar() keeps returning EOF once stdin is
+		 * exhausted, so testing only for '\n' would spin forever.
+		 */
+		do {
+			c = getchar();
+		} while (c != '\n' && c != EOF);
+
+		if (c == EOF)
+			break;
+	}
+
+	cfag12864b_exit();
+
+	return 0;
+}
diff --git a/samples/binderfs/.gitignore b/samples/binderfs/.gitignore
new file mode 100644
index 000000000000..8fa415a3640b
--- /dev/null
+++ b/samples/binderfs/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+/binderfs_example
diff --git a/samples/binderfs/Makefile b/samples/binderfs/Makefile
new file mode 100644
index 000000000000..629e43b9b129
--- /dev/null
+++ b/samples/binderfs/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+userprogs-always-y += binderfs_example
+
+userccflags += -I usr/include
diff --git a/samples/binderfs/binderfs_example.c b/samples/binderfs/binderfs_example.c
new file mode 100644
index 000000000000..0fd92cdda460
--- /dev/null
+++ b/samples/binderfs/binderfs_example.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <linux/android/binder.h>
+#include <linux/android/binderfs.h>
+
+/*
+ * Walk through the life cycle of a binderfs device: mount a filesystem of
+ * type "binder" on /dev/binderfs, ask its binder-control node to allocate
+ * a device named "my-binder", print the result, and delete the device
+ * again. The first failing step prints strerror(errno) plus a message to
+ * stderr and exits with EXIT_FAILURE.
+ */
+int main(int argc, char *argv[])
+{
+ int fd, ret, saved_errno;
+ struct binderfs_device device = { 0 };
+
+ /* Move into a new mount namespace; the final comment relies on this. */
+ ret = unshare(CLONE_NEWNS);
+ if (ret < 0) {
+ fprintf(stderr, "%s - Failed to unshare mount namespace\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ /* Recursively mark / as private (MS_REC | MS_PRIVATE). */
+ ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
+ if (ret < 0) {
+ fprintf(stderr, "%s - Failed to mount / as private\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ /* A mountpoint that already exists (EEXIST) is not an error. */
+ ret = mkdir("/dev/binderfs", 0755);
+ if (ret < 0 && errno != EEXIST) {
+ fprintf(stderr, "%s - Failed to create binderfs mountpoint\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ ret = mount(NULL, "/dev/binderfs", "binder", 0, 0);
+ if (ret < 0) {
+ fprintf(stderr, "%s - Failed to mount binderfs\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+  * Only the characters are copied, not the terminating NUL; the string
+  * stays terminated because device was zero-initialised above.
+  * NOTE(review): assumes device.name is larger than the copied string;
+  * confirm against struct binderfs_device.
+  */
+ memcpy(device.name, "my-binder", strlen("my-binder"));
+
+ fd = open("/dev/binderfs/binder-control", O_RDONLY | O_CLOEXEC);
+ if (fd < 0) {
+ fprintf(stderr, "%s - Failed to open binder-control device\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ ret = ioctl(fd, BINDER_CTL_ADD, &device);
+ /* close() may overwrite errno, so keep the ioctl's value around it. */
+ saved_errno = errno;
+ close(fd);
+ errno = saved_errno;
+ if (ret < 0) {
+ fprintf(stderr, "%s - Failed to allocate new binder device\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+  * major and minor are never set by this program; presumably the ioctl
+  * fills them in (to be confirmed against the binderfs documentation).
+  */
+ printf("Allocated new binder device with major %d, minor %d, and name %s\n",
+ device.major, device.minor, device.name);
+
+ /* Delete the device node that was just allocated. */
+ ret = unlink("/dev/binderfs/my-binder");
+ if (ret < 0) {
+ fprintf(stderr, "%s - Failed to delete binder device\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ /* Cleanup happens when the mount namespace dies. */
+ exit(EXIT_SUCCESS);
+}
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore
new file mode 100644
index 000000000000..0002cd359fb1
--- /dev/null
+++ b/samples/bpf/.gitignore
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-2.0-only
+cpustat
+fds_example
+hbm
+ibumad
+lathist
+lwt_len_hist
+map_perf_test
+offwaketime
+per_socket_stats_example
+sampleip
+sock_example
+sockex1
+sockex2
+sockex3
+spintest
+syscall_nrs.h
+syscall_tp
+task_fd_query
+tc_l2_redirect
+test_cgrp2_array_pin
+test_cgrp2_attach
+test_cgrp2_attach2
+test_cgrp2_sock
+test_cgrp2_sock2
+test_current_task_under_cgroup
+test_lru_dist
+test_map_in_map
+test_overhead
+test_probe_write_user
+trace_event
+trace_output
+tracex1
+tracex2
+tracex3
+tracex4
+tracex5
+tracex6
+tracex7
+xdp_adjust_tail
+xdp_fwd
+xdp_router_ipv4
+xdp_tx_iptunnel
+testfile.img
+hbm_out.log
+iperf.*
+*.out
+*.skel.h
+/vmlinux.h
+/bpftool/
+/libbpf/
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
new file mode 100644
index 000000000000..95a4fa1f1e44
--- /dev/null
+++ b/samples/bpf/Makefile
@@ -0,0 +1,392 @@
+# SPDX-License-Identifier: GPL-2.0
+
+BPF_SAMPLES_PATH ?= $(abspath $(src))
+TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools
+
+pound := \#
+
+# List of programs to build
+tprogs-y := test_lru_dist
+tprogs-y += sock_example
+tprogs-y += fds_example
+tprogs-y += sockex1
+tprogs-y += sockex2
+tprogs-y += sockex3
+tprogs-y += tracex1
+tprogs-y += tracex3
+tprogs-y += tracex4
+tprogs-y += tracex5
+tprogs-y += tracex6
+tprogs-y += trace_output
+tprogs-y += lathist
+tprogs-y += offwaketime
+tprogs-y += spintest
+tprogs-y += map_perf_test
+tprogs-y += xdp_router_ipv4
+tprogs-y += trace_event
+tprogs-y += sampleip
+tprogs-y += tc_l2_redirect
+tprogs-y += lwt_len_hist
+tprogs-y += xdp_tx_iptunnel
+tprogs-y += test_map_in_map
+tprogs-y += per_socket_stats_example
+tprogs-y += syscall_tp
+tprogs-y += cpustat
+tprogs-y += xdp_adjust_tail
+tprogs-y += xdp_fwd
+tprogs-y += task_fd_query
+tprogs-y += ibumad
+tprogs-y += hbm
+
+# Libbpf dependencies
+LIBBPF_SRC = $(TOOLS_PATH)/lib/bpf
+LIBBPF_OUTPUT = $(abspath $(BPF_SAMPLES_PATH))/libbpf
+LIBBPF_DESTDIR = $(LIBBPF_OUTPUT)
+LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
+LIBBPF = $(LIBBPF_OUTPUT)/libbpf.a
+
+CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
+TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
+XDP_SAMPLE := xdp_sample_user.o
+
+fds_example-objs := fds_example.o
+sockex1-objs := sockex1_user.o
+sockex2-objs := sockex2_user.o
+sockex3-objs := sockex3_user.o
+tracex1-objs := tracex1_user.o $(TRACE_HELPERS)
+tracex3-objs := tracex3_user.o
+tracex4-objs := tracex4_user.o
+tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
+tracex6-objs := tracex6_user.o
+trace_output-objs := trace_output_user.o
+lathist-objs := lathist_user.o
+offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := spintest_user.o $(TRACE_HELPERS)
+map_perf_test-objs := map_perf_test_user.o
+test_overhead-objs := test_overhead_user.o
+trace_event-objs := trace_event_user.o $(TRACE_HELPERS)
+sampleip-objs := sampleip_user.o $(TRACE_HELPERS)
+tc_l2_redirect-objs := tc_l2_redirect_user.o
+lwt_len_hist-objs := lwt_len_hist_user.o
+xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o
+test_map_in_map-objs := test_map_in_map_user.o
+per_socket_stats_example-objs := cookie_uid_helper_example.o
+syscall_tp-objs := syscall_tp_user.o
+cpustat-objs := cpustat_user.o
+xdp_adjust_tail-objs := xdp_adjust_tail_user.o
+xdp_fwd-objs := xdp_fwd_user.o
+task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS)
+ibumad-objs := ibumad_user.o
+hbm-objs := hbm.o $(CGROUP_HELPERS)
+
+xdp_router_ipv4-objs := xdp_router_ipv4_user.o $(XDP_SAMPLE)
+
+# Tell kbuild to always build the programs
+always-y := $(tprogs-y)
+always-y += sockex1_kern.o
+always-y += sockex2_kern.o
+always-y += sockex3_kern.o
+always-y += tracex1.bpf.o
+always-y += tracex3.bpf.o
+always-y += tracex4.bpf.o
+always-y += tracex5.bpf.o
+always-y += tracex6.bpf.o
+always-y += trace_output.bpf.o
+always-y += tcbpf1_kern.o
+always-y += tc_l2_redirect_kern.o
+always-y += lathist_kern.o
+always-y += offwaketime.bpf.o
+always-y += spintest.bpf.o
+always-y += map_perf_test.bpf.o
+always-y += parse_varlen.o parse_simple.o parse_ldabs.o
+always-y += trace_event_kern.o
+always-y += sampleip_kern.o
+always-y += lwt_len_hist.bpf.o
+always-y += xdp_tx_iptunnel_kern.o
+always-y += test_map_in_map.bpf.o
+always-y += tcp_synrto_kern.o
+always-y += tcp_rwnd_kern.o
+always-y += tcp_bufs_kern.o
+always-y += tcp_cong_kern.o
+always-y += tcp_iw_kern.o
+always-y += tcp_clamp_kern.o
+always-y += tcp_basertt_kern.o
+always-y += tcp_tos_reflect_kern.o
+always-y += tcp_dumpstats_kern.o
+always-y += xdp2skb_meta_kern.o
+always-y += syscall_tp_kern.o
+always-y += cpustat_kern.o
+always-y += xdp_adjust_tail_kern.o
+always-y += xdp_fwd_kern.o
+always-y += task_fd_query_kern.o
+always-y += ibumad_kern.o
+always-y += hbm_out_kern.o
+always-y += hbm_edt_kern.o
+
+COMMON_CFLAGS = $(TPROGS_USER_CFLAGS)
+TPROGS_LDFLAGS = $(TPROGS_USER_LDFLAGS)
+
+ifeq ($(ARCH), arm)
+# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux
+# headers when arm instruction set identification is requested.
+ARM_ARCH_SELECTOR := $(filter -D__LINUX_ARM_ARCH__%, $(KBUILD_CFLAGS))
+BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR)
+TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR)
+endif
+
+ifeq ($(ARCH), mips)
+TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__
+ifdef CONFIG_MACH_LOONGSON64
+BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-loongson64
+BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic
+endif
+endif
+
+ifeq ($(ARCH), x86)
+BPF_EXTRA_CFLAGS += -fcf-protection
+endif
+
+COMMON_CFLAGS += -Wall -O2
+COMMON_CFLAGS += -Wmissing-prototypes
+COMMON_CFLAGS += -Wstrict-prototypes
+COMMON_CFLAGS += $(call try-run,\
+ printf "int main() { return 0; }" |\
+ $(CC) -Werror -fsanitize=bounds -x c - -o "$$TMP",-fsanitize=bounds,)
+
+TPROGS_CFLAGS += $(COMMON_CFLAGS)
+TPROGS_CFLAGS += -I$(objtree)/usr/include
+TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
+TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE)
+TPROGS_CFLAGS += -I$(srctree)/tools/include
+TPROGS_CFLAGS += -I$(srctree)/tools/perf
+TPROGS_CFLAGS += -I$(srctree)/tools/lib
+TPROGS_CFLAGS += -DHAVE_ATTR_TEST=0
+
+ifdef SYSROOT
+COMMON_CFLAGS += --sysroot=$(SYSROOT)
+TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib
+endif
+
+TPROGS_LDLIBS += $(LIBBPF) -lelf -lz
+TPROGLDLIBS_xdp_router_ipv4 += -lm -pthread
+TPROGLDLIBS_tracex4 += -lrt
+TPROGLDLIBS_trace_output += -lrt
+TPROGLDLIBS_map_perf_test += -lrt
+
+# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
+# make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang
+LLC ?= llc
+CLANG ?= clang
+OPT ?= opt
+LLVM_DIS ?= llvm-dis
+LLVM_OBJCOPY ?= llvm-objcopy
+LLVM_READELF ?= llvm-readelf
+BTF_PAHOLE ?= pahole
+
+# Detect that we're cross compiling and use the cross compiler
+ifdef CROSS_COMPILE
+CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif
+
+# Don't evaluate probes and warnings if we need to run make recursively
+ifneq ($(src),)
+HDR_PROBE := $(shell printf "$(pound)include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \
+ $(CC) $(TPROGS_CFLAGS) $(TPROGS_LDFLAGS) -x c - \
+ -o /dev/null 2>/dev/null && echo okay)
+
+ifeq ($(HDR_PROBE),)
+$(warning WARNING: Detected possible issues with include path.)
+$(warning WARNING: Please install kernel headers locally (make headers_install).)
+endif
+
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
+BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \
+ $(CLANG) --target=bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \
+ $(LLVM_READELF) -S ./llvm_btf_verify.o | grep BTF; \
+ /bin/rm -f ./llvm_btf_verify.o)
+
+BPF_EXTRA_CFLAGS += -fno-stack-protector
+ifneq ($(BTF_LLVM_PROBE),)
+ BPF_EXTRA_CFLAGS += -g
+else
+ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
+ BPF_EXTRA_CFLAGS += -g
+ LLC_FLAGS += -mattr=dwarfris
+ DWARF2BTF = y
+endif
+endif
+endif
+
+# Trick to allow make to be run from this directory
+all:
+ $(MAKE) -C ../../ M=$(CURDIR) BPF_SAMPLES_PATH=$(CURDIR)
+
+clean:
+ $(MAKE) -C ../../ M=$(CURDIR) clean
+ @find $(CURDIR) -type f -name '*~' -delete
+ @$(RM) -r $(CURDIR)/libbpf $(CURDIR)/bpftool
+
+$(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
+# Fix up variables inherited from Kbuild that tools/ build system won't like
+ $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \
+ LDFLAGS="$(TPROGS_LDFLAGS)" srctree=$(BPF_SAMPLES_PATH)/../../ \
+ O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \
+ $@ install_headers
+
+BPFTOOLDIR := $(TOOLS_PATH)/bpf/bpftool
+BPFTOOL_OUTPUT := $(abspath $(BPF_SAMPLES_PATH))/bpftool
+DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) | $(BPFTOOL_OUTPUT)
+ $(MAKE) -C $(BPFTOOLDIR) srctree=$(BPF_SAMPLES_PATH)/../../ \
+ OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
+
+$(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT):
+ $(call msg,MKDIR,$@)
+ $(Q)mkdir -p $@
+
+$(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE
+ $(call filechk,offsets,__SYSCALL_NRS_H__)
+
+targets += syscall_nrs.s
+clean-files += syscall_nrs.h
+
+FORCE:
+
+
+# Verify LLVM compiler tools are available and bpf target is supported by llc
+.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC)
+
+verify_cmds: $(CLANG) $(LLC)
+ @for TOOL in $^ ; do \
+ if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \
+ echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\
+ exit 1; \
+ else true; fi; \
+ done
+
+verify_target_bpf: verify_cmds
+ @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \
+ echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\
+ echo " NOTICE: LLVM version >= 3.7.1 required" ;\
+ exit 2; \
+ else true; fi
+
+$(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
+$(src)/*.c: verify_target_bpf $(LIBBPF)
+
+libbpf_hdrs: $(LIBBPF)
+$(obj)/$(TRACE_HELPERS) $(obj)/$(CGROUP_HELPERS) $(obj)/$(XDP_SAMPLE): | libbpf_hdrs
+
+.PHONY: libbpf_hdrs
+
+$(obj)/xdp_router_ipv4_user.o: $(obj)/xdp_router_ipv4.skel.h
+
+$(obj)/tracex5.bpf.o: $(obj)/syscall_nrs.h
+$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
+$(obj)/hbm.o: $(src)/hbm.h
+$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
+
+# Override includes for xdp_sample_user.o because $(srctree)/usr/include in
+# TPROGS_CFLAGS causes conflicts
+XDP_SAMPLE_CFLAGS += -Wall -O2 \
+ -I$(src)/../../tools/include \
+ -I$(src)/../../tools/include/uapi \
+ -I$(LIBBPF_INCLUDE) \
+ -I$(src)/../../tools/testing/selftests/bpf
+
+$(obj)/$(XDP_SAMPLE): TPROGS_CFLAGS = $(XDP_SAMPLE_CFLAGS) $(TPROGS_USER_CFLAGS)
+$(obj)/$(XDP_SAMPLE): $(src)/xdp_sample_user.h $(src)/xdp_sample_shared.h
+# Override includes for trace_helpers.o because __must_check won't be defined
+# in our include path.
+$(obj)/$(TRACE_HELPERS): TPROGS_CFLAGS := $(TPROGS_CFLAGS) -D__must_check=
+
+-include $(BPF_SAMPLES_PATH)/Makefile.target
+
+VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \
+ $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \
+ $(abspath $(objtree)/vmlinux)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+
+$(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
+ifeq ($(VMLINUX_H),)
+ifeq ($(VMLINUX_BTF),)
+ $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)",\
+ build the kernel or set VMLINUX_BTF like "VMLINUX_BTF=/sys/kernel/btf/vmlinux" or VMLINUX_H variable)
+endif
+ $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
+else
+ $(Q)cp "$(VMLINUX_H)" $@
+endif
+
+clean-files += vmlinux.h
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '--target=bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) -v -E - </dev/null 2>&1 \
+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/')
+endef
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG))
+
+$(obj)/xdp_router_ipv4.bpf.o: $(obj)/xdp_sample.bpf.o
+
+$(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/xdp_sample_shared.h
+ @echo " CLANG-BPF " $@
+ $(Q)$(CLANG) -g -O2 --target=bpf -D__TARGET_ARCH_$(SRCARCH) \
+ -Wno-compare-distinct-pointer-types -I$(srctree)/include \
+ -I$(srctree)/samples/bpf -I$(srctree)/tools/include \
+ -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \
+ -c $(filter %.bpf.c,$^) -o $@
+
+LINKED_SKELS := xdp_router_ipv4.skel.h
+clean-files += $(LINKED_SKELS)
+
+xdp_router_ipv4.skel.h-deps := xdp_router_ipv4.bpf.o xdp_sample.bpf.o
+
+LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
+
+BPF_SRCS_LINKED := $(notdir $(wildcard $(src)/*.bpf.c))
+BPF_OBJS_LINKED := $(patsubst %.bpf.c,$(obj)/%.bpf.o, $(BPF_SRCS_LINKED))
+BPF_SKELS_LINKED := $(addprefix $(obj)/,$(LINKED_SKELS))
+
+$(BPF_SKELS_LINKED): $(BPF_OBJS_LINKED) $(BPFTOOL)
+ @echo " BPF GEN-OBJ " $(@:.skel.h=)
+ $(Q)$(BPFTOOL) gen object $(@:.skel.h=.lbpf.o) $(addprefix $(obj)/,$($(@F)-deps))
+ @echo " BPF GEN-SKEL" $(@:.skel.h=)
+ $(Q)$(BPFTOOL) gen skeleton $(@:.skel.h=.lbpf.o) name $(notdir $(@:.skel.h=)) > $@
+
+# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
+# But, there is no easy way to fix it, so just exclude it since it is
+# useless for BPF samples.
+# below we use long chain of commands, clang | opt | llvm-dis | llc,
+# to generate final object file. 'clang' compiles the source into IR
+# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin
+# processing (llvm12) and IR optimizations. 'llvm-dis' converts
+# 'opt' output to IR, and finally 'llc' generates bpf byte code.
+$(obj)/%.o: $(src)/%.c
+ @echo " CLANG-bpf " $@
+ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \
+ -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \
+ -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \
+ -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
+ -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \
+ -Wno-gnu-variable-sized-type-not-at-end \
+ -Wno-address-of-packed-member -Wno-tautological-compare \
+ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
+ -fno-asynchronous-unwind-tables \
+ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
+ -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \
+ $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \
+ $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
+ifeq ($(DWARF2BTF),y)
+ $(BTF_PAHOLE) -J $@
+endif
diff --git a/samples/bpf/Makefile.target b/samples/bpf/Makefile.target
new file mode 100644
index 000000000000..7621f55e2947
--- /dev/null
+++ b/samples/bpf/Makefile.target
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0
+# ==========================================================================
+# Building binaries on the host system
+# Binaries are not used during the compilation of the kernel and are intended
+# to be built for the target board (which can of course be the host). Added to
+# build binaries that run on a system other than the host.
+#
+# Sample syntax
+# tprogs-y := xsk_example
+# Will compile xsk_example.c and create an executable named xsk_example
+#
+# tprogs-y := xdpsock
+# xdpsock-objs := xdpsock_1.o xdpsock_2.o
+# Will compile xdpsock_1.c and xdpsock_2.c, and then link the executable
+# xdpsock, based on xdpsock_1.o and xdpsock_2.o
+#
+# Derived from scripts/Makefile.host
+#
+__tprogs := $(sort $(tprogs-y))
+
+# C code
+# Executables compiled from a single .c file
+tprog-csingle := $(foreach m,$(__tprogs), \
+ $(if $($(m)-objs),,$(m)))
+
+# C executables linked based on several .o files
+tprog-cmulti := $(foreach m,$(__tprogs),\
+ $(if $($(m)-objs),$(m)))
+
+# Object (.o) files compiled from .c files
+tprog-cobjs := $(sort $(foreach m,$(__tprogs),$($(m)-objs)))
+
+tprog-csingle := $(addprefix $(obj)/,$(tprog-csingle))
+tprog-cmulti := $(addprefix $(obj)/,$(tprog-cmulti))
+tprog-cobjs := $(addprefix $(obj)/,$(tprog-cobjs))
+
+#####
+# Handle options to gcc. Support building with separate output directory
+
+_tprogc_flags = $(TPROGS_CFLAGS) \
+ $(TPROGCFLAGS_$(basetarget).o)
+
+# $(objtree)/$(obj) for including generated headers from checkin source files
+ifeq ($(KBUILD_EXTMOD),)
+ifdef building_out_of_srctree
+_tprogc_flags += -I $(objtree)/$(obj)
+endif
+endif
+
+tprogc_flags = -Wp,-MD,$(depfile) $(_tprogc_flags)
+
+# Create executable from a single .c file
+# tprog-csingle -> Executable
+quiet_cmd_tprog-csingle = CC $@
+ cmd_tprog-csingle = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ $< \
+ $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F))
+$(tprog-csingle): $(obj)/%: $(src)/%.c FORCE
+ $(call if_changed_dep,tprog-csingle)
+
+# Link an executable based on list of .o files, all plain c
+# tprog-cmulti -> executable
+quiet_cmd_tprog-cmulti = LD $@
+ cmd_tprog-cmulti = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ \
+ $(addprefix $(obj)/,$($(@F)-objs)) \
+ $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F))
+$(tprog-cmulti): $(tprog-cobjs) FORCE
+ $(call if_changed,tprog-cmulti)
+$(call multi_depend, $(tprog-cmulti), , -objs)
+
+# Create .o file from a single .c file
+# tprog-cobjs -> .o
+quiet_cmd_tprog-cobjs = CC $@
+ cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $<
+$(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE
+ $(call if_changed_dep,tprog-cobjs)
diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst
new file mode 100644
index 000000000000..cabe2d216997
--- /dev/null
+++ b/samples/bpf/README.rst
@@ -0,0 +1,121 @@
+eBPF sample programs
+====================
+
+This directory contains test stubs, a verifier test-suite and examples
+for using eBPF. The examples use libbpf from tools/lib/bpf.
+
+Note that the XDP-specific samples have been removed from this directory and
+moved to the xdp-tools repository: https://github.com/xdp-project/xdp-tools
+See the commit messages removing each tool from this directory for how to
+convert specific command invocations between the old samples and the utilities
+in xdp-tools.
+
+Build dependencies
+==================
+
+Compiling requires having installed:
+ * clang
+ * llvm
+ * pahole
+
+Consult :ref:`Documentation/process/changes.rst <changes>` for the minimum
+version numbers required and how to update them. Note that LLVM's tool
+'llc' must support target 'bpf', list version and supported targets with
+command: ``llc --version``
+
+Clean and configuration
+-----------------------
+
+It may be necessary to clean the tools, samples or kernel before trying a new
+arch or after some changes (on demand)::
+
+ make -C tools clean
+ make -C samples/bpf clean
+ make clean
+
+Configure kernel, defconfig for instance
+(see "tools/testing/selftests/bpf/config" for a reference config)::
+
+ make defconfig
+
+Kernel headers
+--------------
+
+There are usually dependencies on header files of the current kernel.
+To avoid installing devel kernel headers system wide, as a normal
+user, simply call::
+
+ make headers_install
+
+This will create a local "usr/include" directory in the git/build top
+level directory, that the make system will automatically pick up first.
+
+Compiling
+=========
+
+For building the BPF samples, issue the below command from the kernel
+top level directory::
+
+ make M=samples/bpf
+
+It is also possible to call make from this directory. This will just
+hide the invocation of make as above.
+
+Manually compiling LLVM with 'bpf' support
+------------------------------------------
+
+Since version 3.7.0, LLVM adds a proper LLVM backend target for the
+BPF bytecode architecture.
+
+By default llvm will build all non-experimental backends including bpf.
+To generate a smaller llc binary one can use::
+
+ -DLLVM_TARGETS_TO_BUILD="BPF"
+
+We recommend that developers who want the fastest incremental builds
+use the Ninja build system, you can find it in your system's package
+manager, usually the package is ninja or ninja-build.
+
+Quick snippet for manually compiling LLVM and clang
+(build dependencies are ninja, cmake and gcc-c++)::
+
+ $ git clone https://github.com/llvm/llvm-project.git
+ $ mkdir -p llvm-project/llvm/build
+ $ cd llvm-project/llvm/build
+ $ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
+ -DLLVM_ENABLE_PROJECTS="clang" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_BUILD_RUNTIME=OFF
+ $ ninja
+
+It is also possible to point make to the newly compiled 'llc' or
+'clang' command via redefining LLC or CLANG on the make command line::
+
+ make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang
+
+Cross compiling samples
+-----------------------
+In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH
+environment variables before calling make. But do this before clean,
+configuration and header install steps described above. This will direct make to
+build samples for the cross target::
+
+ export ARCH=arm64
+ export CROSS_COMPILE="aarch64-linux-gnu-"
+
+Headers can also be installed on the RFS of the target board if they need to be
+kept in sync (this is optional, and it also creates a local "usr/include" directory)::
+
+ make INSTALL_HDR_PATH=~/some_sysroot/usr headers_install
+
+Pointing LLC and CLANG is not necessary if they are installed on the HOST and
+have the appropriate arm64 arch among their targets (usually they have several arches).
+Build samples::
+
+ make M=samples/bpf
+
+Or build samples with SYSROOT if some header or library (say libelf) is absent
+from the toolchain, providing the path to a file system containing the headers
+and libs, which can be the RFS of the target board::
+
+ make M=samples/bpf SYSROOT=~/some_sysroot
diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h
new file mode 100644
index 000000000000..634e81d83efd
--- /dev/null
+++ b/samples/bpf/asm_goto_workaround.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2019 Facebook */
+#ifndef __ASM_GOTO_WORKAROUND_H
+#define __ASM_GOTO_WORKAROUND_H
+
+/*
+ * This will bring in asm_goto_output and asm_inline macro definitions
+ * if enabled by compiler and config options.
+ */
+#include <linux/types.h>
+
+/*
+ * If the include above defined asm_goto_output, replace it: every use now
+ * expands to an asm volatile statement whose only operand is the marker
+ * string below, whatever arguments were passed.
+ */
+#ifdef asm_goto_output
+#undef asm_goto_output
+#define asm_goto_output(x...) asm volatile("invalid use of asm_goto_output")
+#endif
+
+/*
+ * asm_inline is defined as asm __inline in "include/linux/compiler_types.h"
+ * if supported by the kernel's CC (i.e CONFIG_CC_HAS_ASM_INLINE) which is not
+ * supported by CLANG.
+ */
+#ifdef asm_inline
+#undef asm_inline
+#define asm_inline asm
+#endif
+
+/*
+ * Function-like macro on the keyword itself: any later `volatile(...)`
+ * (as in `asm volatile(...)`) has its whole argument list replaced by an
+ * empty string literal. Uses of `volatile` not followed by '(' are not
+ * affected.
+ * NOTE(review): presumably this blanks inline-assembly bodies so that
+ * kernel headers still compile for the BPF samples; confirm.
+ */
+#define volatile(x...) volatile("")
+#endif
diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h
new file mode 100644
index 000000000000..29c3bb6ad1cd
--- /dev/null
+++ b/samples/bpf/bpf_insn.h
@@ -0,0 +1,233 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* eBPF instruction mini library */
+#ifndef __BPF_INSN_H
+#define __BPF_INSN_H
+
+struct bpf_insn;
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_MOV32_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_MOV32_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
+#define BPF_LD_IMM64(DST, IMM) \
+ BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+/*
+ * Expands to TWO comma-separated struct bpf_insn compound literals, so one
+ * use occupies two consecutive slots of an instruction array (jump offsets
+ * must count it twice):
+ *   - the first carries the opcode, DST, SRC and the low 32 bits of IMM;
+ *   - the second has opcode 0 and carries the high 32 bits of IMM.
+ * IMM is evaluated twice, so it must not have side effects.
+ */
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_DW | BPF_IMM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = (__u32) (IMM) }), \
+ ((struct bpf_insn) { \
+ .code = 0, /* zero is reserved opcode */ \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = ((__u64) (IMM)) >> 32 })
+
+#ifndef BPF_PSEUDO_MAP_FD
+# define BPF_PSEUDO_MAP_FD 1
+#endif
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/*
+ * Atomic operations:
+ *
+ * BPF_ADD *(uint *) (dst_reg + off16) += src_reg
+ * BPF_AND *(uint *) (dst_reg + off16) &= src_reg
+ * BPF_OR *(uint *) (dst_reg + off16) |= src_reg
+ * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg
+ * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
+ * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
+ * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
+ * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
+ * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg)
+ * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
+ */
+
+#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = OP })
+
+/* Legacy alias */
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */
+
+#define BPF_JMP32_REG(OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */
+
+#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = CODE, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN() \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_EXIT, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = 0 })
+
+#endif
diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c
new file mode 100644
index 000000000000..f0df3dda4b1f
--- /dev/null
+++ b/samples/bpf/cookie_uid_helper_example.c
@@ -0,0 +1,332 @@
+/* This test is a demo of using get_socket_uid and get_socket_cookie
+ * helper function to do per socket based network traffic monitoring.
+ * It requires iptables version higher than 1.6.1 to load pinned eBPF
+ * program into the xt_bpf match.
+ *
+ * TEST:
+ * ./run_cookie_uid_helper_example.sh -option
+ * option:
+ * -t: do traffic monitoring test, the program will continuously
+ * print out the network traffic that happens after the program started.
+ * A sample output is shown below:
+ *
+ * cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058
+ * cookie: 132, uid: 0x0, Packet Count: 2, Bytes Count: 286
+ * cookie: 812, uid: 0x3e8, Packet Count: 3, Bytes Count: 1726
+ * cookie: 802, uid: 0x3e8, Packet Count: 2, Bytes Count: 104
+ * cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058
+ * cookie: 831, uid: 0x3e8, Packet Count: 2, Bytes Count: 104
+ * cookie: 0, uid: 0x0, Packet Count: 6, Bytes Count: 712
+ * cookie: 880, uid: 0xfffe, Packet Count: 1, Bytes Count: 70
+ *
+ * -s: do getsockopt SO_COOKIE test, the program will set up a pair of
+ * UDP sockets and send packets between them. And read out the traffic data
+ * directly from the ebpf map based on the socket cookie.
+ *
+ * Clean up: if using the shell script, the script will delete the iptables
+ * rule and unmount the bpf program when it exits. Otherwise the iptables rule
+ * needs to be deleted by hand, see run_cookie_uid_helper_example.sh for detail.
+ */
+
+#define _GNU_SOURCE
+
+#define offsetof(type, member) __builtin_offsetof(type, member)
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <limits.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <net/if.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include "bpf_insn.h"
+
+#define PORT 8888
+
+struct stats {
+ uint32_t uid;
+ uint64_t packets;
+ uint64_t bytes;
+};
+
+static int map_fd, prog_fd;
+
+static bool test_finish;
+
+/*
+ * Create the hash map that holds one struct stats per key and publish its
+ * file descriptor in map_fd. Terminates the process if creation fails.
+ */
+static void maps_create(void)
+{
+	const uint32_t key_size = sizeof(uint32_t);
+	const uint32_t value_size = sizeof(struct stats);
+	const uint32_t max_entries = 100;
+	int fd;
+
+	fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, key_size, value_size,
+			    max_entries, NULL);
+	map_fd = fd;
+	if (fd >= 0)
+		return;
+
+	error(1, errno, "map create failed!\n");
+}
+
+/*
+ * Assemble the socket-filter program from raw instructions and load it,
+ * storing the resulting fd in prog_fd.
+ *
+ * The value of map_fd is embedded into the instruction stream, so the map
+ * must have been created before this function runs. For each packet the
+ * program looks up the socket cookie in that map: a missing entry is
+ * inserted with the socket uid, a packet count of 1 and the packet length;
+ * an existing entry has its packet and byte counters bumped with atomic
+ * adds. Both paths finish by returning skb->len in R0.
+ *
+ * The jump offsets below are hard-coded instruction counts; remember that
+ * each BPF_LD_MAP_FD expands to two instructions when editing the array.
+ *
+ * Exits the process if loading fails, printing the contents of log_buf.
+ */
+static void prog_load(void)
+{
+ static char log_buf[1 << 16];
+
+ struct bpf_insn prog[] = {
+ /*
+ * Save sk_buff for future usage. value stored in R6 to R10 will
+ * not be reset after a bpf helper function call.
+ */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ /*
+ * pc1: BPF_FUNC_get_socket_cookie takes one parameter,
+ * R1: sk_buff
+ */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_socket_cookie),
+ /* pc2-4: save &socketCookie to r7 for future usage*/
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ /*
+ * pc5-8: set up the registers for BPF_FUNC_map_lookup_elem,
+ * it takes two parameters (R1: map_fd, R2: &socket_cookie)
+ */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ /*
+ * pc9. if r0 != 0x0, go to pc+14, since we have the cookie
+ * stored already
+ * Otherwise do pc10-22 to setup a new data entry.
+ */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 14),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_socket_uid),
+ /*
+ * Place a struct stats in the R10 stack and sequentially
+ * place the member value into the memory. Packets value
+ * is set by directly place a IMM value 1 into the stack.
+ */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0,
+ -32 + (__s16)offsetof(struct stats, uid)),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10,
+ -32 + (__s16)offsetof(struct stats, packets), 1),
+ /*
+ * __sk_buff is a special struct used for eBPF program to
+ * directly access some sk_buff field.
+ */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+ offsetof(struct __sk_buff, len)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1,
+ -32 + (__s16)offsetof(struct stats, bytes)),
+ /*
+ * add new map entry using BPF_FUNC_map_update_elem, it takes
+ * 4 parameters (R1: map_fd, R2: &socket_cookie, R3: &stats,
+ * R4: flags)
+ */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -32),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_update_elem),
+ /* pc23: new entry done, skip the in-place update and land on pc29 */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 5),
+ /*
+ * pc24-30 update the packet info to a exist data entry, it can
+ * be done by directly write to pointers instead of using
+ * BPF_FUNC_map_update_elem helper function
+ */
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1,
+ offsetof(struct stats, packets)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+ offsetof(struct __sk_buff, len)),
+ BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1,
+ offsetof(struct stats, bytes)),
+ /* pc29-30: common exit for both paths, R0 = skb->len */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
+ offsetof(struct __sk_buff, len)),
+ BPF_EXIT_INSN(),
+ };
+ /* Ask the loader to fill log_buf so a failure can be reported below. */
+ LIBBPF_OPTS(bpf_prog_load_opts, opts,
+ .log_buf = log_buf,
+ .log_size = sizeof(log_buf),
+ );
+
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL",
+ prog, ARRAY_SIZE(prog), &opts);
+ if (prog_fd < 0)
+ error(1, errno, "failed to load prog\n%s\n", log_buf);
+}
+
+/*
+ * Pin the loaded program (prog_fd) at @file and install an iptables OUTPUT
+ * rule that matches through the pinned object.
+ *
+ * @file: path inside a mounted bpf filesystem; at most 50 characters so the
+ *        generated command always fits in the local buffer.
+ *
+ * Exits the process on any failure.
+ */
+static void prog_attach_iptables(char *file)
+{
+	int ret;
+	char rules[256];
+
+	/* Validate the path first so a rejected path leaves no pin behind. */
+	if (strlen(file) > 50) {
+		printf("file path too long: %s\n", file);
+		exit(1);
+	}
+	if (bpf_obj_pin(prog_fd, file))
+		error(1, errno, "bpf_obj_pin");
+
+	ret = snprintf(rules, sizeof(rules),
+		       "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT",
+		       file);
+	if (ret < 0 || (size_t)ret >= sizeof(rules)) {
+		printf("error constructing iptables command\n");
+		exit(1);
+	}
+
+	ret = system(rules);
+	/* -1 means the shell could not be started at all; errno says why. */
+	if (ret == -1)
+		error(1, errno, "failed to run iptables");
+	/* Any other non-zero wait status means iptables itself failed. */
+	if (ret != 0) {
+		printf("iptables rule update failed: %d\n", WEXITSTATUS(ret));
+		exit(1);
+	}
+}
+
+/*
+ * Walk every key currently in the map and print the statistics stored
+ * under it. Terminates the process if a key returned by the iteration
+ * cannot be looked up.
+ */
+static void print_table(void)
+{
+	uint32_t key = UINT32_MAX;	/* seed for the first get_next_key call */
+	uint32_t following;
+	struct stats entry;
+
+	for (;;) {
+		if (bpf_map_get_next_key(map_fd, &key, &following) <= -1)
+			break;
+
+		key = following;
+		if (bpf_map_lookup_elem(map_fd, &key, &entry) >= 0)
+			printf("cookie: %u, uid: 0x%x, Packet Count: %lu,"
+			       " Bytes Count: %lu\n", key, entry.uid,
+			       entry.packets, entry.bytes);
+		else
+			error(1, errno, "fail to get entry value of Key: %u\n",
+			      key);
+	}
+}
+
+/*
+ * getsockopt(SO_COOKIE) test: bind a UDP receiver on 127.0.0.1:PORT, send ten
+ * one-byte datagrams to it from a second socket, and after each one read the
+ * sender's statistics back from the map using the sender's socket cookie.
+ *
+ * Exits the process on the first failure.
+ */
+static void udp_client(void)
+{
+	struct sockaddr_in si_other = {0};
+	struct sockaddr_in si_me = {0};
+	struct stats dataEntry;
+	int s_rcv, s_send, i, recv_len;
+	char message = 'a';
+	char buf;
+	uint64_t cookie;
+	int res;
+	socklen_t cookie_len = sizeof(cookie);
+	socklen_t slen = sizeof(si_other);
+	socklen_t from_len;
+
+	s_rcv = socket(PF_INET, SOCK_DGRAM, 0);
+	if (s_rcv < 0)
+		error(1, errno, "rcv socket creat failed!\n");
+	si_other.sin_family = AF_INET;
+	si_other.sin_port = htons(PORT);
+	if (inet_aton("127.0.0.1", &si_other.sin_addr) == 0)
+		error(1, errno, "inet_aton\n");
+	if (bind(s_rcv, (struct sockaddr *)&si_other, sizeof(si_other)) == -1)
+		error(1, errno, "bind\n");
+	s_send = socket(PF_INET, SOCK_DGRAM, 0);
+	if (s_send < 0)
+		error(1, errno, "send socket creat failed!\n");
+
+	/* Without the cookie every later lookup would use garbage as its key. */
+	res = getsockopt(s_send, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len);
+	if (res < 0)
+		error(1, errno, "get cookie failed");
+
+	/*
+	 * Nothing has been sent yet, so the lookup must fail. Newer libbpf
+	 * reports failure as a negative errno value rather than -1, so treat
+	 * every negative return as "not found".
+	 */
+	res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
+	if (res >= 0)
+		error(1, 0, "socket stat found while flow not active\n");
+
+	for (i = 0; i < 10; i++) {
+		res = sendto(s_send, &message, sizeof(message), 0,
+			     (struct sockaddr *)&si_other, slen);
+		if (res == -1)
+			error(1, errno, "send\n");
+		if ((size_t)res != sizeof(message))
+			error(1, 0, "%dB != %zuB\n", res, sizeof(message));
+
+		/* recvfrom() rewrites the length, so reset it every round. */
+		from_len = sizeof(si_me);
+		recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0,
+				    (struct sockaddr *)&si_me, &from_len);
+		if (recv_len < 0)
+			error(1, errno, "receive\n");
+		res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr),
+			     sizeof(si_me.sin_addr));
+		if (res != 0)
+			error(1, EFAULT, "sender addr error: %d\n", res);
+		printf("Message received: %c\n", buf);
+
+		res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
+		if (res < 0)
+			error(1, errno, "lookup sk stat failed, cookie: %lu\n",
+			      cookie);
+		printf("cookie: %lu, uid: 0x%x, Packet Count: %lu,"
+		       " Bytes Count: %lu\n\n", cookie, dataEntry.uid,
+		       dataEntry.packets, dataEntry.bytes);
+	}
+	close(s_send);
+	close(s_rcv);
+}
+
+/* Print the command-line synopsis to stdout; always returns 1. */
+static int usage(void)
+{
+	static const char help_text[] =
+		"Usage: ./run_cookie_uid_helper_example.sh"
+		" bpfObjName -option\n"
+		" -t traffic monitor test\n"
+		" -s getsockopt cookie test\n";
+
+	fputs(help_text, stdout);
+	return 1;
+}
+
+/*
+ * Signal handler installed by main() for SIGINT and SIGTERM: raises
+ * test_finish so the traffic-monitoring loop in main() stops. The signal
+ * number argument is not used.
+ */
+static void finish(int ret)
+{
+ test_finish = true;
+}
+
+/*
+ * Entry point: expects exactly one test option (-t or -s) and one path at
+ * which the program is pinned, in either order.
+ *
+ * Returns 0 on success, 1 for a usage error and -1 for an unknown option;
+ * the helpers it calls exit the process themselves on failure.
+ */
+int main(int argc, char *argv[])
+{
+	int opt;
+	bool cfg_test_traffic = false;
+	bool cfg_test_cookie = false;
+
+	if (argc != 3)
+		return usage();
+	while ((opt = getopt(argc, argv, "ts")) != -1) {
+		switch (opt) {
+		case 't':
+			cfg_test_traffic = true;
+			break;
+		case 's':
+			cfg_test_cookie = true;
+			break;
+
+		default:
+			/* getopt() returns '?' here; optopt is the real character. */
+			printf("unknown option %c\n", optopt);
+			usage();
+			return -1;
+		}
+	}
+	/*
+	 * After option parsing exactly one non-option argument (the pin path)
+	 * must remain and a test must have been selected. Otherwise an option
+	 * string such as "-s" would be used as the pin path, or the program
+	 * would be pinned and attached without running anything.
+	 */
+	if (optind != argc - 1 || (!cfg_test_traffic && !cfg_test_cookie))
+		return usage();
+
+	maps_create();
+	prog_load();
+	prog_attach_iptables(argv[optind]);
+	if (cfg_test_traffic) {
+		if (signal(SIGINT, finish) == SIG_ERR)
+			error(1, errno, "register SIGINT handler failed");
+		if (signal(SIGTERM, finish) == SIG_ERR)
+			error(1, errno, "register SIGTERM handler failed");
+		/* Dump the table once a second until a signal sets the flag. */
+		while (!test_finish) {
+			print_table();
+			printf("\n");
+			sleep(1);
+		}
+	} else if (cfg_test_cookie) {
+		udp_client();
+	}
+	close(prog_fd);
+	close(map_fd);
+	return 0;
+}
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c
new file mode 100644
index 000000000000..7ec7143e2757
--- /dev/null
+++ b/samples/bpf/cpustat_kern.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/*
+ * The CPU number, cstate number and pstate number are based
+ * on 96boards Hikey with octa CA53 CPUs.
+ *
+ * Every CPU have three idle states for cstate:
+ * WFI, CPU_OFF, CLUSTER_OFF
+ *
+ * Every CPU have 5 operating points:
+ * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
+ *
+ * This code is based on these assumption and other platforms
+ * need to adjust these definitions.
+ */
+#define MAX_CPU 8
+#define MAX_PSTATE_ENTRIES 5
+#define MAX_CSTATE_ENTRIES 3
+
+static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
+
+/*
+ * my_map structure is used to record cstate and pstate index and
+ * timestamp (Idx, Ts), when new event incoming we need to update
+ * combination for new state index and timestamp (Idx`, Ts`).
+ *
+ * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
+ * interval for the previous state: Duration(Idx) = Ts` - Ts.
+ *
+ * Every CPU has one below array for recording state index and
+ * timestamp, and records for cstate and pstate separately:
+ *
+ * +--------------------------+
+ * | cstate timestamp |
+ * +--------------------------+
+ * | cstate index |
+ * +--------------------------+
+ * | pstate timestamp |
+ * +--------------------------+
+ * | pstate index |
+ * +--------------------------+
+ */
+#define MAP_OFF_CSTATE_TIME 0
+#define MAP_OFF_CSTATE_IDX 1
+#define MAP_OFF_PSTATE_TIME 2
+#define MAP_OFF_PSTATE_IDX 3
+#define MAP_OFF_NUM 4
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, MAX_CPU * MAP_OFF_NUM);
+} my_map SEC(".maps");
+
+/* cstate_duration records duration time for every idle state per CPU */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
+} cstate_duration SEC(".maps");
+
+/* pstate_duration records duration time for every operating point per CPU */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
+} pstate_duration SEC(".maps");
+
+/*
+ * The trace events for cpu_idle and cpu_frequency are taken from:
+ * /sys/kernel/tracing/events/power/cpu_idle/format
+ * /sys/kernel/tracing/events/power/cpu_frequency/format
+ *
+ * These two events have same format, so define one common structure.
+ */
+struct cpu_args {
+ u64 pad;
+ u32 state;
+ u32 cpu_id;
+};
+
+/*
+ * Map a frequency to its index in cpu_opps[].
+ *
+ * Returns the index of the matching entry, or the number of entries in
+ * cpu_opps[] (equal to MAX_PSTATE_ENTRIES) when @frequency is not listed.
+ */
+static u32 find_cpu_pstate_idx(u32 frequency)
+{
+	u32 i;
+
+	/* Derive the count from the array's own element type, not from u32. */
+	for (i = 0; i < sizeof(cpu_opps) / sizeof(cpu_opps[0]); i++) {
+		if (frequency == cpu_opps[i])
+			return i;
+	}
+
+	return i;
+}
+
+/*
+ * cpu_idle tracepoint handler.
+ *
+ * ctx->state is the idle state being entered, or (u32)-1 when the CPU
+ * leaves idle. On idle entry the time spent at the current operating point
+ * since the last pstate timestamp is added to pstate_duration; on idle exit
+ * the time spent in the previous idle state is added to cstate_duration.
+ * Always returns 0.
+ */
+SEC("tracepoint/power/cpu_idle")
+int bpf_prog1(struct cpu_args *ctx)
+{
+	u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
+	u32 key, cpu, pstate_idx;
+	u64 *val;
+
+	/* Valid CPU ids are 0 .. MAX_CPU - 1. */
+	if (ctx->cpu_id >= MAX_CPU)
+		return 0;
+
+	cpu = ctx->cpu_id;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
+	cts = bpf_map_lookup_elem(&my_map, &key);
+	if (!cts)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
+	cstate = bpf_map_lookup_elem(&my_map, &key);
+	if (!cstate)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
+	pts = bpf_map_lookup_elem(&my_map, &key);
+	if (!pts)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
+	pstate = bpf_map_lookup_elem(&my_map, &key);
+	if (!pstate)
+		return 0;
+
+	/* Remember the state being left before recording the new one. */
+	prev_state = *cstate;
+	*cstate = ctx->state;
+
+	/* First event on this CPU: only take the initial timestamp. */
+	if (!*cts) {
+		*cts = bpf_ktime_get_ns();
+		return 0;
+	}
+
+	cur_ts = bpf_ktime_get_ns();
+	delta = cur_ts - *cts;
+	*cts = cur_ts;
+
+	/*
+	 * When state doesn't equal to (u32)-1, the cpu will enter
+	 * one idle state; for this case we need to record interval
+	 * for the pstate.
+	 *
+	 *                 OPP2
+	 *            +---------------------+
+	 *     OPP1   |                     |
+	 *   ---------+                     |
+	 *                                  |  Idle state
+	 *                                  +---------------
+	 *
+	 *            |<- pstate duration ->|
+	 *            ^                     ^
+	 *           pts                  cur_ts
+	 */
+	if (ctx->state != (u32)-1) {
+
+		/* record pstate after have first cpu_frequency event */
+		if (!*pts)
+			return 0;
+
+		delta = cur_ts - *pts;
+
+		pstate_idx = find_cpu_pstate_idx(*pstate);
+		if (pstate_idx >= MAX_PSTATE_ENTRIES)
+			return 0;
+
+		key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
+		val = bpf_map_lookup_elem(&pstate_duration, &key);
+		if (val)
+			__sync_fetch_and_add((long *)val, delta);
+
+	/*
+	 * When state equal to (u32)-1, the cpu just exits from one
+	 * specific idle state; for this case we need to record
+	 * interval for the cstate.
+	 *
+	 *       OPP2
+	 *   -----------+
+	 *              |                          OPP1
+	 *              |                     +-----------
+	 *              |     Idle state      |
+	 *              +---------------------+
+	 *
+	 *              |<- cstate duration ->|
+	 *              ^                     ^
+	 *             cts                  cur_ts
+	 */
+	} else {
+
+		/*
+		 * Only account states that fit this CPU's slice of the map.
+		 * A larger index (including a stale (u32)-1) would otherwise
+		 * be charged to a slot that belongs to another CPU.
+		 */
+		if (prev_state < MAX_CSTATE_ENTRIES) {
+			key = cpu * MAX_CSTATE_ENTRIES + prev_state;
+			val = bpf_map_lookup_elem(&cstate_duration, &key);
+			if (val)
+				__sync_fetch_and_add((long *)val, delta);
+		}
+	}
+
+	/* Update timestamp for pstate as new start time */
+	if (*pts)
+		*pts = cur_ts;
+
+	return 0;
+}
+
+/*
+ * cpu_frequency tracepoint handler.
+ *
+ * ctx->state is the new frequency. The time elapsed since the last pstate
+ * timestamp was spent at the previous frequency, so it is added to that
+ * frequency's slot in pstate_duration, unless the CPU is currently idle.
+ * Always returns 0.
+ */
+SEC("tracepoint/power/cpu_frequency")
+int bpf_prog2(struct cpu_args *ctx)
+{
+	u64 *pts, *cstate, *pstate, prev_pstate, cur_ts, delta;
+	u32 key, cpu, pstate_idx;
+	u64 *val;
+
+	/* Valid CPU ids are 0 .. MAX_CPU - 1. */
+	if (ctx->cpu_id >= MAX_CPU)
+		return 0;
+
+	cpu = ctx->cpu_id;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
+	pts = bpf_map_lookup_elem(&my_map, &key);
+	if (!pts)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
+	pstate = bpf_map_lookup_elem(&my_map, &key);
+	if (!pstate)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
+	cstate = bpf_map_lookup_elem(&my_map, &key);
+	if (!cstate)
+		return 0;
+
+	/* Keep the frequency being left; the elapsed interval belongs to it. */
+	prev_pstate = *pstate;
+	*pstate = ctx->state;
+
+	/* First event on this CPU: only take the initial timestamp. */
+	if (!*pts) {
+		*pts = bpf_ktime_get_ns();
+		return 0;
+	}
+
+	cur_ts = bpf_ktime_get_ns();
+	delta = cur_ts - *pts;
+	*pts = cur_ts;
+
+	/* When CPU is in idle, bail out to skip pstate statistics */
+	if (*cstate != (u32)(-1))
+		return 0;
+
+	/*
+	 * The cpu changes to another different OPP (in below diagram
+	 * change frequency from OPP3 to OPP1), need recording interval
+	 * for previous frequency OPP3 and update timestamp as start
+	 * time for new frequency OPP1.
+	 *
+	 *                 OPP3
+	 *            +---------------------+
+	 *     OPP2   |                     |
+	 *   ---------+                     |
+	 *                                  |    OPP1
+	 *                                  +---------------
+	 *
+	 *            |<- pstate duration ->|
+	 *            ^                     ^
+	 *           pts                  cur_ts
+	 */
+	pstate_idx = find_cpu_pstate_idx(prev_pstate);
+	if (pstate_idx >= MAX_PSTATE_ENTRIES)
+		return 0;
+
+	key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
+	val = bpf_map_lookup_elem(&pstate_duration, &key);
+	if (val)
+		__sync_fetch_and_add((long *)val, delta);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c
new file mode 100644
index 000000000000..356f756cba0d
--- /dev/null
+++ b/samples/bpf/cpustat_user.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sched.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+static int cstate_map_fd, pstate_map_fd;
+
+#define MAX_CPU 8
+#define MAX_PSTATE_ENTRIES 5
+#define MAX_CSTATE_ENTRIES 3
+#define MAX_STARS 40
+
+/*
+ * sysfs knob and the two values (in kHz) written to it to provoke
+ * cpu_frequency trace events: the lowest operating point, 208 MHz, and the
+ * highest one, 1.2 GHz.
+ */
+#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq"
+#define CPUFREQ_LOWEST_FREQ "208000"
+#define CPUFREQ_HIGHEST_FREQ "1200000"
+
+struct cpu_stat_data {
+ unsigned long cstate[MAX_CSTATE_ENTRIES];
+ unsigned long pstate[MAX_PSTATE_ENTRIES];
+};
+
+static struct cpu_stat_data stat_data[MAX_CPU];
+
+/*
+ * Clear the terminal and print stat_data as a table: one row per CPU, one
+ * column per cstate followed by one column per pstate. Stored values are
+ * divided by 1000000 before printing, matching the "state(ms)" heading.
+ */
+static void cpu_stat_print(void)
+{
+	char label[sizeof("cstate-9")];
+	const struct cpu_stat_data *row;
+	int col;
+
+	/* Clear screen */
+	printf("\033[2J");
+
+	/* Header */
+	printf("\nCPU states statistics:\n");
+	printf("%-10s ", "state(ms)");
+
+	col = 0;
+	while (col < MAX_CSTATE_ENTRIES) {
+		sprintf(label, "cstate-%d", col);
+		printf("%-11s ", label);
+		col++;
+	}
+
+	col = 0;
+	while (col < MAX_PSTATE_ENTRIES) {
+		sprintf(label, "pstate-%d", col);
+		printf("%-11s ", label);
+		col++;
+	}
+
+	printf("\n");
+
+	/* Body: walk the per-CPU records in index order. */
+	for (row = stat_data; row < stat_data + MAX_CPU; row++) {
+		printf("CPU-%-6d ", (int)(row - stat_data));
+
+		for (col = 0; col < MAX_CSTATE_ENTRIES; col++)
+			printf("%-11lu ", row->cstate[col] / 1000000);
+
+		for (col = 0; col < MAX_PSTATE_ENTRIES; col++)
+			printf("%-11lu ", row->pstate[col] / 1000000);
+
+		printf("\n");
+	}
+}
+
+/*
+ * Refresh stat_data from the two duration maps.
+ *
+ * @cstate_fd: fd of the cstate_duration map
+ * @pstate_fd: fd of the pstate_duration map
+ *
+ * An entry whose lookup fails keeps its previously stored value.
+ */
+static void cpu_stat_update(int cstate_fd, int pstate_fd)
+{
+	/*
+	 * The maps are declared with 32-bit keys and 64-bit values, so use
+	 * objects of exactly those widths for the lookup buffers.
+	 */
+	unsigned int key;
+	unsigned long long value;
+	int c, i;
+
+	for (c = 0; c < MAX_CPU; c++) {
+		for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
+			key = c * MAX_CSTATE_ENTRIES + i;
+			if (!bpf_map_lookup_elem(cstate_fd, &key, &value))
+				stat_data[c].cstate[i] = value;
+		}
+
+		for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
+			key = c * MAX_PSTATE_ENTRIES + i;
+			if (!bpf_map_lookup_elem(pstate_fd, &key, &value))
+				stat_data[c].pstate[i] = value;
+		}
+	}
+}
+
+/*
+ * This function is copied from 'idlestat' tool function
+ * idlestat_wake_all() in idlestate.c.
+ *
+ * It sets the self running task affinity to cpus one by one so can wake up
+ * the specific CPU to handle scheduling; this results in all cpus can be
+ * waken up once and produce ftrace event 'trace_cpu_idle'.
+ */
+/*
+ * Move the calling task onto each CPU of its affinity mask in turn (except
+ * the one it is already running on), then restore the original mask.
+ *
+ * Returns 0 on success, or -1 if the CPU count or the current CPU cannot be
+ * determined.
+ */
+static int cpu_stat_inject_cpu_idle_event(void)
+{
+	cpu_set_t saved_mask;
+	cpu_set_t one_cpu;
+	int nr_cpus, self, cpu;
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	if (nr_cpus < 0)
+		return -1;
+
+	self = sched_getcpu();
+	if (self < 0)
+		return -1;
+
+	/* Keep track of the CPUs we will run on */
+	sched_getaffinity(0, sizeof(saved_mask), &saved_mask);
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		/*
+		 * Only visit CPUs other than our own that are part of the
+		 * original mask; the rest would be pointless to wake.
+		 */
+		if (cpu != self && CPU_ISSET(cpu, &saved_mask)) {
+			CPU_ZERO(&one_cpu);
+			CPU_SET(cpu, &one_cpu);
+
+			sched_setaffinity(0, sizeof(one_cpu), &one_cpu);
+		}
+	}
+
+	/* Enable all the CPUs of the original mask */
+	sched_setaffinity(0, sizeof(saved_mask), &saved_mask);
+	return 0;
+}
+
+/*
+ * It's possible to have no any frequency change for long time and cannot
+ * get ftrace event 'trace_cpu_frequency' for long period, this introduces
+ * big deviation for pstate statistics.
+ *
+ * To solve this issue, below code forces to set 'scaling_max_freq' to 208MHz
+ * for triggering ftrace event 'trace_cpu_frequency' and then recovery back to
+ * the maximum frequency value 1.2GHz.
+ */
+/*
+ * Write the lowest and then the highest frequency to scaling_max_freq.
+ *
+ * Returns a negative value if the file cannot be opened or a write fails,
+ * otherwise the (non-negative) result of the last write.
+ */
+static int cpu_stat_inject_cpu_frequency_event(void)
+{
+	int len, fd;
+
+	fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY);
+	if (fd < 0) {
+		printf("failed to open scaling_max_freq, errno=%d\n", errno);
+		return fd;
+	}
+
+	len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ));
+	if (len < 0) {
+		printf("failed to write lowest frequency to scaling_max_freq, errno=%d\n",
+		       errno);
+		goto out;
+	}
+
+	len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ));
+	if (len < 0)
+		printf("failed to write highest frequency to scaling_max_freq, errno=%d\n",
+		       errno);
+
+out:
+	close(fd);
+	return len;
+}
+
+/*
+ * Handler installed by main() for SIGINT and SIGTERM: injects one more idle
+ * event and one more frequency event, refreshes stat_data from the maps,
+ * prints the table one last time and exits with status 0. The signal number
+ * argument is not used.
+ */
+static void int_exit(int sig)
+{
+ cpu_stat_inject_cpu_idle_event();
+ cpu_stat_inject_cpu_frequency_event();
+ cpu_stat_update(cstate_map_fd, pstate_map_fd);
+ cpu_stat_print();
+ exit(0);
+}
+
+/*
+ * Entry point: open and load "<argv[0]>_kern.o", attach both tracepoint
+ * programs, trigger an initial idle and frequency event, then refresh and
+ * print the statistics every five seconds until a signal ends the process.
+ *
+ * Returns 1 if any set-up step fails; on success the loop never returns and
+ * the process exits from the signal handler.
+ */
+int main(int argc, char **argv)
+{
+	/* One handler per tracepoint; both must be attached to get pstate data. */
+	static const char * const prog_names[] = { "bpf_prog1", "bpf_prog2" };
+	struct bpf_link *links[sizeof(prog_names) / sizeof(prog_names[0])] = { NULL };
+	const int nr_progs = sizeof(prog_names) / sizeof(prog_names[0]);
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 1;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	cstate_map_fd = bpf_object__find_map_fd_by_name(obj, "cstate_duration");
+	pstate_map_fd = bpf_object__find_map_fd_by_name(obj, "pstate_duration");
+	if (cstate_map_fd < 0 || pstate_map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	for (i = 0; i < nr_progs; i++) {
+		prog = bpf_object__find_program_by_name(obj, prog_names[i]);
+		if (!prog) {
+			printf("finding a prog in obj file failed\n");
+			goto cleanup;
+		}
+
+		links[i] = bpf_program__attach(prog);
+		if (libbpf_get_error(links[i])) {
+			fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+			links[i] = NULL;
+			goto cleanup;
+		}
+	}
+
+	/* Failures from here on must still release the links and the object. */
+	if (cpu_stat_inject_cpu_idle_event() < 0)
+		goto cleanup;
+
+	if (cpu_stat_inject_cpu_frequency_event() < 0)
+		goto cleanup;
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	while (1) {
+		cpu_stat_update(cstate_map_fd, pstate_map_fd);
+		cpu_stat_print();
+		sleep(5);
+	}
+
+cleanup:
+	/* Only reached on failure; entries never attached are still NULL. */
+	for (i = nr_progs - 1; i >= 0; i--)
+		bpf_link__destroy(links[i]);
+	bpf_object__close(obj);
+	return 1;
+}
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
new file mode 100755
index 000000000000..7f4f722787d5
--- /dev/null
+++ b/samples/bpf/do_hbm_test.sh
@@ -0,0 +1,438 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2019 Facebook
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+
+# Print the help text for this script, then leave the script through a
+# plain "exit".
+Usage() {
+  echo "Script for testing HBM (Host Bandwidth Manager) framework."
+  echo "It creates a cgroup to use for testing and load a BPF program to limit"
+  echo "egress or ingress bandwidth. It then uses iperf3 or netperf to create"
+  echo "loads. The output is the goodput in Mbps (unless -D was used)."
+  echo ""
+  echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
+  echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]"
+  echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id>]"
+  echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
+  echo " [-q=<qdisc>] [-r=<rate>|--rate=<rate>] [-R]"
+  echo " [-s=<server>|--server=<server>]"
+  echo " [-S|--stats] [-t=<time>|--time=<time>] [-w] [cubic|dctcp]"
+  echo " Where:"
+  echo " out egress (default)"
+  echo " -b or --bpf BPF program filename to load and attach."
+  echo " Default is hbm_out_kern.o for egress."
+  echo " -c or --cc TCP congestion control (cubic or dctcp)"
+  echo " --debug print BPF trace buffer"
+  echo " -d or --delay add a delay in ms using netem"
+  echo " -D In addition to the goodput in Mbps, it also outputs"
+  echo " other detailed information. This information is"
+  echo " test dependent (i.e. iperf3 or netperf)."
+  echo " -E enable ECN (not required for dctcp)"
+  echo " --edt use fq's Earliest Departure Time (requires fq)"
+  echo " -f or --flows number of concurrent flows (default=1)"
+  echo " -i or --id cgroup id (an integer, default is 1)"
+  echo " -N use netperf instead of iperf3"
+  echo " --no_cn Do not return CN notifications"
+  echo " -l do not limit flows using loopback"
+  echo " -h Help"
+  echo " -p or --port iperf3 port (default is 5201)"
+  echo " -P use an iperf3 instance for each flow"
+  echo " -q use the specified qdisc"
+  echo " -r or --rate rate in Mbps (default is 1Gbps)"
+  echo " -R Use TCP_RR for netperf. 1st flow has req"
+  echo " size of 10KB, rest of 1MB. Reply in all"
+  echo " cases is 1 byte."
+  echo " More detailed output for each flow can be found"
+  echo " in the files netperf.<cg>.<flow>, where <cg> is the"
+  echo " cgroup id as specified with the -i flag, and <flow>"
+  echo " is the flow id starting at 1 and increasing by 1 for"
+  echo " each flow (as specified by -f)."
+  echo " -s or --server hostname of netperf server. Used to create netperf"
+  echo " test traffic between two hosts (default is within host)"
+  echo " netserver must be running on the host."
+  echo " -S or --stats whether to update hbm stats (default is yes)."
+  echo " -t or --time duration of iperf3 in seconds (default=5)"
+  echo " -w Work conserving flag. cgroup can increase its"
+  echo " bandwidth beyond the rate limit specified"
+  echo " while there is available bandwidth. Current"
+  echo " implementation assumes there is only one NIC"
+  echo " (eth0), but can be extended to support multiple"
+  echo " NICs."
+  echo " cubic or dctcp specify which TCP CC to use"
+  echo " "
+  exit
+}
+
+#set -x
+
+debug_flag=0
+args="$@"
+name="$0"
+netem=0
+cc=x
+dir="-o"
+dir_name="out"
+dur=5
+flows=1
+id=1
+prog=""
+port=5201
+rate=1000
+multi_iperf=0
+flow_cnt=1
+use_netperf=0
+rr=0
+ecn=0
+details=0
+server=""
+qdisc=""
+flags=""
+do_stats=0
+
+BPFFS=/sys/fs/bpf
+# Mount a BPF filesystem on $BPFFS unless the output of "mount" already
+# mentions that path.
+function config_bpffs () {
+  if ! mount | grep $BPFFS > /dev/null; then
+    echo "bpffs not mounted. Mounting..."
+    mount -t bpf none $BPFFS
+    return
+  fi
+  echo "bpffs already mounted"
+}
+
+# Start ./hbm in the background with the options collected so far. The
+# command line and everything hbm prints go to hbm.out; the pid of the
+# background job is written to stdout for the caller to capture.
+function start_hbm () {
+  local hbm_cmd="./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog"
+
+  rm -f hbm.out
+  { echo "$hbm_cmd"; echo " "; } > hbm.out
+  # left unquoted on purpose: the string is split back into words
+  $hbm_cmd >> hbm.out 2>&1 &
+  echo $!
+}
+
+processArgs () {
+ for i in $args ; do
+ case $i in
+ # Support for upcoming ingress rate limiting
+ #in) # support for upcoming ingress rate limiting
+ # dir="-i"
+ # dir_name="in"
+ # ;;
+ out)
+ dir="-o"
+ dir_name="out"
+ ;;
+ -b=*|--bpf=*)
+ prog="${i#*=}"
+ ;;
+ -c=*|--cc=*)
+ cc="${i#*=}"
+ ;;
+ --no_cn)
+ flags="$flags --no_cn"
+ ;;
+ --debug)
+ flags="$flags -d"
+ debug_flag=1
+ ;;
+ -d=*|--delay=*)
+ netem="${i#*=}"
+ ;;
+ -D)
+ details=1
+ ;;
+ -E)
+ ecn=1
+ ;;
+ --edt)
+ flags="$flags --edt"
+ qdisc="fq"
+ ;;
+ -f=*|--flows=*)
+ flows="${i#*=}"
+ ;;
+ -i=*|--id=*)
+ id="${i#*=}"
+ ;;
+ -l)
+ flags="$flags -l"
+ ;;
+ -N)
+ use_netperf=1
+ ;;
+ -p=*|--port=*)
+ port="${i#*=}"
+ ;;
+ -P)
+ multi_iperf=1
+ ;;
+ -q=*)
+ qdisc="${i#*=}"
+ ;;
+ -r=*|--rate=*)
+ rate="${i#*=}"
+ ;;
+ -R)
+ rr=1
+ ;;
+ -s=*|--server=*)
+ server="${i#*=}"
+ ;;
+ -S|--stats)
+ flags="$flags -s"
+ do_stats=1
+ ;;
+ -t=*|--time=*)
+ dur="${i#*=}"
+ ;;
+ -w)
+ flags="$flags -w"
+ ;;
+ cubic)
+ cc=cubic
+ ;;
+ dctcp)
+ cc=dctcp
+ ;;
+ *)
+ echo "Unknown arg:$i"
+ Usage
+ ;;
+ esac
+ done
+}
+
+# Test setup: parse the arguments, make sure bpffs is mounted, start hbm
+# in the background, add this shell to the test cgroup, apply the
+# requested sysctl and qdisc settings and start a background ping.
+processArgs
+config_bpffs
+
+[ $debug_flag -eq 1 ] && rm -f hbm_out.log
+
+hbm_pid=$(start_hbm)
+usleep 100000
+
+host=$(hostname)
+cg_base_dir=/sys/fs/cgroup/unified
+cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id"
+
+echo $$ >> $cg_dir/cgroup.procs
+
+ulimit -l unlimited
+
+rm -f ss.out
+rm -f hbm.[0-9]*.$dir_name
+[ $ecn -ne 0 ] && sysctl -w -q -n net.ipv4.tcp_ecn=1
+
+# The congestion control is only switched for the iperf3 tests; the
+# current value is remembered in cur_cc.
+if [ $use_netperf -eq 0 ] ; then
+  cur_cc=$(sysctl -n net.ipv4.tcp_congestion_control)
+  [ "$cc" != "x" ] && sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc
+fi
+
+# A netem delay on lo takes precedence over a qdisc requested for eth0.
+if [ "$netem" -ne "0" ] ; then
+  [ "$qdisc" != "" ] && echo "WARNING: Ignoring -q options because -d option used"
+  tc qdisc del dev lo root > /dev/null 2>&1
+  tc qdisc add dev lo root netem delay ${netem}ms > /dev/null 2>&1
+elif [ "$qdisc" != "" ] ; then
+  tc qdisc del dev eth0 root > /dev/null 2>&1
+  tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1
+fi
+
+n=0
+m=$(($dur * 5))
+hn="::1"
+if [ $use_netperf -ne 0 ] && [ "$server" != "" ] ; then
+  hn=$server
+fi
+
+# 5 pings per second of test time, collected in ping.out
+( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) &
+
+if [ $use_netperf -ne 0 ] ; then # -N given: generate the load with netperf
+ begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \
+ awk '{ print $1 }'`
+ if [ "$begNetserverPid" == "" ] ; then # no netserver process was found above
+ if [ "$server" == "" ] ; then # no -s given: start a local netserver
+ ( ./netserver > /dev/null 2>&1) &
+ usleep 100000
+ fi
+ fi
+ flow_cnt=1
+ if [ "$server" == "" ] ; then
+ np_server=$host
+ else
+ np_server=$server
+ fi
+ if [ "$cc" == "x" ] ; then # "x" is the initial value of cc, i.e. none was requested
+ np_cc=""
+ else
+ np_cc="-K $cc,$cc"
+ fi
+ replySize=1
+ while [ $flow_cnt -le $flows ] ; do # start one background netperf per flow
+ if [ $rr -ne 0 ] ; then # -R given: TCP_RR, 10K request for flow 1 and 1M for the rest
+ reqSize=1M
+ if [ $flow_cnt -eq 1 ] ; then
+ reqSize=10K
+ fi
+ if [ "$dir" == "-i" ] ; then # ingress: swap sizes ("in" parsing is commented out in processArgs)
+ replySize=$reqSize
+ reqSize=1
+ fi
+ ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+ else
+ if [ "$dir" == "-i" ] ; then
+ ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+ else
+ ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+ fi
+ fi
+ flow_cnt=$[flow_cnt+1]
+ done
+
+# sleep for duration of test (plus some buffer)
+ n=$[dur+2]
+ sleep $n
+
+# force graceful termination of netperf
+ pids=`pgrep netperf`
+ for p in $pids ; do
+ kill -SIGALRM $p
+ done
+
+ flow_cnt=1
+ rate=0 # reused from here on as the aggregate goodput accumulator
+ if [ $details -ne 0 ] ; then
+ echo ""
+ echo "Details for HBM in cgroup $id"
+ if [ $do_stats -eq 1 ] ; then
+ if [ -e hbm.$id.$dir_name ] ; then
+ cat hbm.$id.$dir_name
+ fi
+ fi
+ fi
+ while [ $flow_cnt -le $flows ] ; do # add up the throughput found in each netperf.<id>.<flow> file
+ if [ "$dir" == "-i" ] ; then
+ r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
+ else
+ r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
+ fi
+ echo "rate for flow $flow_cnt: $r"
+ rate=$[rate+r]
+ if [ $details -ne 0 ] ; then
+ echo "-----"
+ echo "Details for cgroup $id, flow $flow_cnt"
+ cat netperf.$id.$flow_cnt
+ fi
+ flow_cnt=$[flow_cnt+1]
+ done
+ if [ $details -ne 0 ] ; then
+ echo ""
+ delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+ echo "PING AVG DELAY:$delay"
+ echo "AGGREGATE_GOODPUT:$rate"
+ else
+ echo $rate
+ fi
+elif [ $multi_iperf -eq 0 ] ; then # default: one iperf3 client running $flows parallel streams
+ (iperf3 -s -p $port -1 > /dev/null 2>&1) &
+ usleep 100000
+ iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id
+ rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"`
+ rate=`echo $rates | grep -o "[0-9]*$"` # keep only the last number of the list
+
+ if [ $details -ne 0 ] ; then
+ echo ""
+ echo "Details for HBM in cgroup $id"
+ if [ $do_stats -eq 1 ] ; then
+ if [ -e hbm.$id.$dir_name ] ; then
+ cat hbm.$id.$dir_name
+ fi
+ fi
+ delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+ echo "PING AVG DELAY:$delay"
+ echo "AGGREGATE_GOODPUT:$rate"
+ else
+ echo $rate
+ fi
+else # -P given: one iperf3 server/client pair per flow, each on its own port
+ flow_cnt=1
+ while [ $flow_cnt -le $flows ] ; do
+ (iperf3 -s -p $port -1 > /dev/null 2>&1) &
+ ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) &
+ port=$[port+1]
+ flow_cnt=$[flow_cnt+1]
+ done
+ n=$[dur+1]
+ sleep $n
+ flow_cnt=1
+ rate=0
+ if [ $details -ne 0 ] ; then
+ echo ""
+ echo "Details for HBM in cgroup $id"
+ if [ $do_stats -eq 1 ] ; then
+ if [ -e hbm.$id.$dir_name ] ; then
+ cat hbm.$id.$dir_name
+ fi
+ fi
+ fi
+
+ while [ $flow_cnt -le $flows ] ; do # add up the per-flow rates saved in iperf3.<id>.<flow>
+ r=`cat iperf3.$id.$flow_cnt`
+# echo "rate for flow $flow_cnt: $r"
+ if [ $details -ne 0 ] ; then
+ echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r"
+ fi
+ rate=$[rate+r]
+ flow_cnt=$[flow_cnt+1]
+ done
+ if [ $details -ne 0 ] ; then
+ delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+ echo "PING AVG DELAY:$delay"
+ echo "AGGREGATE_GOODPUT:$rate"
+ else
+ echo $rate
+ fi
+fi
+
+# Teardown: put back the settings changed for the test, stop the helper
+# processes started by this script and remove leftover pinned objects.
+if [ $use_netperf -eq 0 ] ; then
+  sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc
+fi
+if [ $ecn -ne 0 ] ; then
+  sysctl -w -q -n net.ipv4.tcp_ecn=0
+fi
+# Mirror the setup logic: when a netem delay was requested the -q option
+# was ignored and nothing was installed on eth0, so only remove the qdisc
+# this script actually added. Previously the eth0 root qdisc was deleted
+# whenever -q was given, even though the script never installed it.
+if [ "$netem" -ne "0" ] ; then
+  tc qdisc del dev lo root > /dev/null 2>&1
+elif [ "$qdisc" != "" ] ; then
+  tc qdisc del dev eth0 root > /dev/null 2>&1
+fi
+sleep 2
+
+# Only kill hbm when the one process found by ps is the one started here.
+hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'`
+if [ "$hbmPid" == "$hbm_pid" ] ; then
+  kill $hbm_pid
+fi
+
+sleep 1
+
+# Detach any pinned BPF programs that may have lingered
+rm -rf $BPFFS/hbm*
+
+# Stop netserver only if this script had to start it (local test and no
+# netserver running beforehand).
+if [ $use_netperf -ne 0 ] ; then
+  if [ "$server" == "" ] ; then
+    if [ "$begNetserverPid" == "" ] ; then
+      netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'`
+      if [ "$netserverPid" != "" ] ; then
+        kill $netserverPid
+      fi
+    fi
+  fi
+fi
+exit
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
new file mode 100644
index 000000000000..88a26f3ce201
--- /dev/null
+++ b/samples/bpf/fds_example.c
@@ -0,0 +1,195 @@
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf.h>
+
+#include <bpf/libbpf.h>
+#include "bpf_insn.h"
+#include "sock_example.h"
+#include "bpf_util.h"
+
+#define BPF_F_PIN (1 << 0)
+#define BPF_F_GET (1 << 1)
+#define BPF_F_PIN_GET (BPF_F_PIN | BPF_F_GET)
+
+#define BPF_F_KEY (1 << 2)
+#define BPF_F_VAL (1 << 3)
+#define BPF_F_KEY_VAL (BPF_F_KEY | BPF_F_VAL)
+
+#define BPF_M_UNSPEC 0
+#define BPF_M_MAP 1
+#define BPF_M_PROG 2
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+/* Print the fds_example command line help to stdout. */
+static void usage(void)
+{
+	fputs("Usage: fds_example [...]\n"
+	      " -F <file> File to pin/get object\n"
+	      " -P |- pin object\n"
+	      " -G `- get object\n"
+	      " -m eBPF map mode\n"
+	      " -k <key> |- map key\n"
+	      " -v <value> `- map value\n"
+	      " -p eBPF prog mode\n"
+	      " -o <object> `- object file\n"
+	      " -h Display this help.\n",
+	      stdout);
+}
+
+/*
+ * Produce a program fd to pin.
+ *
+ * With @object set, the object file is opened and loaded (both steps are
+ * asserted) and the fd of its first program is returned. Without it, a
+ * two-instruction socket filter ("r0 = 1; exit") is loaded, with the
+ * verifier log directed to bpf_log_buf.
+ */
+static int bpf_prog_create(const char *object)
+{
+	static struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+	struct bpf_object *obj;
+	int err;
+
+	if (!object) {
+		LIBBPF_OPTS(bpf_prog_load_opts, opts,
+			.log_buf = bpf_log_buf,
+			.log_size = BPF_LOG_BUF_SIZE,
+		);
+
+		return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL",
+				     insns, ARRAY_SIZE(insns), &opts);
+	}
+
+	obj = bpf_object__open_file(object, NULL);
+	assert(!libbpf_get_error(obj));
+	err = bpf_object__load(obj);
+	assert(!err);
+	return bpf_program__fd(bpf_object__next_program(obj, NULL));
+}
+
+/*
+ * Map mode: obtain a map fd, then optionally access one element.
+ *
+ * BPF_F_PIN creates a 1024-entry array map (u32 key, u32 value) and pins
+ * it at @file; otherwise the fd is fetched from @file. With both key and
+ * value flags set the element is updated, with only the key flag it is
+ * looked up. Every step is printed and asserted. Always returns 0.
+ */
+static int bpf_do_map(const char *file, uint32_t flags, uint32_t key,
+		      uint32_t value)
+{
+	int map_fd, rc;
+
+	if (!(flags & BPF_F_PIN)) {
+		map_fd = bpf_obj_get(file);
+		printf("bpf: get fd:%d (%s)\n", map_fd, strerror(errno));
+		assert(map_fd > 0);
+	} else {
+		map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL,
+					sizeof(uint32_t), sizeof(uint32_t),
+					1024, NULL);
+		printf("bpf: map fd:%d (%s)\n", map_fd, strerror(errno));
+		assert(map_fd > 0);
+
+		rc = bpf_obj_pin(map_fd, file);
+		printf("bpf: pin ret:(%d,%s)\n", rc, strerror(errno));
+		assert(rc == 0);
+	}
+
+	switch (flags & BPF_F_KEY_VAL) {
+	case BPF_F_KEY_VAL:
+		rc = bpf_map_update_elem(map_fd, &key, &value, 0);
+		printf("bpf: fd:%d u->(%u:%u) ret:(%d,%s)\n", map_fd, key,
+		       value, rc, strerror(errno));
+		assert(rc == 0);
+		break;
+	case BPF_F_KEY:
+		rc = bpf_map_lookup_elem(map_fd, &key, &value);
+		printf("bpf: fd:%d l->(%u):%u ret:(%d,%s)\n", map_fd, key,
+		       value, rc, strerror(errno));
+		assert(rc == 0);
+		break;
+	default:
+		/* no key given: nothing to read or write */
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Prog mode: obtain a program fd and attach it to a raw socket on "lo".
+ *
+ * BPF_F_PIN creates the program through bpf_prog_create() and pins it at
+ * @file; otherwise the fd is fetched from @file. Every step is printed
+ * and asserted. Always returns 0.
+ */
+static int bpf_do_prog(const char *file, uint32_t flags, const char *object)
+{
+	int prog_fd, raw_sock, rc;
+
+	if (!(flags & BPF_F_PIN)) {
+		prog_fd = bpf_obj_get(file);
+		printf("bpf: get fd:%d (%s)\n", prog_fd, strerror(errno));
+		assert(prog_fd > 0);
+	} else {
+		prog_fd = bpf_prog_create(object);
+		printf("bpf: prog fd:%d (%s)\n", prog_fd, strerror(errno));
+		assert(prog_fd > 0);
+
+		rc = bpf_obj_pin(prog_fd, file);
+		printf("bpf: pin ret:(%d,%s)\n", rc, strerror(errno));
+		assert(rc == 0);
+	}
+
+	raw_sock = open_raw_sock("lo");
+	assert(raw_sock > 0);
+
+	rc = setsockopt(raw_sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+			sizeof(prog_fd));
+	printf("bpf: sock:%d <- fd:%d attached ret:(%d,%s)\n", raw_sock,
+	       prog_fd, rc, strerror(errno));
+	assert(rc == 0);
+
+	return 0;
+}
+
+/*
+ * Parse the command line and run either the map or the prog mode.
+ *
+ * -F <file> plus at least one of -P/-G are mandatory, and one of -m/-p
+ * selects the mode. -h, which usage() advertises but the option string
+ * used to lack, now prints the help and returns 0 instead of being
+ * rejected as an invalid option.
+ *
+ * Return: the result of bpf_do_map()/bpf_do_prog(), 0 for -h, and -1
+ * after printing the usage text for any invalid or incomplete command
+ * line.
+ */
+int main(int argc, char **argv)
+{
+	const char *file = NULL, *object = NULL;
+	uint32_t key = 0, value = 0, flags = 0;
+	int opt, mode = BPF_M_UNSPEC;
+
+	while ((opt = getopt(argc, argv, "F:PGmk:v:po:h")) != -1) {
+		switch (opt) {
+		/* General args */
+		case 'F':
+			file = optarg;
+			break;
+		case 'P':
+			flags |= BPF_F_PIN;
+			break;
+		case 'G':
+			flags |= BPF_F_GET;
+			break;
+		/* Map-related args */
+		case 'm':
+			mode = BPF_M_MAP;
+			break;
+		case 'k':
+			key = strtoul(optarg, NULL, 0);
+			flags |= BPF_F_KEY;
+			break;
+		case 'v':
+			value = strtoul(optarg, NULL, 0);
+			flags |= BPF_F_VAL;
+			break;
+		/* Prog-related args */
+		case 'p':
+			mode = BPF_M_PROG;
+			break;
+		case 'o':
+			object = optarg;
+			break;
+		case 'h':
+			usage();
+			return 0;
+		default:
+			goto out;
+		}
+	}
+
+	if (!(flags & BPF_F_PIN_GET) || !file)
+		goto out;
+
+	switch (mode) {
+	case BPF_M_MAP:
+		return bpf_do_map(file, flags, key, value);
+	case BPF_M_PROG:
+		return bpf_do_prog(file, flags, object);
+	}
+out:
+	usage();
+	return -1;
+}
diff --git a/samples/bpf/gnu/stubs.h b/samples/bpf/gnu/stubs.h
new file mode 100644
index 000000000000..1c638d9dce1a
--- /dev/null
+++ b/samples/bpf/gnu/stubs.h
@@ -0,0 +1 @@
+/* dummy .h to trick /usr/include/features.h to work with 'clang --target=bpf' */
diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h
new file mode 100644
index 000000000000..38255812e376
--- /dev/null
+++ b/samples/bpf/hash_func01.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1
+ *
+ * Based on Paul Hsieh's (LGPL 2.1) hash function
+ * From: http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+#define get16bits(d) (*((const __u16 *) (d)))
+/*
+ * SuperFastHash() - hash @len bytes starting at @data, seeded with @initval.
+ *
+ * Returns 0 when @len <= 0 or @data is NULL. The input is consumed four
+ * bytes per loop iteration as two 16-bit loads, and the remaining 1-3
+ * bytes are mixed in by the switch below.
+ *
+ * NOTE(review): get16bits() is a plain __u16 pointer dereference, so the
+ * result presumably depends on host byte order and the loads may be
+ * unaligned - confirm this is acceptable for all callers.
+ */
+static __always_inline
+__u32 SuperFastHash (const char *data, int len, __u32 initval) {
+ __u32 hash = initval;
+ __u32 tmp;
+ int rem;
+
+ if (len <= 0 || data == NULL) return 0;
+
+ rem = len & 3; /* trailing bytes left after the 4-byte groups */
+ len >>= 2; /* from here on: number of 4-byte groups */
+
+ /* Main loop: mix one 4-byte group (two 16-bit halves) per iteration */
+#pragma clang loop unroll(full)
+ for (;len > 0; len--) {
+ hash += get16bits (data);
+ tmp = (get16bits (data+2) << 11) ^ hash;
+ hash = (hash << 16) ^ tmp;
+ data += 2*sizeof (__u16);
+ hash += hash >> 11;
+ }
+
+ /* Handle end cases: rem is 0..3, and 0 adds nothing */
+ switch (rem) {
+ case 3: hash += get16bits (data);
+ hash ^= hash << 16;
+ hash ^= ((signed char)data[sizeof (__u16)]) << 18; /* third byte, sign-extended */
+ hash += hash >> 11;
+ break;
+ case 2: hash += get16bits (data);
+ hash ^= hash << 11;
+ hash += hash >> 17;
+ break;
+ case 1: hash += (signed char)*data; /* single byte, sign-extended */
+ hash ^= hash << 10;
+ hash += hash >> 1;
+ }
+
+ /* Force "avalanching" of final 127 bits */
+ hash ^= hash << 3;
+ hash += hash >> 5;
+ hash ^= hash << 4;
+ hash += hash >> 17;
+ hash ^= hash << 25;
+ hash += hash >> 6;
+
+ return hash;
+}
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
new file mode 100644
index 000000000000..fc88d4dbdf48
--- /dev/null
+++ b/samples/bpf/hbm.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Example program for Host Bandwidth Management
+ *
+ * This program loads a cgroup skb BPF program to enforce cgroup output
+ * (egress) or input (ingress) bandwidth limits.
+ *
+ * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
+ * Where:
+ * -d Print BPF trace debug buffer
+ * -l Also limit flows doing loopback
+ * -n <#> To create cgroup "/hbm#" and attach prog
+ * Default is /hbm1
+ * --no_cn Do not return cn notifications
+ * -r <rate> Rate limit in Mbps
+ * -s Get HBM stats (marked, dropped, etc.)
+ * -t <time> Exit after specified seconds (default is 1)
+ * -w Work conserving flag. cgroup can increase its bandwidth
+ * beyond the rate limit specified while there is available
+ * bandwidth. Current implementation assumes there is only
+ * one NIC (eth0), but can be extended to support multiple NICs.
+ * Currently only supported for egress.
+ * -h Print this info
+ * prog BPF program file name. Name defaults to hbm_out_kern.o
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <linux/compiler.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <getopt.h>
+
+#include "cgroup_helpers.h"
+#include "hbm.h"
+#include "bpf_util.h"
+#include <bpf/libbpf.h>
+
+bool outFlag = true;
+int minRate = 1000; /* cgroup rate limit in Mbps */
+int rate = 1000; /* can grow if rate conserving is enabled */
+int dur = 1;
+bool stats_flag;
+bool loopback_flag;
+bool debugFlag;
+bool work_conserving_flag;
+bool no_cn_flag;
+bool edt_flag;
+
+static void Usage(void);
+static void read_trace_pipe2(void);
+static void do_error(char *msg, bool errno_flag);
+
+#define TRACEFS "/sys/kernel/tracing/"
+
+static struct bpf_program *bpf_prog;
+static struct bpf_object *obj;
+static int queue_stats_fd;
+
+/*
+ * Copy the kernel trace pipe to stdout and, when it can be created, to
+ * "hbm_out.log".
+ *
+ * The function blocks in read() and normally only ends when the process
+ * is terminated. A hard read() error (anything other than EINTR/EAGAIN)
+ * now ends the loop instead of spinning on the failing descriptor, and
+ * both the pipe and the log file are closed before returning.
+ */
+static void read_trace_pipe2(void)
+{
+	static char buf[4097];
+	char *outFname = "hbm_out.log";
+	int trace_fd;
+	FILE *outf;
+
+	trace_fd = open(TRACEFS "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0) {
+		printf("Error opening trace_pipe\n");
+		return;
+	}
+
+// Future support of ingress
+//	if (!outFlag)
+//		outFname = "hbm_in.log";
+	outf = fopen(outFname, "w");
+
+	/* Not fatal: the trace is still echoed to stdout. */
+	if (outf == NULL)
+		printf("Error creating %s\n", outFname);
+
+	while (1) {
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf) - 1);
+		if (sz < 0) {
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			printf("Error reading trace_pipe, errno: %d\n", errno);
+			break;
+		}
+		if (sz == 0)
+			continue;
+
+		/* read() left room for the terminator (sizeof(buf) - 1) */
+		buf[sz] = 0;
+		puts(buf);
+		if (outf != NULL) {
+			fprintf(outf, "%s\n", buf);
+			fflush(outf);
+		}
+	}
+
+	if (outf != NULL)
+		fclose(outf);
+	close(trace_fd);
+}
+
+/*
+ * Print "ERROR: <msg>" to stdout, followed by the current errno value
+ * when @errno_flag is set, then terminate the process with status 1.
+ */
+static void do_error(char *msg, bool errno_flag)
+{
+	if (!errno_flag)
+		printf("ERROR: %s\n", msg);
+	else
+		printf("ERROR: %s, errno: %d\n", msg, errno);
+
+	exit(1);
+}
+
+/*
+ * Open and load the BPF object file @prog, then remember in the
+ * file-level variables the program whose section is "cgroup_skb/egress"
+ * (bpf_prog) and the fd of the "queue_stats" map (queue_stats_fd).
+ *
+ * Return: 0 on success, 1 after printing an error message. The object is
+ * closed again on every failure that happens after it was opened.
+ */
+static int prog_load(char *prog)
+{
+	const char *failure;
+	struct bpf_program *cur;
+	const char *section;
+
+	obj = bpf_object__open_file(prog, NULL);
+	if (libbpf_get_error(obj)) {
+		printf("ERROR: opening BPF object file failed\n");
+		return 1;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		failure = "loading BPF object file failed";
+		goto err;
+	}
+
+	bpf_object__for_each_program(cur, obj) {
+		section = bpf_program__section_name(cur);
+		if (!section || strcmp(section, "cgroup_skb/egress"))
+			continue;
+		bpf_prog = cur;
+		break;
+	}
+	if (!bpf_prog) {
+		failure = "finding a prog in obj file failed";
+		goto err;
+	}
+
+	queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats");
+	if (queue_stats_fd < 0) {
+		failure = "finding a map in obj file failed";
+		goto err;
+	}
+
+	return 0;
+
+err:
+	printf("ERROR: %s\n", failure);
+	bpf_object__close(obj);
+	return 1;
+}
+
+#define DELTA_RATE_CHECK 10000		/* in us */
+#define RATE_THRESHOLD 9500000000	/* 9.5 Gbps */
+#define RET_VAL_COUNT 4
+
+/*
+ * Read the eth0 TX byte counter from sysfs. Any failure ends the program
+ * through do_error(); fopen() is now checked before the stream is used.
+ */
+static unsigned long long read_eth0_tx_bytes(void)
+{
+	unsigned long long tx_bytes;
+	FILE *fin;
+
+	fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
+	if (fin == NULL)
+		do_error("opening eth0 tx_bytes failed", true);
+	if (fscanf(fin, "%llu", &tx_bytes) != 1)
+		do_error("fscanf fails", false);
+	fclose(fin);
+	return tx_bytes;
+}
+
+/* Record in "hbm.<cg_id>.in|out" that the final stats lookup failed. */
+static void write_stats_lookup_error(int cg_id)
+{
+	char fname[100];
+	FILE *fout;
+
+	if (!outFlag)
+		sprintf(fname, "hbm.%d.in", cg_id);
+	else
+		sprintf(fname, "hbm.%d.out", cg_id);
+	fout = fopen(fname, "w");
+	if (fout == NULL) {
+		printf("ERROR: Could not create %s\n", fname);
+		return;
+	}
+	fprintf(fout, "id:%d\n", cg_id);
+	fprintf(fout, "ERROR: Could not lookup queue_stats\n");
+	fclose(fout);
+}
+
+/* Write the statistics in @qs for cgroup @cg_id to "hbm.<cg_id>.out". */
+static void write_queue_stats(const struct hbm_queue_stats *qs, int cg_id)
+{
+	static const char *returnValNames[] = {
+		"DROP_PKT",
+		"ALLOW_PKT",
+		"DROP_PKT_CWR",
+		"ALLOW_PKT_CWR"
+	};
+	long long delta_us = (qs->lastPacketTime - qs->firstPacketTime) / 1000;
+	unsigned int rate_mbps = 0;
+	double percent_pkts, percent_bytes;
+	char fname[100];
+	FILE *fout;
+	int k;
+
+	/* A packet span below 1us truncates to 0; do not divide by it. */
+	if (delta_us > 0)
+		rate_mbps = (qs->bytes_total - qs->bytes_dropped) * 8 /
+			    delta_us;
+
+// Future support of ingress
+//	if (!outFlag)
+//		sprintf(fname, "hbm.%d.in", cg_id);
+//	else
+	sprintf(fname, "hbm.%d.out", cg_id);
+	fout = fopen(fname, "w");
+	if (fout == NULL) {
+		printf("ERROR: Could not create %s\n", fname);
+		return;
+	}
+	fprintf(fout, "id:%d\n", cg_id);
+	fprintf(fout, "rate_mbps:%u\n", rate_mbps);
+	fprintf(fout, "duration:%.1f secs\n",
+		(qs->lastPacketTime - qs->firstPacketTime) /
+		1000000000.0);
+	fprintf(fout, "packets:%d\n", (int)qs->pkts_total);
+	fprintf(fout, "bytes_MB:%d\n", (int)(qs->bytes_total /
+					     1000000));
+	fprintf(fout, "pkts_dropped:%d\n", (int)qs->pkts_dropped);
+	fprintf(fout, "bytes_dropped_MB:%d\n",
+		(int)(qs->bytes_dropped /
+		      1000000));
+	// Marked Pkts and Bytes
+	percent_pkts = (qs->pkts_marked * 100.0) /
+		(qs->pkts_total + 1);
+	percent_bytes = (qs->bytes_marked * 100.0) /
+		(qs->bytes_total + 1);
+	fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
+	fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
+
+	// Dropped Pkts and Bytes
+	percent_pkts = (qs->pkts_dropped * 100.0) /
+		(qs->pkts_total + 1);
+	percent_bytes = (qs->bytes_dropped * 100.0) /
+		(qs->bytes_total + 1);
+	fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
+	fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
+
+	// ECN CE markings
+	percent_pkts = (qs->pkts_ecn_ce * 100.0) /
+		(qs->pkts_total + 1);
+	fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
+		(int)qs->pkts_ecn_ce);
+
+	// Average cwnd
+	fprintf(fout, "avg cwnd:%d\n",
+		(int)(qs->sum_cwnd / (qs->sum_cwnd_cnt + 1)));
+	// Average rtt
+	fprintf(fout, "avg rtt:%d\n",
+		(int)(qs->sum_rtt / (qs->pkts_total + 1)));
+	// Average credit
+	if (edt_flag)
+		fprintf(fout, "avg credit_ms:%.03f\n",
+			(qs->sum_credit /
+			 (qs->pkts_total + 1.0)) / 1000000.0);
+	else
+		fprintf(fout, "avg credit:%d\n",
+			(int)(qs->sum_credit /
+			      (1500 * ((int)qs->pkts_total) + 1)));
+
+	// Return values stats
+	for (k = 0; k < RET_VAL_COUNT; k++) {
+		percent_pkts = (qs->returnValCount[k] * 100.0) /
+			(qs->pkts_total + 1);
+		fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
+			percent_pkts, (int)qs->returnValCount[k]);
+	}
+	fclose(fout);
+}
+
+/*
+ * Load @prog, create and join cgroup "/hbm<cg_id>", attach the program
+ * to it, pin the link at /sys/fs/bpf/hbm<cg_id> and let it run for the
+ * global "dur" seconds. With the work conserving flag set, the cgroup
+ * rate limit is re-evaluated every DELTA_RATE_CHECK microseconds from
+ * the eth0 TX counter. When stats were requested they are written to a
+ * file afterwards, and with the debug flag the trace pipe is then
+ * copied out.
+ *
+ * Return: 0 on success, non-zero on failure (the cgroup environment is
+ * cleaned up in that case).
+ */
+static int run_bpf_prog(char *prog, int cg_id)
+{
+	struct hbm_queue_stats qstats = {0};
+	char cg_dir[100], cg_pin_path[100];
+	struct bpf_link *link = NULL;
+	int key = 0;
+	/*
+	 * Starts out invalid so that an early failure does not make the
+	 * cleanup code close fd 0 (stdin), as the old "= 0" did.
+	 */
+	int cg1 = -1;
+	int rc = 0;
+
+	sprintf(cg_dir, "/hbm%d", cg_id);
+	rc = prog_load(prog);
+	if (rc != 0)
+		return rc;
+
+	if (setup_cgroup_environment()) {
+		printf("ERROR: setting cgroup environment\n");
+		goto err;
+	}
+	cg1 = create_and_get_cgroup(cg_dir);
+	/*
+	 * NOTE(review): the helper's failure value is not visible here;
+	 * both 0 (what the old check tested) and negative values are
+	 * treated as failure.
+	 */
+	if (cg1 <= 0) {
+		printf("ERROR: create_and_get_cgroup\n");
+		goto err;
+	}
+	if (join_cgroup(cg_dir)) {
+		printf("ERROR: join_cgroup\n");
+		goto err;
+	}
+
+	qstats.rate = rate;
+	qstats.stats = stats_flag ? 1 : 0;
+	qstats.loopback = loopback_flag ? 1 : 0;
+	qstats.no_cn = no_cn_flag ? 1 : 0;
+	if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) {
+		printf("ERROR: Could not update map element\n");
+		goto err;
+	}
+
+	if (!outFlag)
+		bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS);
+
+	link = bpf_program__attach_cgroup(bpf_prog, cg1);
+	if (libbpf_get_error(link)) {
+		fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n");
+		goto err;
+	}
+
+	sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id);
+	rc = bpf_link__pin(link, cg_pin_path);
+	if (rc < 0) {
+		printf("ERROR: bpf_link__pin failed: %d\n", rc);
+		goto err;
+	}
+
+	if (work_conserving_flag) {
+		struct timeval t0, t_last, t_new;
+		unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
+		signed long long last_cg_tx_bytes, new_cg_tx_bytes;
+		signed long long delta_time, delta_bytes, delta_rate;
+		int delta_ms;
+
+		bpf_map_lookup_elem(queue_stats_fd, &key, &qstats);
+		if (gettimeofday(&t0, NULL) < 0)
+			do_error("gettimeofday failed", true);
+		t_last = t0;
+		last_eth_tx_bytes = read_eth0_tx_bytes();
+		last_cg_tx_bytes = qstats.bytes_total;
+		while (true) {
+			usleep(DELTA_RATE_CHECK);
+			if (gettimeofday(&t_new, NULL) < 0)
+				do_error("gettimeofday failed", true);
+			delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
+				(t_new.tv_usec - t0.tv_usec)/1000;
+			if (delta_ms > dur * 1000)
+				break;
+			delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
+				(t_new.tv_usec - t_last.tv_usec);
+			if (delta_time == 0)
+				continue;
+			t_last = t_new;
+			new_eth_tx_bytes = read_eth0_tx_bytes();
+			printf(" new_eth_tx_bytes:%llu\n",
+			       new_eth_tx_bytes);
+			bpf_map_lookup_elem(queue_stats_fd, &key, &qstats);
+			new_cg_tx_bytes = qstats.bytes_total;
+			delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
+			last_eth_tx_bytes = new_eth_tx_bytes;
+			/* bytes per us -> bits per second */
+			delta_rate = (delta_bytes * 8000000) / delta_time;
+			printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
+			       delta_ms, delta_rate/1000000000.0,
+			       rate/1000.0);
+			if (delta_rate < RATE_THRESHOLD) {
+				/* can increase cgroup rate limit, but first
+				 * check if we are using the current limit.
+				 * Currently increasing by 6.25%, unknown
+				 * if that is the optimal rate.
+				 */
+				int rate_diff100;
+
+				delta_bytes = new_cg_tx_bytes -
+					last_cg_tx_bytes;
+				last_cg_tx_bytes = new_cg_tx_bytes;
+				delta_rate = (delta_bytes * 8000000) /
+					delta_time;
+				printf(" rate:%.3fGbps",
+				       delta_rate/1000000000.0);
+				rate_diff100 = (((long long)rate)*1000000 -
+						delta_rate) * 100 /
+					(((long long) rate) * 1000000);
+				printf(" rdiff:%d", rate_diff100);
+				if (rate_diff100 <= 3) {
+					rate += (rate >> 4);
+					if (rate > RATE_THRESHOLD / 1000000)
+						rate = RATE_THRESHOLD / 1000000;
+					qstats.rate = rate;
+					printf(" INC\n");
+				} else {
+					printf("\n");
+				}
+			} else {
+				/* Need to decrease cgroup rate limit.
+				 * Currently decreasing by 12.5%, unknown
+				 * if that is optimal
+				 */
+				printf(" DEC\n");
+				rate -= (rate >> 3);
+				if (rate < minRate)
+					rate = minRate;
+				qstats.rate = rate;
+			}
+			if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY))
+				do_error("update map element fails", false);
+		}
+	} else {
+		sleep(dur);
+	}
+	// Get stats!
+	if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats))
+		write_stats_lookup_error(cg_id);
+	else if (stats_flag && qstats.lastPacketTime >
+		 qstats.firstPacketTime)
+		write_queue_stats(&qstats, cg_id);
+
+	if (debugFlag)
+		read_trace_pipe2();
+	goto cleanup;
+
+err:
+	rc = 1;
+
+cleanup:
+	bpf_link__destroy(link);
+	bpf_object__close(obj);
+
+	if (cg1 > 0)
+		close(cg1);
+
+	if (rc != 0)
+		cleanup_cgroup_environment();
+	return rc;
+}
+
+/*
+ * Print the command line help to stdout. The documented -t default now
+ * matches the initial value of the global "dur" (1 second).
+ */
+static void Usage(void)
+{
+	printf("This program loads a cgroup skb BPF program to enforce\n"
+	       "cgroup output (egress) bandwidth limits.\n\n"
+	       "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
+	       " [-s] [-t <secs>] [-w] [-h] [prog]\n"
+	       " Where:\n"
+	       " -o indicates egress direction (default)\n"
+	       " -d print BPF trace debug buffer\n"
+	       " --edt use fq's Earliest Departure Time\n"
+	       " -l also limit flows using loopback\n"
+	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
+	       " Default is /hbm1\n"
+	       " --no_cn disable CN notifications\n"
+	       " -r <rate> Rate in Mbps\n"
+	       " -s Update HBM stats\n"
+	       " -t <time> Exit after specified seconds (default is 1)\n"
+	       " -w Work conserving flag. cgroup can increase\n"
+	       " bandwidth beyond the rate limit specified\n"
+	       " while there is available bandwidth. Current\n"
+	       " implementation assumes there is only eth0\n"
+	       " but can be extended to support multiple NICs\n"
+	       " -h print this info\n"
+	       " prog BPF program file name. Name defaults to\n"
+	       " hbm_out_kern.o\n");
+}
+
+/*
+ * Parse the command line into the file-level option variables and hand
+ * over to run_bpf_prog(). A first non-option argument replaces the
+ * default program file name. -h, an unknown option or a missing option
+ * argument prints the usage text and returns 0.
+ */
+int main(int argc, char **argv)
+{
+	static const struct option long_opts[] = {
+		{"no_cn", 0, NULL, 1},
+		{"edt", 0, NULL, 2},
+		{NULL, 0, NULL, 0}
+	};
+	const char *short_opts = "iodln:r:st:wh";
+	char *prog = "hbm_out_kern.o";
+	int cg_id = 1;
+	int opt;
+
+	for (;;) {
+		opt = getopt_long(argc, argv, short_opts, long_opts, NULL);
+		if (opt == -1)
+			break;
+
+		switch (opt) {
+		case 1:		/* --no_cn */
+			no_cn_flag = true;
+			break;
+		case 2:		/* --edt */
+			prog = "hbm_edt_kern.o";
+			edt_flag = true;
+			break;
+		case 'o':
+			break;
+		case 'd':
+			debugFlag = true;
+			break;
+		case 'l':
+			loopback_flag = true;
+			break;
+		case 'n':
+			cg_id = atoi(optarg);
+			break;
+		case 'r':
+			minRate = atoi(optarg) * 1.024;
+			rate = minRate;
+			break;
+		case 's':
+			stats_flag = true;
+			break;
+		case 't':
+			dur = atoi(optarg);
+			break;
+		case 'w':
+			work_conserving_flag = true;
+			break;
+		case '?':
+			if (optopt == 'n' || optopt == 'r' || optopt == 't')
+				fprintf(stderr,
+					"Option -%c requires an argument.\n\n",
+					optopt);
+			/* fall through */
+		case 'h':
+		default:
+			Usage();
+			return 0;
+		}
+	}
+
+	if (optind < argc)
+		prog = argv[optind];
+	printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
+
+	/* Use libbpf 1.0 API mode */
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+	return run_bpf_prog(prog, cg_id);
+}
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
new file mode 100644
index 000000000000..f0963ed6a562
--- /dev/null
+++ b/samples/bpf/hbm.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Include file for Host Bandwidth Management (HBM) programs
+ */
+struct hbm_vqueue { /* virtual queue state; presumably guarded by @lock on the BPF side - confirm in hbm_kern.h */
+ struct bpf_spin_lock lock;
+ /* 4 byte hole */
+ unsigned long long lasttime; /* In ns */
+ int credit; /* In bytes */
+ unsigned int rate; /* In bytes per NS << 20 */
+};
+
+struct hbm_queue_stats { /* value of the "queue_stats" map: settings written by hbm.c plus counters it reads back */
+ unsigned long rate; /* in Mbps*/
+ unsigned long stats:1, /* get HBM stats (marked, dropped,..) */
+ loopback:1, /* also limit flows using loopback */
+ no_cn:1; /* do not use cn flags */
+ unsigned long long pkts_marked;
+ unsigned long long bytes_marked;
+ unsigned long long pkts_dropped;
+ unsigned long long bytes_dropped;
+ unsigned long long pkts_total;
+ unsigned long long bytes_total;
+ unsigned long long firstPacketTime; /* hbm.c treats the last-first difference as ns */
+ unsigned long long lastPacketTime;
+ unsigned long long pkts_ecn_ce;
+ unsigned long long returnValCount[4]; /* hbm.c labels the slots DROP_PKT, ALLOW_PKT, DROP_PKT_CWR, ALLOW_PKT_CWR */
+ unsigned long long sum_cwnd;
+ unsigned long long sum_rtt;
+ unsigned long long sum_cwnd_cnt; /* divisor hbm.c uses for the average cwnd */
+ long long sum_credit;
+};
diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c
new file mode 100644
index 000000000000..6294f1d716c0
--- /dev/null
+++ b/samples/bpf/hbm_edt_kern.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Host Bandwidth Manager (HBM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it). However, queueing may
+ * occur at the actual qdisc (which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ * CREDIT
+ * - <--------------------------|------------------------> +
+ * | | | 0
+ * | Large pkt |
+ * | drop thresh |
+ * Small pkt drop Mark threshold
+ * thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ * is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ * to reduce the congestion window. The current implementation uses a linear
+ * distribution (0% probability at marking threshold, 100% probability
+ * at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr since packets dropped by
+ * a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (> 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overwritten by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * a user program, such as hbm, can access.
+ */
+
+#include "hbm_kern.h"
+
+/*
+ * cgroup skb egress program that paces a cgroup with a time-based virtual
+ * queue: every packet is given a departure time (skb->tstamp) taken from
+ * qdp->lasttime, which is then advanced by the time the packet occupies at
+ * the configured rate. How far lasttime lies ahead of "now" (delta) decides
+ * whether the packet is marked, triggers CWR, or is dropped.
+ *
+ * Returns ALLOW_PKT or DROP_PKT, optionally ORed with CWR.
+ */
+SEC("cgroup_skb/egress")
+int _hbm_out_cg(struct __sk_buff *skb)
+{
+ long long delta = 0, delta_send;
+ unsigned long long curtime, sendtime;
+ struct hbm_queue_stats *qsp = NULL;
+ unsigned int queue_index = 0;
+ bool congestion_flag = false;
+ bool ecn_ce_flag = false;
+ struct hbm_pkt_info pkti = {};
+ struct hbm_vqueue *qdp;
+ bool drop_flag = false;
+ bool cwr_flag = false;
+ int len = skb->len;
+ int rv = ALLOW_PKT;
+
+ // qsp may be NULL; every later use is guarded
+ qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+
+ // Check if we should ignore loopback traffic
+ if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+ return ALLOW_PKT;
+
+ hbm_get_pkt_info(skb, &pkti);
+
+ // We may want to account for the length of headers in len
+ // calculation, like ETH header + overhead, especially if it
+ // is a gso packet. But I am not doing it right now.
+
+ qdp = bpf_get_local_storage(&queue_state, 0);
+ if (!qdp)
+ return ALLOW_PKT;
+ // lasttime == 0 is treated as "queue not initialized yet"
+ if (qdp->lasttime == 0)
+ hbm_init_edt_vqueue(qdp, 1024);
+
+ curtime = bpf_ktime_get_ns();
+
+ // Begin critical section
+ bpf_spin_lock(&qdp->lock);
+ // delta > 0: the queue's next send time is in the future (backlog)
+ delta = qdp->lasttime - curtime;
+ // bound bursts to 100us
+ if (delta < -BURST_SIZE_NS) {
+ // negative delta is a credit that allows bursts
+ qdp->lasttime = curtime - BURST_SIZE_NS;
+ delta = -BURST_SIZE_NS;
+ }
+ sendtime = qdp->lasttime;
+ delta_send = BYTES_TO_NS(len, qdp->rate);
+ // Reserve this packet's transmission time in the virtual queue
+ __sync_add_and_fetch(&(qdp->lasttime), delta_send);
+ bpf_spin_unlock(&qdp->lock);
+ // End critical section
+
+ // Set EDT of packet
+ skb->tstamp = sendtime;
+
+ // Check if we should update rate
+ if (qsp != NULL && (qsp->rate * 128) != qdp->rate)
+ qdp->rate = qsp->rate * 128;
+
+ // Set flags (drop, congestion, cwr)
+ // last packet will be sent in the future, bound latency
+ if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS &&
+ len > LARGE_PKT_THRESH)) {
+ drop_flag = true;
+ if (pkti.is_tcp && pkti.ecn == 0)
+ cwr_flag = true;
+ } else if (delta > MARK_THRESH_NS) {
+ if (pkti.is_tcp)
+ congestion_flag = true;
+ else
+ drop_flag = true;
+ }
+
+ if (congestion_flag) {
+ if (bpf_skb_ecn_set_ce(skb)) {
+ ecn_ce_flag = true;
+ } else {
+ if (pkti.is_tcp) {
+ unsigned int rand = bpf_get_prandom_u32();
+
+ // CWR probability grows with delta across the mark region
+ if (delta >= MARK_THRESH_NS +
+ (rand % MARK_REGION_SIZE_NS)) {
+ // Do congestion control
+ cwr_flag = true;
+ }
+ } else if (len > LARGE_PKT_THRESH) {
+ // Problem if too many small packets?
+ drop_flag = true;
+ congestion_flag = false;
+ }
+ }
+ }
+
+ // With at most one packet out, request CWR instead of dropping
+ if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) {
+ drop_flag = false;
+ cwr_flag = true;
+ congestion_flag = false;
+ }
+
+ if (qsp != NULL && qsp->no_cn)
+ cwr_flag = false;
+
+ hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
+ cwr_flag, ecn_ce_flag, &pkti, (int) delta);
+
+ if (drop_flag) {
+ // Give back the time reserved above; the packet will not be sent
+ __sync_add_and_fetch(&(qdp->lasttime), -delta_send);
+ rv = DROP_PKT;
+ }
+
+ if (cwr_flag)
+ rv |= CWR;
+ return rv;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
new file mode 100644
index 000000000000..1752a46a2b05
--- /dev/null
+++ b/samples/bpf/hbm_kern.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Include file for sample Host Bandwidth Manager (HBM) BPF programs
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/filter.h>
+#include <uapi/linux/pkt_cls.h>
+#include <net/ipv6.h>
+#include <net/inet_ecn.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+#include "hbm.h"
+
+#define DROP_PKT 0
+#define ALLOW_PKT 1
+#define TCP_ECN_OK 1
+#define CWR 2
+
+#ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging
+#undef bpf_printk
+#define bpf_printk(fmt, ...)
+#endif
+
+#define INITIAL_CREDIT_PACKETS 100
+#define MAX_BYTES_PER_PACKET 1500
+#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET)
+#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET)
+#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET))
+#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH)
+#define LARGE_PKT_THRESH 120
+#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET)
+#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)
+
+// Time base accounting for fq's EDT
+#define BURST_SIZE_NS 100000 // 100us
+#define MARK_THRESH_NS 50000 // 50us
+#define DROP_THRESH_NS 500000 // 500us
+// Reserve 20us of queuing for small packets (less than 120 bytes)
+#define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000)
+#define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS)
+
+// rate in bytes per ns << 20
+#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+#define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+#define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate))
+
+/* Cgroup-local storage holding the virtual queue (struct hbm_vqueue). */
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+ __type(key, struct bpf_cgroup_storage_key);
+ __type(value, struct hbm_vqueue);
+} queue_state SEC(".maps");
+
+/* Single-entry array (index 0) holding configuration and statistics. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, u32);
+ __type(value, struct hbm_queue_stats);
+} queue_stats SEC(".maps");
+
+/* Per-packet facts gathered by hbm_get_pkt_info() and get_tcp_info(). */
+struct hbm_pkt_info {
+ int cwnd; /* tp->snd_cwnd, 0 when no TCP socket info is available */
+ int rtt; /* tp->srtt_us >> 3, 0 when no TCP socket info is available */
+ int packets_out; /* tp->packets_out */
+ bool is_ip; /* IP version field is 4 or 6 */
+ bool is_tcp; /* IP protocol / next header is TCP */
+ short ecn; /* ECN bits taken from the IP header */
+};
+
+/*
+ * Copy cwnd, smoothed RTT and packets_out from the packet's TCP socket
+ * into *pkti.
+ *
+ * Returns 0 when the skb has a full TCP socket whose bpf_tcp_sock could be
+ * obtained; otherwise zeroes the three fields and returns 1.
+ */
+static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti)
+{
+	struct bpf_tcp_sock *tcp_sk;
+	struct bpf_sock *sock;
+
+	sock = skb->sk;
+	if (!sock)
+		goto no_tcp_info;
+
+	sock = bpf_sk_fullsock(sock);
+	if (!sock)
+		goto no_tcp_info;
+
+	if (sock->protocol != IPPROTO_TCP)
+		goto no_tcp_info;
+
+	tcp_sk = bpf_tcp_sock(sock);
+	if (!tcp_sk)
+		goto no_tcp_info;
+
+	pkti->cwnd = tcp_sk->snd_cwnd;
+	pkti->rtt = tcp_sk->srtt_us >> 3;
+	pkti->packets_out = tcp_sk->packets_out;
+	return 0;
+
+no_tcp_info:
+	pkti->cwnd = 0;
+	pkti->rtt = 0;
+	pkti->packets_out = 0;
+	return 1;
+}
+
+/*
+ * Fill *pkti from the packet's IP header and, for TCP, from its socket.
+ *
+ * Only the first 12 bytes of the network header are read; the IPv6 fields
+ * used below (flow_lbl[0], nexthdr) and the IPv4 fields (tos, protocol) all
+ * lie inside that prefix. Every field of *pkti is written on every path.
+ */
+static void hbm_get_pkt_info(struct __sk_buff *skb,
+			     struct hbm_pkt_info *pkti)
+{
+	struct iphdr iph;
+	struct ipv6hdr *ip6h;
+
+	pkti->cwnd = 0;
+	pkti->rtt = 0;
+	/* Callers may pass an uninitialized struct; keep it fully defined */
+	pkti->packets_out = 0;
+	bpf_skb_load_bytes(skb, 0, &iph, 12);
+	if (iph.version == 6) {
+		ip6h = (struct ipv6hdr *)&iph;
+		pkti->is_ip = true;
+		pkti->is_tcp = (ip6h->nexthdr == IPPROTO_TCP);
+		pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK;
+	} else if (iph.version == 4) {
+		pkti->is_ip = true;
+		pkti->is_tcp = (iph.protocol == IPPROTO_TCP);
+		pkti->ecn = iph.tos & INET_ECN_MASK;
+	} else {
+		pkti->is_ip = false;
+		pkti->is_tcp = false;
+		pkti->ecn = 0;
+	}
+	if (pkti->is_tcp)
+		get_tcp_info(skb, pkti);
+}
+
+/*
+ * Initialize a credit-based virtual queue: start the clock now, grant the
+ * initial credit and store the rate scaled by 128.
+ */
+static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
+{
+	int scaled_rate = rate * 128;
+
+	bpf_printk("Initializing queue_state, rate:%d\n", scaled_rate);
+	qdp->lasttime = bpf_ktime_get_ns();
+	qdp->credit = INIT_CREDIT;
+	qdp->rate = scaled_rate;
+}
+
+/*
+ * Initialize a time-based (EDT) virtual queue. lasttime starts
+ * BURST_SIZE_NS in the past so that an initial burst is allowed; the
+ * credit field is unused in this mode.
+ */
+static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp,
+						int rate)
+{
+	unsigned long long now = bpf_ktime_get_ns();
+	int scaled_rate = rate * 128;
+
+	bpf_printk("Initializing queue_state, rate:%d\n", scaled_rate);
+	qdp->lasttime = now - BURST_SIZE_NS;	// support initial burst
+	qdp->credit = 0;			// not used
+	qdp->rate = scaled_rate;
+}
+
+/*
+ * Account one packet in *qsp.
+ *
+ * bytes_total is always updated (it is needed for work conserving); all
+ * other counters are only touched when qsp->stats is set. Does nothing when
+ * qsp is NULL.
+ *
+ * @qsp:             statistics entry, may be NULL
+ * @len:             packet length in bytes
+ * @curtime:         timestamp recorded as first/last packet time
+ * @congestion_flag: packet was marked as congested
+ * @drop_flag:       packet will be dropped
+ * @cwr_flag:        CWR will be requested
+ * @ecn_ce_flag:     ECN CE was set on the packet
+ * @pkti:            packet info (cwnd and rtt are accumulated when non-zero)
+ * @credit:          value added to sum_credit
+ */
+static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
+					     int len,
+					     unsigned long long curtime,
+					     bool congestion_flag,
+					     bool drop_flag,
+					     bool cwr_flag,
+					     bool ecn_ce_flag,
+					     struct hbm_pkt_info *pkti,
+					     int credit)
+{
+	int rv = ALLOW_PKT;
+
+	if (qsp == NULL)
+		return;
+
+	// Following is needed for work conserving
+	__sync_add_and_fetch(&(qsp->bytes_total), len);
+	if (!qsp->stats)
+		return;
+
+	// Optionally update statistics
+	if (qsp->firstPacketTime == 0)
+		qsp->firstPacketTime = curtime;
+	qsp->lastPacketTime = curtime;
+	__sync_add_and_fetch(&(qsp->pkts_total), 1);
+	if (congestion_flag) {
+		__sync_add_and_fetch(&(qsp->pkts_marked), 1);
+		__sync_add_and_fetch(&(qsp->bytes_marked), len);
+	}
+	if (drop_flag) {
+		__sync_add_and_fetch(&(qsp->pkts_dropped), 1);
+		__sync_add_and_fetch(&(qsp->bytes_dropped), len);
+	}
+	if (ecn_ce_flag)
+		__sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1);
+	if (pkti->cwnd) {
+		__sync_add_and_fetch(&(qsp->sum_cwnd), pkti->cwnd);
+		__sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1);
+	}
+	if (pkti->rtt)
+		__sync_add_and_fetch(&(qsp->sum_rtt), pkti->rtt);
+	__sync_add_and_fetch(&(qsp->sum_credit), credit);
+
+	// Rebuild the value the program is about to return and count it
+	if (drop_flag)
+		rv = DROP_PKT;
+	if (cwr_flag)
+		rv |= CWR;
+	if (rv == DROP_PKT)
+		__sync_add_and_fetch(&(qsp->returnValCount[0]), 1);
+	else if (rv == ALLOW_PKT)
+		__sync_add_and_fetch(&(qsp->returnValCount[1]), 1);
+	else if (rv == (DROP_PKT | CWR))
+		__sync_add_and_fetch(&(qsp->returnValCount[2]), 1);
+	else if (rv == (ALLOW_PKT | CWR))
+		__sync_add_and_fetch(&(qsp->returnValCount[3]), 1);
+}
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
new file mode 100644
index 000000000000..829934bd43cb
--- /dev/null
+++ b/samples/bpf/hbm_out_kern.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Host Bandwidth Manager (HBM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it). However, queueing may
+ * occur at the actual qdisc (which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ * CREDIT
+ * - <--------------------------|------------------------> +
+ * | | | 0
+ * | Large pkt |
+ * | drop thresh |
+ * Small pkt drop Mark threshold
+ * thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ * is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ * to reduce the congestion window. The current implementation uses a linear
+ * distribution (0% probability at marking threshold, 100% probability
+ * at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr since packets dropped
+ * by a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (> 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overwritten by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * the user program can access.
+ *
+ * A later patch provides such a program (hbm.c)
+ */
+
+#include "hbm_kern.h"
+
+/*
+ * cgroup skb egress program that limits a cgroup's bandwidth with a
+ * credit-based virtual queue. Credit is replenished from elapsed time at
+ * the configured rate (capped at MAX_CREDIT) and reduced by each packet's
+ * length; how negative the credit is decides whether the packet is marked,
+ * triggers CWR, or is dropped.
+ *
+ * Returns ALLOW_PKT or DROP_PKT, optionally ORed with CWR.
+ */
+SEC("cgroup_skb/egress")
+int _hbm_out_cg(struct __sk_buff *skb)
+{
+	struct hbm_pkt_info pkti = {};
+	int len = skb->len;
+	unsigned int queue_index = 0;
+	unsigned long long curtime;
+	int credit;
+	signed long long delta = 0, new_credit;
+	bool congestion_flag = false;
+	bool drop_flag = false;
+	bool cwr_flag = false;
+	bool ecn_ce_flag = false;
+	struct hbm_vqueue *qdp;
+	struct hbm_queue_stats *qsp = NULL;
+	int rv = ALLOW_PKT;
+
+	// qsp may be NULL; every later use is guarded
+	qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+
+	// Check if we should ignore loopback traffic
+	if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+		return ALLOW_PKT;
+
+	hbm_get_pkt_info(skb, &pkti);
+
+	// We may want to account for the length of headers in len
+	// calculation, like ETH header + overhead, especially if it
+	// is a gso packet. But I am not doing it right now.
+
+	qdp = bpf_get_local_storage(&queue_state, 0);
+	if (!qdp)
+		return ALLOW_PKT;
+	// lasttime == 0 is treated as "queue not initialized yet"
+	if (qdp->lasttime == 0)
+		hbm_init_vqueue(qdp, 1024);
+
+	curtime = bpf_ktime_get_ns();
+
+	// Begin critical section
+	bpf_spin_lock(&qdp->lock);
+	credit = qdp->credit;
+	delta = curtime - qdp->lasttime;
+	/* delta < 0 implies that another process with a curtime greater
+	 * than ours beat us to the critical section and already added
+	 * the new credit, so we should not add it ourselves
+	 */
+	if (delta > 0) {
+		qdp->lasttime = curtime;
+		new_credit = credit + CREDIT_PER_NS(delta, qdp->rate);
+		if (new_credit > MAX_CREDIT)
+			credit = MAX_CREDIT;
+		else
+			credit = new_credit;
+	}
+	credit -= len;
+	qdp->credit = credit;
+	bpf_spin_unlock(&qdp->lock);
+	// End critical section
+
+	// Check if we should update rate
+	if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
+		qdp->rate = qsp->rate * 128;
+		bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
+			   (int)qdp->rate,
+			   CREDIT_PER_NS(1000000000, qdp->rate) * 8);
+	}
+
+	// Set flags (drop, congestion, cwr)
+	// Dropping => we are congested, so ignore congestion flag
+	if (credit < -DROP_THRESH ||
+	    (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) {
+		// Very congested, set drop packet
+		drop_flag = true;
+		if (pkti.ecn)
+			congestion_flag = true;
+		else if (pkti.is_tcp)
+			cwr_flag = true;
+	} else if (credit < 0) {
+		// Congested, set congestion flag
+		if (pkti.ecn || pkti.is_tcp)
+			congestion_flag = (credit < -MARK_THRESH);
+		else
+			congestion_flag = true;
+	}
+
+	if (congestion_flag) {
+		if (bpf_skb_ecn_set_ce(skb)) {
+			ecn_ce_flag = true;
+		} else {
+			if (pkti.is_tcp) {
+				unsigned int rand = bpf_get_prandom_u32();
+
+				// CWR probability grows across the mark region
+				if (-credit >= MARK_THRESH +
+				    (rand % MARK_REGION_SIZE)) {
+					// Do congestion control
+					cwr_flag = true;
+				}
+			} else if (len > LARGE_PKT_THRESH) {
+				// Problem if too many small packets?
+				drop_flag = true;
+			}
+		}
+	}
+
+	if (qsp != NULL && qsp->no_cn)
+		cwr_flag = false;
+
+	hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
+			 cwr_flag, ecn_ce_flag, &pkti, credit);
+
+	if (drop_flag) {
+		// Refund the credit charged above; the packet will not be sent
+		__sync_add_and_fetch(&(qdp->credit), len);
+		rv = DROP_PKT;
+	}
+
+	if (cwr_flag)
+		rv |= CWR;
+	return rv;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c
new file mode 100644
index 000000000000..f07474c72525
--- /dev/null
+++ b/samples/bpf/ibumad_kern.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+
+/*
+ * ibumad BPF sample kernel side
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Copyright(c) 2018 Ira Weiny, Intel Corporation
+ */
+
+#define KBUILD_MODNAME "ibumad_count_pkts_by_class"
+#include <uapi/linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+
+/* Per-class counters bumped by the two ib_umad read tracepoint handlers. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32); /* class; u32 required */
+ __type(value, u64); /* count of mads read */
+ __uint(max_entries, 256); /* Room for all Classes */
+} read_count SEC(".maps");
+
+/* Per-class counters bumped by the ib_umad write tracepoint handler. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32); /* class; u32 required */
+ __type(value, u64); /* count of mads written */
+ __uint(max_entries, 256); /* Room for all Classes */
+} write_count SEC(".maps");
+
+#undef DEBUG
+#ifndef DEBUG
+#undef bpf_printk
+#define bpf_printk(fmt, ...)
+#endif
+
+/* Taken from the current format defined in
+ * include/trace/events/ib_umad.h
+ * and
+ * /sys/kernel/tracing/events/ib_umad/ib_umad_read/format
+ * /sys/kernel/tracing/events/ib_umad/ib_umad_write/format
+ */
+/*
+ * Context layout handed to the ib_umad tracepoint programs. Only
+ * mgmt_class is read by the handlers in this file.
+ * NOTE(review): "retires" looks like a typo for "retries" - confirm against
+ * the tracepoint format before renaming.
+ */
+struct ib_umad_rw_args {
+ u64 pad;
+ u8 port_num;
+ u8 sl;
+ u8 path_bits;
+ u8 grh_present;
+ u32 id;
+ u32 status;
+ u32 timeout_ms;
+ u32 retires;
+ u32 length;
+ u32 qpn;
+ u32 qkey;
+ u8 gid_index;
+ u8 hop_limit;
+ u16 lid;
+ u16 attr_id;
+ u16 pkey_index;
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ u32 flow_label;
+ u16 mad_status;
+ u16 class_specific;
+ u32 attr_mod;
+ u64 tid;
+ u8 gid[16];
+ u32 dev_index;
+ u8 traffic_class;
+};
+
+/*
+ * Tracepoint handler for ib_umad_read_recv: increments the read_count
+ * entry selected by the MAD's management class. Always returns 0.
+ */
+SEC("tracepoint/ib_umad/ib_umad_read_recv")
+int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx)
+{
+	u64 zero = 0, *val;
+	/* The map key is a u32, so the key object must be 4 bytes wide */
+	u32 class = ctx->mgmt_class;
+
+	bpf_printk("ib_umad read recv : class 0x%x\n", class);
+
+	val = bpf_map_lookup_elem(&read_count, &class);
+	if (!val) {
+		bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST);
+		val = bpf_map_lookup_elem(&read_count, &class);
+		if (!val)
+			return 0;
+	}
+
+	(*val) += 1;
+
+	return 0;
+}
+/*
+ * Tracepoint handler for ib_umad_read_send: increments the read_count
+ * entry selected by the MAD's management class. Always returns 0.
+ */
+SEC("tracepoint/ib_umad/ib_umad_read_send")
+int on_ib_umad_read_send(struct ib_umad_rw_args *ctx)
+{
+	u64 zero = 0, *val;
+	/* The map key is a u32, so the key object must be 4 bytes wide */
+	u32 class = ctx->mgmt_class;
+
+	bpf_printk("ib_umad read send : class 0x%x\n", class);
+
+	val = bpf_map_lookup_elem(&read_count, &class);
+	if (!val) {
+		bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST);
+		val = bpf_map_lookup_elem(&read_count, &class);
+		if (!val)
+			return 0;
+	}
+
+	(*val) += 1;
+
+	return 0;
+}
+/*
+ * Tracepoint handler for ib_umad_write: increments the write_count entry
+ * selected by the MAD's management class. Always returns 0.
+ */
+SEC("tracepoint/ib_umad/ib_umad_write")
+int on_ib_umad_write(struct ib_umad_rw_args *ctx)
+{
+	u64 zero = 0, *val;
+	/* The map key is a u32, so the key object must be 4 bytes wide */
+	u32 class = ctx->mgmt_class;
+
+	bpf_printk("ib_umad write : class 0x%x\n", class);
+
+	val = bpf_map_lookup_elem(&write_count, &class);
+	if (!val) {
+		bpf_map_update_elem(&write_count, &class, &zero, BPF_NOEXIST);
+		val = bpf_map_lookup_elem(&write_count, &class);
+		if (!val)
+			return 0;
+	}
+
+	(*val) += 1;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c
new file mode 100644
index 000000000000..d074c978aac7
--- /dev/null
+++ b/samples/bpf/ibumad_user.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+
+/*
+ * ibumad BPF sample user side
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Copyright(c) 2018 Ira Weiny, Intel Corporation
+ */
+
+#include <linux/bpf.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <limits.h>
+
+#include <getopt.h>
+#include <net/if.h>
+
+#include <bpf/bpf.h>
+#include "bpf_util.h"
+#include <bpf/libbpf.h>
+
+static struct bpf_link *tp_links[3];
+static struct bpf_object *obj;
+static int map_fd[2];
+static int tp_cnt;
+
+/*
+ * Print every non-zero counter of the 256-entry map behind @fd as
+ * "0x<class> : <count>"; keys that cannot be read are reported and skipped.
+ */
+static void dump_counts(int fd)
+{
+	__u64 count;
+	__u32 class;
+
+	for (class = 0; class < 256; class++) {
+		if (bpf_map_lookup_elem(fd, &class, &count) != 0)
+			printf("failed to read key %u\n", class);
+		else if (count != 0)
+			printf("0x%02x : %llu\n", class, count);
+	}
+}
+
+/* Print the read counters (map_fd[0]) followed by the write counters. */
+static void dump_all_counts(void)
+{
+	static const char * const titles[2] = {
+		"Read 'Class : count'\n",
+		"Write 'Class : count'\n",
+	};
+	int m;
+
+	for (m = 0; m < 2; m++) {
+		fputs(titles[m], stdout);
+		dump_counts(map_fd[m]);
+	}
+}
+
+/*
+ * Signal handler: print a final dump, destroy the attached links in
+ * reverse order, close the BPF object and exit with status 0.
+ */
+static void dump_exit(int sig)
+{
+	int idx;
+
+	dump_all_counts();
+
+	/* Detach tracepoints */
+	for (idx = tp_cnt - 1; idx >= 0; idx--)
+		bpf_link__destroy(tp_links[idx]);
+	tp_cnt = 0;
+
+	bpf_object__close(obj);
+	exit(0);
+}
+
+/* Long options accepted by main(); mirrors the short options -h and -d. */
+static const struct option long_options[] = {
+	{"help", no_argument, NULL, 'h'},
+	{"delay", required_argument, NULL, 'd'},
+	/* getopt_long() requires the array to end with an all-zero element */
+	{NULL, 0, NULL, 0},
+};
+
+/*
+ * Print the command line help for @cmd on stdout.
+ *
+ * The description used to talk about "packets from various IP addresses";
+ * this tool counts ib_umad MADs per management class.
+ */
+static void usage(char *cmd)
+{
+	printf("eBPF test program to count ib_umad MADs read and written by class\n"
+		"Usage: %s <options>\n"
+		" --help, -h this menu\n"
+		" --delay, -d <delay> wait <delay> sec between prints [1 - 1000000]\n"
+		, cmd
+		);
+}
+
+/*
+ * Load "<argv[0]>_kern.o", attach every program in it and print the
+ * read/write counters every <delay> seconds until interrupted.
+ *
+ * Options: -h/--help, -d/--delay <seconds> (1..1000000, default 5).
+ * Returns 1 on bad usage and -1 when the BPF object cannot be set up;
+ * normal termination happens in dump_exit() on SIGINT/SIGTERM.
+ */
+int main(int argc, char **argv)
+{
+	const int max_links = sizeof(tp_links) / sizeof(tp_links[0]);
+	struct bpf_program *prog;
+	unsigned long delay = 5;
+	char filename[256];
+	int longindex = 0;
+	int opt, err = -1;
+	char *end;
+
+	while ((opt = getopt_long(argc, argv, "hd:rSw",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'd':
+			delay = strtoul(optarg, &end, 0);
+			/*
+			 * Reject empty/non-numeric input and trailing junk.
+			 * Overflow (ULONG_MAX) and negative input (which
+			 * strtoul wraps to a huge value) fail the range test.
+			 */
+			if (end == optarg || *end != '\0' ||
+			    delay < 1 || delay > 1000000) {
+				fprintf(stderr, "ERROR: invalid delay : %s\n",
+					optarg);
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+		default:
+		case 'h':
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	/* Do one final dump when exiting */
+	signal(SIGINT, dump_exit);
+	signal(SIGTERM, dump_exit);
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return err;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd[0] = bpf_object__find_map_fd_by_name(obj, "read_count");
+	map_fd[1] = bpf_object__find_map_fd_by_name(obj, "write_count");
+	if (map_fd[0] < 0 || map_fd[1] < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		/* Never write past tp_links[] if the object grows */
+		if (tp_cnt >= max_links) {
+			fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+			goto cleanup;
+		}
+		tp_links[tp_cnt] = bpf_program__attach(prog);
+		if (libbpf_get_error(tp_links[tp_cnt])) {
+			fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+			tp_links[tp_cnt] = NULL;
+			goto cleanup;
+		}
+		tp_cnt++;
+	}
+
+	/* Runs until a signal arrives; dump_exit() then cleans up and exits */
+	while (1) {
+		sleep(delay);
+		dump_all_counts();
+	}
+
+cleanup:
+	/* Detach tracepoints */
+	while (tp_cnt)
+		bpf_link__destroy(tp_links[--tp_cnt]);
+
+	bpf_object__close(obj);
+	return err;
+}
diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c
new file mode 100644
index 000000000000..4adfcbbe6ef4
--- /dev/null
+++ b/samples/bpf/lathist_kern.c
@@ -0,0 +1,99 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define MAX_ENTRIES 20
+#define MAX_CPU 4
+
+/* We need to stick to static allocated memory (an array instead of
+ * hash table) because managing dynamic memory from the
+ * trace_preempt_[on|off] tracepoints hooks is not supported.
+ */
+
+/* Per-CPU slot (indexed by CPU id) holding the last preempt-off timestamp. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, u64);
+ __uint(max_entries, MAX_CPU);
+} my_map SEC(".maps");
+
+/*
+ * kprobe on trace_preempt_off: store the current time in this CPU's slot
+ * of my_map. CPUs without a slot in the map are ignored.
+ */
+SEC("kprobe/trace_preempt_off")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	int cpu_id = bpf_get_smp_processor_id();
+	u64 *slot;
+
+	slot = bpf_map_lookup_elem(&my_map, &cpu_id);
+	if (!slot)
+		return 0;
+
+	*slot = bpf_ktime_get_ns();
+	return 0;
+}
+
+/*
+ * Branch-free integer log2 of a 32-bit value: index of the highest set
+ * bit. Returns 0 for both 0 and 1.
+ */
+static unsigned int log2(unsigned int v)
+{
+	unsigned int result;
+	unsigned int step;
+
+	/* Halve the search window each round: 16, 8, 4, then 2 bits */
+	result = (v > 0xFFFF) << 4;
+	v >>= result;
+
+	step = (v > 0xFF) << 3;
+	v >>= step;
+	result |= step;
+
+	step = (v > 0xF) << 2;
+	v >>= step;
+	result |= step;
+
+	step = (v > 0x3) << 1;
+	v >>= step;
+	result |= step;
+
+	/* v is now 0..3; its upper bit is the last result bit */
+	return result | (v >> 1);
+}
+
+/* 64-bit log2 built from log2(): use the upper word when it is non-zero. */
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int upper = v >> 32;
+
+	return upper ? log2(upper) + 32 : log2(v);
+}
+
+/* Latency histogram: MAX_ENTRIES log2 buckets per CPU, laid out per CPU. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, long);
+ __uint(max_entries, MAX_CPU * MAX_ENTRIES);
+} my_lat SEC(".maps");
+
+/*
+ * kprobe on trace_preempt_on: take the time elapsed since this CPU's
+ * stored preempt-off timestamp, reduce it to a log2 bucket (clamped to the
+ * last bucket) and atomically bump that bucket in my_lat.
+ */
+SEC("kprobe/trace_preempt_on")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	u64 *start, now, bucket;
+	int cpu_id, slot;
+	long *counter;
+
+	cpu_id = bpf_get_smp_processor_id();
+	start = bpf_map_lookup_elem(&my_map, &cpu_id);
+	if (!start)
+		return 0;
+
+	now = bpf_ktime_get_ns();
+	bucket = log2l(now - *start);
+	if (bucket > MAX_ENTRIES - 1)
+		bucket = MAX_ENTRIES - 1;
+
+	slot = cpu_id * MAX_ENTRIES + bucket;
+	counter = bpf_map_lookup_elem(&my_lat, &slot);
+	if (!counter)
+		return 0;
+
+	__sync_fetch_and_add((long *)counter, 1);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c
new file mode 100644
index 000000000000..7d8ff2418303
--- /dev/null
+++ b/samples/bpf/lathist_user.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+#define MAX_ENTRIES 20
+#define MAX_CPU 4
+#define MAX_STARS 40
+
+/* Histogram snapshot of one CPU: bucket counts and the largest count. */
+struct cpu_hist {
+ long data[MAX_ENTRIES];
+ long max;
+};
+
+/* Filled by get_data(), rendered by print_hist(). */
+static struct cpu_hist cpu_hist[MAX_CPU];
+
+/*
+ * Render a histogram bar into @str.
+ *
+ * @str:   output buffer of at least @width bytes
+ * @val:   value to plot
+ * @max:   value corresponding to a full-width bar
+ * @width: size of @str including the terminating NUL
+ *
+ * Writes up to @width - 1 '*' characters proportional to @val / @max; when
+ * @val exceeds @max the last character becomes '+'. A non-positive @max
+ * yields an empty string instead of dividing by zero.
+ */
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	if (max <= 0) {
+		str[0] = '\0';
+		return;
+	}
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	/* i > 0 guards against writing before the start of the buffer */
+	if (val > max && i > 0)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+/*
+ * Clear the screen and print one latency histogram per CPU from cpu_hist[].
+ * CPUs whose largest bucket count is zero are skipped.
+ */
+static void print_hist(void)
+{
+	char bar[MAX_STARS];
+	int cpu, bucket;
+
+	/* clear screen */
+	printf("\033[2J");
+
+	for (cpu = 0; cpu < MAX_CPU; cpu++) {
+		const struct cpu_hist *h = &cpu_hist[cpu];
+
+		/* ignore CPUs without data (maybe offline?) */
+		if (h->max == 0)
+			continue;
+
+		printf("CPU %d\n", cpu);
+		printf(" latency : count distribution\n");
+		for (bucket = 0; bucket < MAX_ENTRIES; bucket++) {
+			long low = (1l << (bucket + 1)) >> 1;
+			long high = (1l << (bucket + 1)) - 1;
+
+			stars(bar, h->data[bucket], h->max, MAX_STARS);
+			printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+			       low, high, h->data[bucket], MAX_STARS, bar);
+		}
+	}
+}
+
+/*
+ * Refresh cpu_hist[] from the histogram map behind @fd.
+ *
+ * The map is keyed by int (cpu * MAX_ENTRIES + bucket), so the key passed
+ * to the lookup must be an int as well. A failed lookup is counted as 0
+ * rather than reusing a stale or uninitialized value.
+ */
+static void get_data(int fd)
+{
+	long value;
+	int key, c, i;
+
+	for (c = 0; c < MAX_CPU; c++) {
+		cpu_hist[c].max = 0;
+
+		for (i = 0; i < MAX_ENTRIES; i++) {
+			key = c * MAX_ENTRIES + i;
+			if (bpf_map_lookup_elem(fd, &key, &value))
+				value = 0;
+
+			cpu_hist[c].data[i] = value;
+			if (value > cpu_hist[c].max)
+				cpu_hist[c].max = value;
+		}
+	}
+}
+
+/*
+ * Load "<argv[0]>_kern.o", attach its programs and redraw the latency
+ * histogram every 5 seconds. Always returns 0, also after a setup error.
+ */
+int main(int argc, char **argv)
+{
+	struct bpf_link *attached[2];
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	char objfile[256];
+	int nr_attached = 0;
+	int lat_fd;
+
+	snprintf(objfile, sizeof(objfile), "%s_kern.o", argv[0]);
+	obj = bpf_object__open_file(objfile, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	lat_fd = bpf_object__find_map_fd_by_name(obj, "my_lat");
+	if (lat_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		attached[nr_attached] = bpf_program__attach(prog);
+		if (libbpf_get_error(attached[nr_attached])) {
+			fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+			attached[nr_attached] = NULL;
+			goto cleanup;
+		}
+		nr_attached++;
+	}
+
+	for (;;) {
+		get_data(lat_fd);
+		print_hist();
+		sleep(5);
+	}
+
+cleanup:
+	/* Destroy the successfully attached links, newest first */
+	while (nr_attached > 0) {
+		nr_attached--;
+		bpf_link__destroy(attached[nr_attached]);
+	}
+
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/lwt_len_hist.bpf.c b/samples/bpf/lwt_len_hist.bpf.c
new file mode 100644
index 000000000000..dbab80e813fe
--- /dev/null
+++ b/samples/bpf/lwt_len_hist.bpf.c
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+/*
+ * Per-CPU hash keyed by the log2 bucket of the packet length; pinned by
+ * name so that user space can open it from the BPF filesystem.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __type(key, u64);
+ __type(value, u64);
+ __uint(pinning, LIBBPF_PIN_BY_NAME);
+ __uint(max_entries, 1024);
+} lwt_len_hist_map SEC(".maps");
+
+/*
+ * Branch-free integer log2 of a 32-bit value: index of the highest set
+ * bit. Returns 0 for both 0 and 1.
+ */
+static unsigned int log2(unsigned int v)
+{
+	unsigned int result;
+	unsigned int step;
+
+	/* Halve the search window each round: 16, 8, 4, then 2 bits */
+	result = (v > 0xFFFF) << 4;
+	v >>= result;
+
+	step = (v > 0xFF) << 3;
+	v >>= step;
+	result |= step;
+
+	step = (v > 0xF) << 2;
+	v >>= step;
+	result |= step;
+
+	step = (v > 0x3) << 1;
+	v >>= step;
+	result |= step;
+
+	/* v is now 0..3; its upper bit is the last result bit */
+	return result | (v >> 1);
+}
+
+/* 64-bit log2 built from log2(): use the upper word when it is non-zero. */
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int upper = v >> 32;
+
+	return upper ? log2(upper) + 32 : log2(v);
+}
+
+/*
+ * Count the packet in the log2(length) bucket of lwt_len_hist_map: bump an
+ * existing bucket atomically, otherwise create it with a count of 1.
+ * Always returns BPF_OK.
+ */
+SEC("len_hist")
+int do_len_hist(struct __sk_buff *skb)
+{
+	__u64 bucket = log2l(skb->len);
+	__u64 first_hit = 1;
+	__u64 *count;
+
+	count = bpf_map_lookup_elem(&lwt_len_hist_map, &bucket);
+	if (!count) {
+		bpf_map_update_elem(&lwt_len_hist_map, &bucket, &first_hit,
+				    BPF_ANY);
+		return BPF_OK;
+	}
+
+	__sync_fetch_and_add(count, 1);
+	return BPF_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh
new file mode 100755
index 000000000000..381b2c634784
--- /dev/null
+++ b/samples/bpf/lwt_len_hist.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NS1=lwt_ns1
+VETH0=tst_lwt1a
+VETH1=tst_lwt1b
+BPF_PROG=lwt_len_hist.bpf.o
+TRACE_ROOT=/sys/kernel/tracing
+
+# Tear down everything the test creates. Also called before setup, when
+# nothing may exist yet, so every step must stay quiet on failure.
+function cleanup {
+	# To reset saved histogram, remove pinned map
+	rm -f /sys/fs/bpf/tc/globals/lwt_len_hist_map
+	ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null
+	ip link del $VETH0 2> /dev/null
+	ip link del $VETH1 2> /dev/null
+	ip netns exec $NS1 killall netserver 2> /dev/null
+	ip netns delete $NS1 2> /dev/null
+}
+
+cleanup
+
+ip netns add $NS1
+ip link add $VETH0 type veth peer name $VETH1
+ip link set dev $VETH0 up
+ip addr add 192.168.253.1/24 dev $VETH0
+ip link set $VETH1 netns $NS1
+ip netns exec $NS1 ip link set dev $VETH1 up
+ip netns exec $NS1 ip addr add 192.168.253.2/24 dev $VETH1
+ip netns exec $NS1 netserver
+
+echo 1 > ${TRACE_ROOT}/tracing_on
+cp /dev/null ${TRACE_ROOT}/trace
+ip route add 192.168.253.2/32 encap bpf out obj $BPF_PROG section len_hist dev $VETH0
+netperf -H 192.168.253.2 -t TCP_STREAM
+cat ${TRACE_ROOT}/trace | grep -v '^#'
+./lwt_len_hist
+cleanup
+echo 0 > ${TRACE_ROOT}/tracing_on
+
+exit 0
diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c
new file mode 100644
index 000000000000..430a4b7e353e
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_user.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <arpa/inet.h>
+
+#include <bpf/bpf.h>
+#include "bpf_util.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+/*
+ * Render a histogram bar for @val into @str.
+ *
+ * @str:   output buffer, must hold at least @width bytes
+ * @val:   value of this bucket
+ * @max:   largest bucket value, used as the scale
+ * @width: size of @str including the terminating NUL
+ *
+ * The bar is (width * val / max) - 1 '*' characters, capped at width - 1.
+ * If @val exceeds @max the last character becomes '+'. When @max is not
+ * positive (empty or all-zero histogram) the bar is empty; this avoids
+ * dividing by zero.
+ */
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	if (max <= 0) {
+		str[0] = '\0';
+		return;
+	}
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	/* i > 0 keeps the overflow marker inside the buffer */
+	if (val > max && i > 0)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+/*
+ * Read the pinned lwt_len_hist_map, sum the values returned for each key
+ * across all possible CPUs and print one histogram row per power-of-two
+ * bucket up to the highest non-empty one.
+ *
+ * Returns 0 on success, -1 if the pinned map cannot be opened.
+ */
+int main(int argc, char **argv)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	const char *map_filename = "/sys/fs/bpf/tc/globals/lwt_len_hist_map";
+	uint64_t values[nr_cpus], sum, max_value = 0, data[MAX_INDEX] = {};
+	uint64_t key = 0, next_key, max_key = 0;
+	/* NULL asks the kernel for the first key, so no entry is skipped */
+	uint64_t *prev_key = NULL;
+	char starstr[MAX_STARS];
+	int i, map_fd;
+
+	map_fd = bpf_obj_get(map_filename);
+	if (map_fd < 0) {
+		fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
+			map_filename, strerror(errno), errno);
+		return -1;
+	}
+
+	while (bpf_map_get_next_key(map_fd, prev_key, &next_key) == 0) {
+		/*
+		 * Advance the cursor before any 'continue'; otherwise the
+		 * same key is returned again and the loop never ends.
+		 */
+		key = next_key;
+		prev_key = &key;
+
+		if (next_key >= MAX_INDEX) {
+			fprintf(stderr, "Key %lu out of bounds\n", next_key);
+			continue;
+		}
+
+		/* values[] is only meaningful when the lookup succeeded */
+		if (bpf_map_lookup_elem(map_fd, &next_key, values)) {
+			fprintf(stderr, "bpf_map_lookup_elem(%lu): %s(%d)\n",
+				next_key, strerror(errno), errno);
+			continue;
+		}
+
+		sum = 0;
+		for (i = 0; i < nr_cpus; i++)
+			sum += values[i];
+
+		data[next_key] = sum;
+		if (sum && next_key > max_key)
+			max_key = next_key;
+
+		if (sum > max_value)
+			max_value = sum;
+	}
+
+	/* Row i covers lengths [2^(i-1), 2^i - 1] and shows data[i - 1] */
+	for (i = 1; i <= max_key + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+
+	close(map_fd);
+
+	return 0;
+}
diff --git a/samples/bpf/map_perf_test.bpf.c b/samples/bpf/map_perf_test.bpf.c
new file mode 100644
index 000000000000..3cdeba2afe12
--- /dev/null
+++ b/samples/bpf/map_perf_test.bpf.c
@@ -0,0 +1,297 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include <errno.h>
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#define MAX_ENTRIES 1000
+#define MAX_NR_CPUS 1024
+
+/* Hash map without BPF_F_NO_PREALLOC; used by stress_hmap and
+ * stress_hash_map_lookup.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, MAX_ENTRIES);
+} hash_map SEC(".maps");
+
+/* LRU hash updated by stress_lru_hmap_alloc test case 0. */
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, 10000);
+} lru_hash_map SEC(".maps");
+
+/* LRU hash with BPF_F_NO_COMMON_LRU; updated by test case 1. */
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, 10000);
+ __uint(map_flags, BPF_F_NO_COMMON_LRU);
+} nocommon_lru_hash_map SEC(".maps");
+
+/* Named type so array_of_lru_hashs can use it as its inner-map definition;
+ * this instance is the one statically placed in slot 0 of that array.
+ */
+struct inner_lru {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, MAX_ENTRIES);
+ __uint(map_flags, BPF_F_NUMA_NODE);
+ __uint(numa_node, 0);
+} inner_lru_hash_map SEC(".maps");
+
+/* Array of LRU hashes, indexed by CPU id in stress_lru_hmap_alloc test
+ * case 2.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, MAX_NR_CPUS);
+ __uint(key_size, sizeof(u32));
+ __array(values, struct inner_lru); /* use inner_lru as inner map */
+} array_of_lru_hashs SEC(".maps") = {
+ /* statically initialize the first element */
+ .values = { &inner_lru_hash_map },
+};
+
+/* Per-CPU hash without BPF_F_NO_PREALLOC; used by stress_percpu_hmap. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(long));
+ __uint(max_entries, MAX_ENTRIES);
+} percpu_hash_map SEC(".maps");
+
+/* Hash map with BPF_F_NO_PREALLOC; used by stress_hmap_alloc. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, MAX_ENTRIES);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+} hash_map_alloc SEC(".maps");
+
+/* Per-CPU hash with BPF_F_NO_PREALLOC; used by stress_percpu_hmap_alloc. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(long));
+ __uint(max_entries, MAX_ENTRIES);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+} percpu_hash_map_alloc SEC(".maps");
+
+/* LPM trie looked up by stress_lpm_trie_map_alloc; the 8-byte key matches
+ * the { u32, 4 x u8 } key that program builds.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_LPM_TRIE);
+ __uint(key_size, 8);
+ __uint(value_size, sizeof(long));
+ __uint(max_entries, 10000);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+} lpm_trie_map_alloc SEC(".maps");
+
+/* Plain array looked up by stress_array_map_lookup. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, MAX_ENTRIES);
+} array_map SEC(".maps");
+
+/* LRU hash that stress_lru_hmap_alloc test case 3 only looks up. */
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, MAX_ENTRIES);
+} lru_hash_lookup_map SEC(".maps");
+
+/*
+ * getuid() probe: ten update/lookup/delete rounds on hash_map, keyed by
+ * bpf_get_current_pid_tgid() truncated to u32.
+ */
+SEC("ksyscall/getuid")
+int BPF_KSYSCALL(stress_hmap)
+{
+	u32 id = bpf_get_current_pid_tgid();
+	long one = 1;
+	int round = 0;
+
+	while (round < 10) {
+		bpf_map_update_elem(&hash_map, &id, &one, BPF_ANY);
+		if (bpf_map_lookup_elem(&hash_map, &id))
+			bpf_map_delete_elem(&hash_map, &id);
+		round++;
+	}
+
+	return 0;
+}
+
+/*
+ * geteuid() probe: ten update/lookup/delete rounds on percpu_hash_map,
+ * keyed by bpf_get_current_pid_tgid() truncated to u32.
+ */
+SEC("ksyscall/geteuid")
+int BPF_KSYSCALL(stress_percpu_hmap)
+{
+	u32 id = bpf_get_current_pid_tgid();
+	long one = 1;
+	int round = 0;
+
+	while (round < 10) {
+		bpf_map_update_elem(&percpu_hash_map, &id, &one, BPF_ANY);
+		if (bpf_map_lookup_elem(&percpu_hash_map, &id))
+			bpf_map_delete_elem(&percpu_hash_map, &id);
+		round++;
+	}
+	return 0;
+}
+
+/*
+ * getgid() probe: ten update/lookup/delete rounds on hash_map_alloc,
+ * keyed by bpf_get_current_pid_tgid() truncated to u32.
+ */
+SEC("ksyscall/getgid")
+int BPF_KSYSCALL(stress_hmap_alloc)
+{
+	u32 id = bpf_get_current_pid_tgid();
+	long one = 1;
+	int round = 0;
+
+	while (round < 10) {
+		bpf_map_update_elem(&hash_map_alloc, &id, &one, BPF_ANY);
+		if (bpf_map_lookup_elem(&hash_map_alloc, &id))
+			bpf_map_delete_elem(&hash_map_alloc, &id);
+		round++;
+	}
+	return 0;
+}
+
+/*
+ * getegid() probe: ten update/lookup/delete rounds on
+ * percpu_hash_map_alloc, keyed by bpf_get_current_pid_tgid() truncated
+ * to u32.
+ */
+SEC("ksyscall/getegid")
+int BPF_KSYSCALL(stress_percpu_hmap_alloc)
+{
+	u32 id = bpf_get_current_pid_tgid();
+	long one = 1;
+	int round = 0;
+
+	while (round < 10) {
+		bpf_map_update_elem(&percpu_hash_map_alloc, &id, &one, BPF_ANY);
+		if (bpf_map_lookup_elem(&percpu_hash_map_alloc, &id))
+			bpf_map_delete_elem(&percpu_hash_map_alloc, &id);
+		round++;
+	}
+	return 0;
+}
+/*
+ * connect() probe driving the LRU tests. The caller encodes its request in
+ * the IPv6 destination address, read here from user memory:
+ *   16-bit words 0/1: magic 0xdead / 0xbeef (anything else is ignored)
+ *   16-bit word 2:    test case
+ *                       0 - update lru_hash_map with a random key
+ *                       1 - update nocommon_lru_hash_map with a random key
+ *                       2 - update this CPU's inner map in array_of_lru_hashs
+ *                       3 - 32 lookups in lru_hash_lookup_map
+ *   last 32 bits:     first key for test case 3
+ *
+ * A non-zero result is reported via bpf_trace_printk(). Always returns 0.
+ */
+SEC("ksyscall/connect")
+int BPF_KSYSCALL(stress_lru_hmap_alloc, int fd, struct sockaddr_in *uservaddr,
+		 int addrlen)
+{
+	/* "%d\n": the trailing newline was previously mistyped as 'n' */
+	char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n";
+	union {
+		u16 dst6[8];
+		struct {
+			u16 magic0;
+			u16 magic1;
+			u16 tcase;
+			u16 unused16;
+			u32 unused32;
+			u32 key;
+		};
+	} test_params;
+	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)uservaddr;
+	u16 test_case;
+	long val = 1;
+	u32 key = 0;
+	int ret;
+
+	/* Only IPv6-sized addresses can carry the parameters */
+	if (addrlen != sizeof(*in6))
+		return 0;
+
+	ret = bpf_probe_read_user(test_params.dst6, sizeof(test_params.dst6),
+				  &in6->sin6_addr);
+	if (ret)
+		goto done;
+
+	if (test_params.magic0 != 0xdead ||
+	    test_params.magic1 != 0xbeef)
+		return 0;
+
+	test_case = test_params.tcase;
+	/* Test case 3 takes its key from the caller instead */
+	if (test_case != 3)
+		key = bpf_get_prandom_u32();
+
+	if (test_case == 0) {
+		ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY);
+	} else if (test_case == 1) {
+		ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val,
+					  BPF_ANY);
+	} else if (test_case == 2) {
+		void *nolocal_lru_map;
+		int cpu = bpf_get_smp_processor_id();
+
+		nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs,
+						      &cpu);
+		if (!nolocal_lru_map) {
+			ret = -ENOENT;
+			goto done;
+		}
+
+		ret = bpf_map_update_elem(nolocal_lru_map, &key, &val,
+					  BPF_ANY);
+	} else if (test_case == 3) {
+		u32 i;
+
+		key = test_params.key;
+
+#pragma clang loop unroll(full)
+		for (i = 0; i < 32; i++) {
+			bpf_map_lookup_elem(&lru_hash_lookup_map, &key);
+			key++;
+		}
+	} else {
+		ret = -EINVAL;
+	}
+
+done:
+	if (ret)
+		bpf_trace_printk(fmt, sizeof(fmt), ret);
+
+	return 0;
+}
+
+/*
+ * gettid() probe: 32 lookups in lpm_trie_map_alloc for the key
+ * { 32, 192, 168, 0, 1 } (a u32 followed by four bytes).
+ */
+SEC("ksyscall/gettid")
+int BPF_KSYSCALL(stress_lpm_trie_map_alloc)
+{
+	struct {
+		u32 prefixlen;
+		u8 addr[4];
+	} key;
+	unsigned int n;
+
+	key.prefixlen = 32;
+	key.addr[0] = 192;
+	key.addr[1] = 168;
+	key.addr[2] = 0;
+	key.addr[3] = 1;
+
+#pragma clang loop unroll(full)
+	for (n = 0; n < 32; ++n)
+		bpf_map_lookup_elem(&lpm_trie_map_alloc, &key);
+
+	return 0;
+}
+
+/* getpgid() probe: 64 lookups of key 1 in hash_map; results are unused. */
+SEC("ksyscall/getpgid")
+int BPF_KSYSCALL(stress_hash_map_lookup)
+{
+	u32 key = 1;
+	u32 n;
+
+#pragma clang loop unroll(full)
+	for (n = 0; n < 64; ++n)
+		bpf_map_lookup_elem(&hash_map, &key);
+
+	return 0;
+}
+
+/* getppid() probe: 64 lookups of key 1 in array_map; results are unused. */
+SEC("ksyscall/getppid")
+int BPF_KSYSCALL(stress_array_map_lookup)
+{
+	u32 key = 1;
+	u32 n;
+
+#pragma clang loop unroll(full)
+	for (n = 0; n < 64; ++n)
+		bpf_map_lookup_elem(&array_map, &key);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
new file mode 100644
index 000000000000..07ff471ed6ae
--- /dev/null
+++ b/samples/bpf/map_perf_test_user.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Facebook
+ */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <asm/unistd.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <time.h>
+#include <arpa/inet.h>
+#include <errno.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#define TEST_BIT(t) (1U << (t))
+#define MAX_NR_CPUS 1024
+
+/* Current CLOCK_MONOTONIC time expressed in nanoseconds. */
+static __u64 time_get_ns(void)
+{
+	struct timespec now;
+	__u64 ns;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	ns = now.tv_sec * 1000000000ull;
+	ns += now.tv_nsec;
+	return ns;
+}
+
+/* One entry per benchmark; the value is also the bit position tested in
+ * test_flags (see TEST_BIT and check_test_flags).
+ */
+enum test_type {
+ HASH_PREALLOC,
+ PERCPU_HASH_PREALLOC,
+ HASH_KMALLOC,
+ PERCPU_HASH_KMALLOC,
+ LRU_HASH_PREALLOC,
+ NOCOMMON_LRU_HASH_PREALLOC,
+ LPM_KMALLOC,
+ HASH_LOOKUP,
+ ARRAY_LOOKUP,
+ INNER_LRU_HASH_PREALLOC,
+ LRU_HASH_LOOKUP,
+ NR_TESTS,
+};
+
+/* Name of the BPF map each test is associated with; "hash_map" appears
+ * twice because HASH_PREALLOC and HASH_LOOKUP share it.
+ */
+const char *test_map_names[NR_TESTS] = {
+ [HASH_PREALLOC] = "hash_map",
+ [PERCPU_HASH_PREALLOC] = "percpu_hash_map",
+ [HASH_KMALLOC] = "hash_map_alloc",
+ [PERCPU_HASH_KMALLOC] = "percpu_hash_map_alloc",
+ [LRU_HASH_PREALLOC] = "lru_hash_map",
+ [NOCOMMON_LRU_HASH_PREALLOC] = "nocommon_lru_hash_map",
+ [LPM_KMALLOC] = "lpm_trie_map_alloc",
+ [HASH_LOOKUP] = "hash_map",
+ [ARRAY_LOOKUP] = "array_map",
+ [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map",
+ [LRU_HASH_LOOKUP] = "lru_hash_lookup_map",
+};
+
+/* Indices into map_fd[] */
+enum map_idx {
+ array_of_lru_hashs_idx,
+ hash_map_alloc_idx,
+ lru_hash_lookup_idx,
+ NR_IDXES,
+};
+
+/* File descriptors of the maps userspace touches directly */
+static int map_fd[NR_IDXES];
+
+/* Bitmask of enabled tests; all enabled by default */
+static int test_flags = ~0;
+/* Requested map size from argv[3]; 0 keeps the sizes from the object file */
+static uint32_t num_map_entries;
+static uint32_t inner_lru_hash_size;
+static int lru_hash_lookup_test_entries = 32;
+/* Iterations per test, overridable via argv[4] */
+static uint32_t max_cnt = 10000;
+
+/* Non-zero when test @t is selected in test_flags. */
+static int check_test_flags(enum test_type t)
+{
+	const unsigned int bit = TEST_BIT(t);
+
+	return test_flags & bit;
+}
+
+/* Issue max_cnt getuid() syscalls and print the achieved rate for @cpu. */
+static void test_hash_prealloc(int cpu)
+{
+	__u64 began = time_get_ns();
+	__u64 elapsed;
+	int n = 0;
+
+	while (n < max_cnt) {
+		syscall(__NR_getuid);
+		n++;
+	}
+
+	elapsed = time_get_ns() - began;
+	printf("%d:hash_map_perf pre-alloc %lld events per sec\n",
+	       cpu, max_cnt * 1000000000ll / elapsed);
+}
+
+/*
+ * Prepare the LRU_HASH_LOOKUP test: raise lru_hash_lookup_test_entries to
+ * num_map_entries when that is larger, then insert keys
+ * 0 .. lru_hash_lookup_test_entries - 1 into the lookup map.
+ *
+ * Returns 0 on success or the first bpf_map_update_elem() error.
+ */
+static int pre_test_lru_hash_lookup(int tasks)
+{
+	int fd = map_fd[lru_hash_lookup_idx];
+	uint32_t key = 0;
+	long one = 1;
+	int err;
+
+	if (num_map_entries > lru_hash_lookup_test_entries)
+		lru_hash_lookup_test_entries = num_map_entries;
+
+	/* A user may ask for a map smaller than 32 entries, in which case
+	 * some later lookups can miss. That is acceptable: the performance
+	 * of such a small LRU map is not of interest.
+	 */
+	while (key < lru_hash_lookup_test_entries) {
+		err = bpf_map_update_elem(fd, &key, &one, BPF_NOEXIST);
+		if (err)
+			return err;
+		key++;
+	}
+
+	return 0;
+}
+
+/*
+ * Run one LRU-flavoured benchmark on @cpu.
+ *
+ * The test is selected by encoding a magic (0xdead, 0xbeef) and a test
+ * case number into an IPv6 address and calling connect() on fd -1
+ * max_cnt times; every call is expected to fail with EBADF. For
+ * INNER_LRU_HASH_PREALLOC on a non-zero CPU an inner LRU map is first
+ * created on the caller's NUMA node and stored at index @cpu of the
+ * array_of_lru_hashs map. Exits the process if that setup fails.
+ */
+static void do_test_lru(enum test_type test, int cpu)
+{
+	static int inner_lru_map_fds[MAX_NR_CPUS];
+
+	struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };
+	const char *test_name;
+	__u64 start_time;
+	int i, ret;
+
+	if (test == INNER_LRU_HASH_PREALLOC && cpu) {
+		/* If CPU is not 0, create inner_lru hash map and insert the fd
+		 * value into the array_of_lru_hash map. In case of CPU 0,
+		 * 'inner_lru_hash_map' was statically inserted on the map init
+		 */
+		int outer_fd = map_fd[array_of_lru_hashs_idx];
+		unsigned int mycpu, mynode;
+		LIBBPF_OPTS(bpf_map_create_opts, opts,
+			.map_flags = BPF_F_NUMA_NODE,
+		);
+
+		assert(cpu < MAX_NR_CPUS);
+
+		ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL);
+		assert(!ret);
+
+		opts.numa_node = mynode;
+		inner_lru_map_fds[cpu] =
+			bpf_map_create(BPF_MAP_TYPE_LRU_HASH,
+				       test_map_names[INNER_LRU_HASH_PREALLOC],
+				       sizeof(uint32_t),
+				       sizeof(long),
+				       inner_lru_hash_size, &opts);
+		/* libbpf reports failure as a negative errno, not only -1 */
+		if (inner_lru_map_fds[cpu] < 0) {
+			printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n",
+			       strerror(errno), errno);
+			exit(1);
+		}
+
+		ret = bpf_map_update_elem(outer_fd, &cpu,
+					  &inner_lru_map_fds[cpu],
+					  BPF_ANY);
+		if (ret) {
+			printf("cannot update ARRAY_OF_LRU_HASHS with key:%u. %s(%d)\n",
+			       cpu, strerror(errno), errno);
+			exit(1);
+		}
+	}
+
+	in6.sin6_addr.s6_addr16[0] = 0xdead;
+	in6.sin6_addr.s6_addr16[1] = 0xbeef;
+
+	if (test == LRU_HASH_PREALLOC) {
+		test_name = "lru_hash_map_perf";
+		in6.sin6_addr.s6_addr16[2] = 0;
+	} else if (test == NOCOMMON_LRU_HASH_PREALLOC) {
+		test_name = "nocommon_lru_hash_map_perf";
+		in6.sin6_addr.s6_addr16[2] = 1;
+	} else if (test == INNER_LRU_HASH_PREALLOC) {
+		test_name = "inner_lru_hash_map_perf";
+		in6.sin6_addr.s6_addr16[2] = 2;
+	} else if (test == LRU_HASH_LOOKUP) {
+		test_name = "lru_hash_lookup_perf";
+		in6.sin6_addr.s6_addr16[2] = 3;
+		in6.sin6_addr.s6_addr32[3] = 0;
+	} else {
+		assert(0);
+	}
+
+	start_time = time_get_ns();
+	for (i = 0; i < max_cnt; i++) {
+		ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6));
+		assert(ret == -1 && errno == EBADF);
+		/* Step the start key by 32, wrapping back to 0 */
+		if (in6.sin6_addr.s6_addr32[3] <
+		    lru_hash_lookup_test_entries - 32)
+			in6.sin6_addr.s6_addr32[3] += 32;
+		else
+			in6.sin6_addr.s6_addr32[3] = 0;
+	}
+	printf("%d:%s pre-alloc %lld events per sec\n",
+	       cpu, test_name,
+	       max_cnt * 1000000000ll / (time_get_ns() - start_time));
+}
+
+/* test_func adapters: each binds do_test_lru() to one test type so it fits
+ * the void (*)(int cpu) signature used by the test_funcs table.
+ */
+static void test_lru_hash_prealloc(int cpu)
+{
+ do_test_lru(LRU_HASH_PREALLOC, cpu);
+}
+
+static void test_nocommon_lru_hash_prealloc(int cpu)
+{
+ do_test_lru(NOCOMMON_LRU_HASH_PREALLOC, cpu);
+}
+
+static void test_inner_lru_hash_prealloc(int cpu)
+{
+ do_test_lru(INNER_LRU_HASH_PREALLOC, cpu);
+}
+
+static void test_lru_hash_lookup(int cpu)
+{
+ do_test_lru(LRU_HASH_LOOKUP, cpu);
+}
+
+/* Issue max_cnt geteuid() syscalls and print the achieved rate for @cpu. */
+static void test_percpu_hash_prealloc(int cpu)
+{
+	__u64 began = time_get_ns();
+	__u64 elapsed;
+	int n = 0;
+
+	while (n < max_cnt) {
+		syscall(__NR_geteuid);
+		n++;
+	}
+
+	elapsed = time_get_ns() - began;
+	printf("%d:percpu_hash_map_perf pre-alloc %lld events per sec\n",
+	       cpu, max_cnt * 1000000000ll / elapsed);
+}
+
+/* Issue max_cnt getgid() syscalls and print the achieved rate for @cpu. */
+static void test_hash_kmalloc(int cpu)
+{
+	__u64 began = time_get_ns();
+	__u64 elapsed;
+	int n = 0;
+
+	while (n < max_cnt) {
+		syscall(__NR_getgid);
+		n++;
+	}
+
+	elapsed = time_get_ns() - began;
+	printf("%d:hash_map_perf kmalloc %lld events per sec\n",
+	       cpu, max_cnt * 1000000000ll / elapsed);
+}
+
+/* Issue max_cnt getegid() syscalls and print the achieved rate for @cpu. */
+static void test_percpu_hash_kmalloc(int cpu)
+{
+	__u64 began = time_get_ns();
+	__u64 elapsed;
+	int n = 0;
+
+	while (n < max_cnt) {
+		syscall(__NR_getegid);
+		n++;
+	}
+
+	elapsed = time_get_ns() - began;
+	printf("%d:percpu_hash_map_perf kmalloc %lld events per sec\n",
+	       cpu, max_cnt * 1000000000ll / elapsed);
+}
+
+/* Issue max_cnt gettid() syscalls and print the achieved rate for @cpu. */
+static void test_lpm_kmalloc(int cpu)
+{
+	__u64 began = time_get_ns();
+	__u64 elapsed;
+	int n = 0;
+
+	while (n < max_cnt) {
+		syscall(__NR_gettid);
+		n++;
+	}
+
+	elapsed = time_get_ns() - began;
+	printf("%d:lpm_perf kmalloc %lld events per sec\n",
+	       cpu, max_cnt * 1000000000ll / elapsed);
+}
+
+/*
+ * Issue max_cnt getpgid(0) syscalls and print the lookup rate for @cpu;
+ * the rate is scaled by 64 lookups per syscall.
+ */
+static void test_hash_lookup(int cpu)
+{
+	__u64 began = time_get_ns();
+	__u64 elapsed;
+	int n = 0;
+
+	while (n < max_cnt) {
+		syscall(__NR_getpgid, 0);
+		n++;
+	}
+
+	elapsed = time_get_ns() - began;
+	printf("%d:hash_lookup %lld lookups per sec\n",
+	       cpu, max_cnt * 1000000000ll * 64 / elapsed);
+}
+
+/*
+ * Issue max_cnt getppid() syscalls and print the lookup rate for @cpu;
+ * the rate is scaled by 64 lookups per syscall.
+ */
+static void test_array_lookup(int cpu)
+{
+	__u64 began = time_get_ns();
+	__u64 elapsed;
+	int n = 0;
+
+	while (n < max_cnt) {
+		syscall(__NR_getppid, 0);
+		n++;
+	}
+
+	elapsed = time_get_ns() - began;
+	printf("%d:array_lookup %lld lookups per sec\n",
+	       cpu, max_cnt * 1000000000ll * 64 / elapsed);
+}
+
+/* Optional one-time setup per test, run by pre_test() in the parent before
+ * the worker processes are forked. Unlisted tests have a NULL entry.
+ */
+typedef int (*pre_test_func)(int tasks);
+const pre_test_func pre_test_funcs[] = {
+ [LRU_HASH_LOOKUP] = pre_test_lru_hash_lookup,
+};
+
+/* Benchmark body per test, indexed by enum test_type and called from
+ * loop() with the CPU number of the worker.
+ */
+typedef void (*test_func)(int cpu);
+const test_func test_funcs[] = {
+ [HASH_PREALLOC] = test_hash_prealloc,
+ [PERCPU_HASH_PREALLOC] = test_percpu_hash_prealloc,
+ [HASH_KMALLOC] = test_hash_kmalloc,
+ [PERCPU_HASH_KMALLOC] = test_percpu_hash_kmalloc,
+ [LRU_HASH_PREALLOC] = test_lru_hash_prealloc,
+ [NOCOMMON_LRU_HASH_PREALLOC] = test_nocommon_lru_hash_prealloc,
+ [LPM_KMALLOC] = test_lpm_kmalloc,
+ [HASH_LOOKUP] = test_hash_lookup,
+ [ARRAY_LOOKUP] = test_array_lookup,
+ [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc,
+ [LRU_HASH_LOOKUP] = test_lru_hash_lookup,
+};
+
+/*
+ * Run the setup hook of every enabled test that has one.
+ * Returns 0, or the first non-zero value a hook returns.
+ */
+static int pre_test(int tasks)
+{
+	int t, err;
+
+	for (t = 0; t < NR_TESTS; t++) {
+		if (!pre_test_funcs[t] || !check_test_flags(t))
+			continue;
+
+		err = pre_test_funcs[t](tasks);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/* Pin the calling process to @cpu, then run every enabled test in order. */
+static void loop(int cpu)
+{
+	cpu_set_t only_cpu;
+	int t;
+
+	CPU_ZERO(&only_cpu);
+	CPU_SET(cpu, &only_cpu);
+	sched_setaffinity(0, sizeof(only_cpu), &only_cpu);
+
+	for (t = 0; t < NR_TESTS; t++) {
+		if (!check_test_flags(t))
+			continue;
+		test_funcs[t](cpu);
+	}
+}
+
+/*
+ * Run the pre-test hooks, fork @tasks workers (worker i runs loop(i)) and
+ * wait for all of them. Exits if a fork fails; asserts that the pre-test
+ * succeeded and that every worker was reaped with status 0.
+ */
+static void run_perf_test(int tasks)
+{
+	pid_t pid[tasks];
+	int i, ret;
+
+	/* Call outside assert() so the setup still runs under NDEBUG */
+	ret = pre_test(tasks);
+	assert(!ret);
+
+	for (i = 0; i < tasks; i++) {
+		pid[i] = fork();
+		if (pid[i] == 0) {
+			loop(i);
+			exit(0);
+		} else if (pid[i] == -1) {
+			printf("couldn't spawn #%d process\n", i);
+			exit(1);
+		}
+	}
+	for (i = 0; i < tasks; i++) {
+		pid_t reaped;
+		int status;
+
+		/* Same reason: the wait itself must never be compiled out */
+		reaped = waitpid(pid[i], &status, 0);
+		assert(reaped == pid[i]);
+		assert(status == 0);
+	}
+}
+
+/*
+ * Insert 512 random { prefixlen 0..32, 4-byte address } keys with value 0,
+ * then the key 192.168.0.1/32 with value 128. Asserts on any failed
+ * update.
+ *
+ * NOTE(review): the updates go to map_fd[hash_map_alloc_idx], which main()
+ * resolves from the map named "hash_map_alloc", not "lpm_trie_map_alloc".
+ * Confirm whether the LPM trie was the intended target.
+ */
+static void fill_lpm_trie(void)
+{
+	struct bpf_lpm_trie_key_u8 *key = alloca(sizeof(*key) + 4);
+	int fd = map_fd[hash_map_alloc_idx];
+	unsigned long value = 0;
+	unsigned int n, b;
+	int r;
+
+	for (n = 0; n < 512; ++n) {
+		key->prefixlen = rand() % 33;
+		for (b = 0; b < 4; b++)
+			key->data[b] = rand() & 0xff;
+		r = bpf_map_update_elem(fd, key, &value, 0);
+		assert(!r);
+	}
+
+	key->prefixlen = 32;
+	key->data[0] = 192;
+	key->data[1] = 168;
+	key->data[2] = 0;
+	key->data[3] = 1;
+	value = 128;
+
+	r = bpf_map_update_elem(fd, key, &value, 0);
+	assert(!r);
+}
+
+/*
+ * Set max_entries to num_map_entries on every map of @obj whose name
+ * belongs to an enabled test, and record the same size in
+ * inner_lru_hash_size.
+ */
+static void fixup_map(struct bpf_object *obj)
+{
+	struct bpf_map *map;
+	int t;
+
+	bpf_object__for_each_map(map, obj) {
+		const char *name = bpf_map__name(map);
+
+		/* Only change the max_entries for the enabled test(s) */
+		for (t = 0; t < NR_TESTS; t++) {
+			if (strcmp(test_map_names[t], name) != 0)
+				continue;
+			if (!check_test_flags(t))
+				continue;
+			bpf_map__set_max_entries(map, num_map_entries);
+		}
+	}
+
+	inner_lru_hash_size = num_map_entries;
+}
+
+/*
+ * Usage: <prog> [test_flags] [nr_cpus] [num_map_entries] [max_cnt]
+ *
+ * Opens "<argv[0]>.bpf.o", optionally resizes the maps, loads and attaches
+ * every program in the object, then runs the selected benchmarks with one
+ * process per CPU. Returns 0 on every path, including failures.
+ */
+int main(int argc, char **argv)
+{
+ int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ /* One slot per attached program; there is no bounds check below, so the
+ * object must not contain more than 8 programs.
+ */
+ struct bpf_link *links[8];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ struct bpf_map *map;
+ char filename[256];
+ int i = 0;
+
+ /* A value of 0 (or unparsable text) keeps the default */
+ if (argc > 1)
+ test_flags = atoi(argv[1]) ? : test_flags;
+
+ if (argc > 2)
+ nr_cpus = atoi(argv[2]) ? : nr_cpus;
+
+ if (argc > 3)
+ num_map_entries = atoi(argv[3]);
+
+ if (argc > 4)
+ max_cnt = atoi(argv[4]);
+
+ snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ return 0;
+ }
+
+ map = bpf_object__find_map_by_name(obj, "inner_lru_hash_map");
+ if (libbpf_get_error(map)) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* Default size for inner maps created at run time by do_test_lru() */
+ inner_lru_hash_size = bpf_map__max_entries(map);
+ if (!inner_lru_hash_size) {
+ fprintf(stderr, "ERROR: failed to get map attribute\n");
+ goto cleanup;
+ }
+
+ /* resize BPF map prior to loading */
+ if (num_map_entries > 0)
+ fixup_map(obj);
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ /* Order matches enum map_idx */
+ map_fd[0] = bpf_object__find_map_fd_by_name(obj, "array_of_lru_hashs");
+ map_fd[1] = bpf_object__find_map_fd_by_name(obj, "hash_map_alloc");
+ map_fd[2] = bpf_object__find_map_fd_by_name(obj, "lru_hash_lookup_map");
+ if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ bpf_object__for_each_program(prog, obj) {
+ links[i] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[i])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[i] = NULL;
+ goto cleanup;
+ }
+ i++;
+ }
+
+ fill_lpm_trie();
+
+ run_perf_test(nr_cpus);
+
+cleanup:
+ /* i counts the links attached so far; destroy them in reverse order */
+ for (i--; i >= 0; i--)
+ bpf_link__destroy(links[i]);
+
+ bpf_object__close(obj);
+ return 0;
+}
diff --git a/samples/bpf/net_shared.h b/samples/bpf/net_shared.h
new file mode 100644
index 000000000000..88cc52461c98
--- /dev/null
+++ b/samples/bpf/net_shared.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _NET_SHARED_H
+#define _NET_SHARED_H
+
+#define AF_INET 2
+#define AF_INET6 10
+
+#define ETH_ALEN 6
+#define ETH_P_802_3_MIN 0x0600
+#define ETH_P_8021Q 0x8100
+#define ETH_P_8021AD 0x88A8
+#define ETH_P_IP 0x0800
+#define ETH_P_IPV6 0x86DD
+#define ETH_P_ARP 0x0806
+#define IPPROTO_ICMPV6 58
+
+#define TC_ACT_OK 0
+#define TC_ACT_SHOT 2
+
+#define IFNAMSIZ 16
+
+/* 16-bit host <-> network byte-order helpers chosen at compile time from
+ * the compiler's __BYTE_ORDER__: a byte swap on little-endian targets, the
+ * identity on big-endian ones. Compilation fails if neither is detected.
+ */
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+ __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define bpf_ntohs(x) __builtin_bswap16(x)
+#define bpf_htons(x) __builtin_bswap16(x)
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+ __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define bpf_ntohs(x) (x)
+#define bpf_htons(x) (x)
+#else
+# error "Endianness detection needs to be set up for your compiler?!"
+#endif
+
+#endif
diff --git a/samples/bpf/offwaketime.bpf.c b/samples/bpf/offwaketime.bpf.c
new file mode 100644
index 000000000000..4a65ba76c1b1
--- /dev/null
+++ b/samples/bpf/offwaketime.bpf.c
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127
+#endif
+
+#define MINBLOCK_US 1
+#define MAX_ENTRIES 10000
+
+/* Key of the counts map: waker and target task names plus the stack ids
+ * of each (wret for the waker, tret for the target).
+ */
+struct key_t {
+ char waker[TASK_COMM_LEN];
+ char target[TASK_COMM_LEN];
+ u32 wret;
+ u32 tret;
+};
+
+/* Accumulated blocked time per key_t; update_counts() adds the delta that
+ * oncpu() passes in, which is in microseconds.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct key_t);
+ __type(value, u64);
+ __uint(max_entries, MAX_ENTRIES);
+} counts SEC(".maps");
+
+/* pid -> bpf_ktime_get_ns() timestamp stored by oncpu() when the task was
+ * switched out.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, MAX_ENTRIES);
+} start SEC(".maps");
+
+/* Value of the wokeby map: name and stack id of the task that issued the
+ * wake-up.
+ */
+struct wokeby_t {
+ char name[TASK_COMM_LEN];
+ u32 ret;
+};
+
+/* pid of the woken task -> who woke it; written by waker(), consumed and
+ * deleted by update_counts().
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, struct wokeby_t);
+ __uint(max_entries, MAX_ENTRIES);
+} wokeby SEC(".maps");
+
+/* Stack traces referenced by the ids stored in key_t and wokeby_t. */
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
+ __uint(max_entries, MAX_ENTRIES);
+} stackmap SEC(".maps");
+
+/* Flags passed to every bpf_get_stackid() call in this file */
+#define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
+
+/*
+ * kprobe on try_to_wake_up(): record the current task's name and stack id
+ * in the wokeby map under the pid of the task passed as first argument.
+ */
+SEC("kprobe/try_to_wake_up")
+int waker(struct pt_regs *ctx)
+{
+	struct task_struct *wakee = (void *)PT_REGS_PARM1_CORE(ctx);
+	struct wokeby_t info;
+	u32 wakee_pid;
+
+	wakee_pid = BPF_CORE_READ(wakee, pid);
+
+	bpf_get_current_comm(&info.name, sizeof(info.name));
+	info.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
+
+	bpf_map_update_elem(&wokeby, &wakee_pid, &info, BPF_ANY);
+	return 0;
+}
+
+/*
+ * Add @delta to the counts entry for the current task. The key is built
+ * from the current task's name and stack id plus, when the wokeby map has
+ * an entry for @pid, the waker's name and stack id (that entry is then
+ * deleted). A missing counts entry is created with value 0 first.
+ * Always returns 0.
+ */
+static inline int update_counts(void *ctx, u32 pid, u64 delta)
+{
+	struct wokeby_t *waker_info;
+	struct key_t key;
+	u64 initial = 0;
+	u64 *total;
+
+	__builtin_memset(&key.waker, 0, sizeof(key.waker));
+	bpf_get_current_comm(&key.target, sizeof(key.target));
+	key.tret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
+	key.wret = 0;
+
+	waker_info = bpf_map_lookup_elem(&wokeby, &pid);
+	if (waker_info) {
+		key.wret = waker_info->ret;
+		__builtin_memcpy(&key.waker, waker_info->name,
+				 sizeof(key.waker));
+		bpf_map_delete_elem(&wokeby, &pid);
+	}
+
+	total = bpf_map_lookup_elem(&counts, &key);
+	if (!total) {
+		bpf_map_update_elem(&counts, &key, &initial, BPF_NOEXIST);
+		total = bpf_map_lookup_elem(&counts, &key);
+	}
+	if (total)
+		(*total) += delta;
+	return 0;
+}
+
+/*
+ * Context-switch hook. Stores a timestamp for the task being switched out,
+ * then, if the current task has a stored timestamp, computes the elapsed
+ * time in microseconds and passes it to update_counts().
+ *
+ * The "#if 1" branch attaches to the sched_switch tracepoint; the disabled
+ * alternative attaches to finish_task_switch* via kprobe.multi. Only the
+ * function header and the way the previous pid is obtained differ.
+ */
+#if 1
+/* taken from /sys/kernel/tracing/events/sched/sched_switch/format */
+SEC("tracepoint/sched/sched_switch")
+int oncpu(struct trace_event_raw_sched_switch *ctx)
+{
+ /* record previous thread sleep time */
+ u32 pid = ctx->prev_pid;
+#else
+SEC("kprobe.multi/finish_task_switch*")
+int oncpu(struct pt_regs *ctx)
+{
+ struct task_struct *p = (void *)PT_REGS_PARM1_CORE(ctx);
+ /* record previous thread sleep time */
+ u32 pid = BPF_CORE_READ(p, pid);
+#endif
+ u64 delta, ts, *tsp;
+
+ ts = bpf_ktime_get_ns();
+ bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
+
+ /* calculate current thread's delta time */
+ pid = bpf_get_current_pid_tgid();
+ tsp = bpf_map_lookup_elem(&start, &pid);
+ if (!tsp)
+ /* missed start or filtered */
+ return 0;
+
+ delta = bpf_ktime_get_ns() - *tsp;
+ bpf_map_delete_elem(&start, &pid);
+ /* nanoseconds -> microseconds; drop intervals below MINBLOCK_US */
+ delta = delta / 1000;
+ if (delta < MINBLOCK_US)
+ return 0;
+
+ return update_counts(ctx, pid, delta);
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
new file mode 100644
index 000000000000..5557b5393642
--- /dev/null
+++ b/samples/bpf/offwaketime_user.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Facebook
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/perf_event.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "trace_helpers.h"
+
+#define PRINT_RAW_ADDR 0
+
+/* counts, stackmap */
+static int map_fd[2];
+
+/*
+ * Print the symbol name for @addr followed by ';' (and the raw address
+ * when PRINT_RAW_ADDR is set). A zero address prints nothing; an address
+ * with no symbol prints a diagnostic line instead.
+ */
+static void print_ksym(__u64 addr)
+{
+	struct ksym *entry;
+
+	if (addr == 0)
+		return;
+
+	entry = ksym_search(addr);
+	if (entry == NULL) {
+		printf("ksym not found. Is kallsyms loaded?\n");
+		return;
+	}
+
+	if (!PRINT_RAW_ADDR) {
+		printf("%s;", entry->name);
+		return;
+	}
+	printf("%s/%llx;", entry->name, addr);
+}
+
+#define TASK_COMM_LEN 16
+
+/* Key layout of the "counts" map as read by print_stacks(): waker and
+ * target task names plus the stackmap ids of each.
+ * NOTE(review): presumably must stay identical to struct key_t in
+ * offwaketime.bpf.c - confirm when changing either.
+ */
+struct key_t {
+ char waker[TASK_COMM_LEN];
+ char target[TASK_COMM_LEN];
+ __u32 wret;
+ __u32 tret;
+};
+
+/*
+ * Print one line for @key: the target name, the target stack (walked from
+ * the last slot to the first), "-;", the waker stack (first to last), then
+ * the waker name and @count. A stack that cannot be looked up is printed
+ * as "---;". Warns once about -EEXIST stack ids and reports other negative
+ * ids.
+ */
+static void print_stack(struct key_t *key, __u64 count)
+{
+	__u64 ip[PERF_MAX_STACK_DEPTH] = {};
+	int stackmap_fd = map_fd[1];
+	static bool warned;
+	bool collided, failed;
+	int i;
+
+	printf("%s;", key->target);
+	if (bpf_map_lookup_elem(stackmap_fd, &key->tret, ip) == 0) {
+		i = PERF_MAX_STACK_DEPTH;
+		while (i-- > 0)
+			print_ksym(ip[i]);
+	} else {
+		printf("---;");
+	}
+
+	printf("-;");
+	if (bpf_map_lookup_elem(stackmap_fd, &key->wret, ip) == 0) {
+		for (i = 0; i < PERF_MAX_STACK_DEPTH; i++)
+			print_ksym(ip[i]);
+	} else {
+		printf("---;");
+	}
+	printf(";%s %lld\n", key->waker, count);
+
+	collided = key->tret == -EEXIST || key->wret == -EEXIST;
+	failed = (int)(key->tret) < 0 || (int)(key->wret) < 0;
+
+	if (collided && !warned) {
+		printf("stackmap collisions seen. Consider increasing size\n");
+		warned = true;
+	} else if (failed) {
+		printf("err stackid %d %d\n", key->tret, key->wret);
+	}
+}
+
+/* Walk every key of map @fd and print its entry with print_stack(). */
+static void print_stacks(int fd)
+{
+	struct key_t cursor = {}, found;
+	__u64 count;
+
+	for (; bpf_map_get_next_key(fd, &cursor, &found) == 0; cursor = found) {
+		bpf_map_lookup_elem(fd, &found, &count);
+		print_stack(&found, count);
+	}
+}
+
+/* Signal handler (installed for SIGINT and SIGTERM in main): dump the
+ * counts map and terminate with status 0.
+ */
+static void int_exit(int sig)
+{
+ print_stacks(map_fd[0]);
+ exit(0);
+}
+
+/*
+ * Usage: <prog> [delay_seconds]
+ *
+ * Loads "<argv[0]>.bpf.o", attaches its programs, sleeps for the given
+ * delay (default 1 second) and prints the collected stacks. Returns 2 if
+ * kallsyms cannot be loaded and 0 otherwise, including on later errors.
+ */
+int main(int argc, char **argv)
+{
+ struct bpf_object *obj = NULL;
+ /* One slot per attached program; no bounds check is done below */
+ struct bpf_link *links[2];
+ struct bpf_program *prog;
+ int delay = 1, i = 0;
+ char filename[256];
+
+ if (load_kallsyms()) {
+ printf("failed to process /proc/kallsyms\n");
+ return 2;
+ }
+
+ snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+ obj = bpf_object__open_file(filename, NULL);
+ if (libbpf_get_error(obj)) {
+ fprintf(stderr, "ERROR: opening BPF object file failed\n");
+ /* Reset so the close in the cleanup path gets NULL */
+ obj = NULL;
+ goto cleanup;
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ fprintf(stderr, "ERROR: loading BPF object file failed\n");
+ goto cleanup;
+ }
+
+ map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts");
+ map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap");
+ if (map_fd[0] < 0 || map_fd[1] < 0) {
+ fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+ goto cleanup;
+ }
+
+ /* From here on an interrupt dumps the results before exiting */
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ bpf_object__for_each_program(prog, obj) {
+ links[i] = bpf_program__attach(prog);
+ if (libbpf_get_error(links[i])) {
+ fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+ links[i] = NULL;
+ goto cleanup;
+ }
+ i++;
+ }
+
+ if (argc > 1)
+ delay = atoi(argv[1]);
+ sleep(delay);
+ print_stacks(map_fd[0]);
+
+cleanup:
+ /* i counts the links attached so far; destroy them in reverse order */
+ for (i--; i >= 0; i--)
+ bpf_link__destroy(links[i]);
+
+ bpf_object__close(obj);
+ return 0;
+}
diff --git a/samples/bpf/parse_ldabs.c b/samples/bpf/parse_ldabs.c
new file mode 100644
index 000000000000..c6f65f90a097
--- /dev/null
+++ b/samples/bpf/parse_ldabs.c
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <uapi/linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+
+#define DEFAULT_PKTGEN_UDP_PORT 9
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1FFF
+
+/*
+ * Non-zero when the IPv4 header at offset @nhoff has the MF bit set or a
+ * non-zero fragment offset in its frag_off field.
+ */
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
+{
+	const __u64 frag_off_at = nhoff + offsetof(struct iphdr, frag_off);
+
+	return load_half(ctx, frag_off_at) & (IP_MF | IP_OFFSET);
+}
+
+/*
+ * Return TC_ACT_SHOT for unfragmented IPv4/UDP packets with a 20-byte IP
+ * header (first IP byte 0x45) whose UDP destination port is
+ * DEFAULT_PKTGEN_UDP_PORT; return 0 for everything else.
+ */
+SEC("ldabs")
+int handle_ingress(struct __sk_buff *skb)
+{
+	const __u64 udp_off = ETH_HLEN + sizeof(struct iphdr);
+
+	if (load_half(skb, offsetof(struct ethhdr, h_proto)) != ETH_P_IP)
+		return 0;
+	if (load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)) != IPPROTO_UDP)
+		return 0;
+	if (load_byte(skb, ETH_HLEN) != 0x45)
+		return 0;
+	if (ip_is_fragment(skb, ETH_HLEN))
+		return 0;
+	if (load_half(skb, udp_off + offsetof(struct udphdr, dest)) != DEFAULT_PKTGEN_UDP_PORT)
+		return 0;
+	return TC_ACT_SHOT;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/parse_simple.c b/samples/bpf/parse_simple.c
new file mode 100644
index 000000000000..4a486cb1e0df
--- /dev/null
+++ b/samples/bpf/parse_simple.c
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <uapi/linux/bpf.h>
+#include <net/ip.h>
+#include <bpf/bpf_helpers.h>
+
+#define DEFAULT_PKTGEN_UDP_PORT 9
+
+/* copy of 'struct ethhdr' without __packed */
+struct eth_hdr {
+ unsigned char h_dest[ETH_ALEN]; /* destination MAC address */
+ unsigned char h_source[ETH_ALEN]; /* source MAC address */
+ unsigned short h_proto; /* EtherType; compared against htons() values */
+};
+
+/*
+ * Direct-packet-access variant: after one bounds check covering the
+ * Ethernet, IPv4 and UDP headers, return TC_ACT_SHOT for unfragmented
+ * IPv4/UDP packets with ihl == 5 and destination port
+ * DEFAULT_PKTGEN_UDP_PORT; return 0 otherwise.
+ */
+SEC("simple")
+int handle_ingress(struct __sk_buff *skb)
+{
+	void *pkt = (void *)(long)skb->data;
+	void *pkt_end = (void *)(long)skb->data_end;
+	struct eth_hdr *l2 = pkt;
+	struct iphdr *l3 = pkt + sizeof(*l2);
+	struct udphdr *l4 = pkt + sizeof(*l2) + sizeof(*l3);
+
+	/* single length check */
+	if (pkt + sizeof(*l2) + sizeof(*l3) + sizeof(*l4) > pkt_end)
+		return 0;
+
+	if (l2->h_proto != htons(ETH_P_IP))
+		return 0;
+	if (l3->protocol != IPPROTO_UDP)
+		return 0;
+	if (l3->ihl != 5)
+		return 0;
+	if (ip_is_fragment(l3))
+		return 0;
+	if (l4->dest != htons(DEFAULT_PKTGEN_UDP_PORT))
+		return 0;
+	return TC_ACT_SHOT;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/parse_varlen.c b/samples/bpf/parse_varlen.c
new file mode 100644
index 000000000000..d8623846e810
--- /dev/null
+++ b/samples/bpf/parse_varlen.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <uapi/linux/bpf.h>
+#include <net/ip.h>
+#include <bpf/bpf_helpers.h>
+
+#define DEFAULT_PKTGEN_UDP_PORT 9
+#define DEBUG 0
+
+/*
+ * Inspect the TCP header located tp_off bytes into the packet.
+ *
+ * Returns TC_ACT_SHOT when the source or destination port is 80, and 0
+ * otherwise (including when the header does not fit before data_end).
+ */
+static int tcp(void *data, uint64_t tp_off, void *data_end)
+{
+ struct tcphdr *tcp = data + tp_off;
+
+ /* the whole header must lie inside the packet before any field is read */
+ if (tcp + 1 > data_end)
+ return 0;
+ if (tcp->dest == htons(80) || tcp->source == htons(80))
+ return TC_ACT_SHOT;
+ return 0;
+}
+
+/*
+ * Inspect the UDP header located tp_off bytes into the packet.
+ *
+ * Returns TC_ACT_SHOT when the source or destination port equals
+ * DEFAULT_PKTGEN_UDP_PORT, and 0 otherwise (including when the header does
+ * not fit before data_end).
+ */
+static int udp(void *data, uint64_t tp_off, void *data_end)
+{
+ struct udphdr *udp = data + tp_off;
+
+ /* the whole header must lie inside the packet before any field is read */
+ if (udp + 1 > data_end)
+ return 0;
+ if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) ||
+ udp->source == htons(DEFAULT_PKTGEN_UDP_PORT)) {
+ /* DEBUG is defined as 0 in this file, so the trace below is disabled */
+ if (DEBUG) {
+ char fmt[] = "udp port 9 indeed\n";
+
+ bpf_trace_printk(fmt, sizeof(fmt));
+ }
+ return TC_ACT_SHOT;
+ }
+ return 0;
+}
+
+/*
+ * Parse the IPv4 header at offset nh_off and pass the transport header to
+ * tcp() or udp().
+ *
+ * Packets whose outer header is a fragment are ignored. When the protocol
+ * is IPPROTO_IPIP, one level of encapsulation is skipped and the inner
+ * header's protocol is used instead. Returns the tcp()/udp() verdict, or 0
+ * when the packet is too short or carries another protocol.
+ */
+static int parse_ipv4(void *data, uint64_t nh_off, void *data_end)
+{
+ struct iphdr *iph;
+ uint64_t ihl_len;
+
+ iph = data + nh_off;
+ if (iph + 1 > data_end)
+ return 0;
+
+ if (ip_is_fragment(iph))
+ return 0;
+ /* ihl is multiplied by 4 to get the header length in bytes */
+ ihl_len = iph->ihl * 4;
+
+ if (iph->protocol == IPPROTO_IPIP) {
+ /* iph now points at the inner header; ihl_len grows to span both headers */
+ iph = data + nh_off + ihl_len;
+ if (iph + 1 > data_end)
+ return 0;
+ ihl_len += iph->ihl * 4;
+ }
+
+ if (iph->protocol == IPPROTO_TCP)
+ return tcp(data, nh_off + ihl_len, data_end);
+ else if (iph->protocol == IPPROTO_UDP)
+ return udp(data, nh_off + ihl_len, data_end);
+ return 0;
+}
+
+/*
+ * Parse the IPv6 header at offset nh_off and pass the transport header to
+ * tcp() or udp().
+ *
+ * Only the nexthdr field of the fixed header is examined. One level of
+ * IPv4-in-IPv6 (IPPROTO_IPIP) or IPv6-in-IPv6 (IPPROTO_IPV6) encapsulation
+ * is skipped. Returns the tcp()/udp() verdict, or 0 when the packet is too
+ * short or carries another protocol.
+ */
+static int parse_ipv6(void *data, uint64_t nh_off, void *data_end)
+{
+ struct ipv6hdr *ip6h;
+ struct iphdr *iph;
+ uint64_t ihl_len = sizeof(struct ipv6hdr);
+ uint64_t nexthdr;
+
+ ip6h = data + nh_off;
+ if (ip6h + 1 > data_end)
+ return 0;
+
+ nexthdr = ip6h->nexthdr;
+
+ if (nexthdr == IPPROTO_IPIP) {
+ /* inner IPv4 header: add its length and take its protocol */
+ iph = data + nh_off + ihl_len;
+ if (iph + 1 > data_end)
+ return 0;
+ ihl_len += iph->ihl * 4;
+ nexthdr = iph->protocol;
+ } else if (nexthdr == IPPROTO_IPV6) {
+ /* inner IPv6 header: add its fixed size and take its nexthdr */
+ ip6h = data + nh_off + ihl_len;
+ if (ip6h + 1 > data_end)
+ return 0;
+ ihl_len += sizeof(struct ipv6hdr);
+ nexthdr = ip6h->nexthdr;
+ }
+
+ if (nexthdr == IPPROTO_TCP)
+ return tcp(data, nh_off + ihl_len, data_end);
+ else if (nexthdr == IPPROTO_UDP)
+ return udp(data, nh_off + ihl_len, data_end);
+ return 0;
+}
+
+/*
+ * Entry point: walk the Ethernet header, up to two VLAN tags, and then the
+ * IPv4 or IPv6 header via parse_ipv4()/parse_ipv6().
+ *
+ * h_proto is kept in network byte order exactly as read from the packet,
+ * so every comparison below goes through htons(). Returns the verdict of
+ * the IP parser, or 0 when the packet is too short or not IP.
+ */
+SEC("varlen")
+int handle_ingress(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	struct ethhdr *eth = data;
+	void *data_end = (void *)(long)skb->data_end;
+	uint64_t h_proto, nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return 0;
+
+	h_proto = eth->h_proto;
+
+	/* outer VLAN tag (compare in network byte order, like the IP checks) */
+	if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+		struct vlan_hdr *vhdr;
+
+		vhdr = data + nh_off;
+		nh_off += sizeof(struct vlan_hdr);
+		if (data + nh_off > data_end)
+			return 0;
+		h_proto = vhdr->h_vlan_encapsulated_proto;
+	}
+	/* optional second (inner) VLAN tag */
+	if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+		struct vlan_hdr *vhdr;
+
+		vhdr = data + nh_off;
+		nh_off += sizeof(struct vlan_hdr);
+		if (data + nh_off > data_end)
+			return 0;
+		h_proto = vhdr->h_vlan_encapsulated_proto;
+	}
+	if (h_proto == htons(ETH_P_IP))
+		return parse_ipv4(data, nh_off, data_end);
+	else if (h_proto == htons(ETH_P_IPV6))
+		return parse_ipv6(data, nh_off, data_end);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/run_cookie_uid_helper_example.sh b/samples/bpf/run_cookie_uid_helper_example.sh
new file mode 100755
index 000000000000..fc6bc0451ab4
--- /dev/null
+++ b/samples/bpf/run_cookie_uid_helper_example.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Mounts a bpf filesystem on a temporary directory, runs
+# ./per_socket_stats_example with a program path inside that mount plus the
+# first script argument, and tears everything down again on exit.
+
+# NOTE(review): local_dir and root_dir are not referenced again in this script.
+local_dir="$(pwd)"
+root_dir=$local_dir/../..
+mnt_dir=$(mktemp -d --tmp)
+
+# Cleanup run by the EXIT trap: delete the iptables rule that references the
+# pinned program, unmount the bpf filesystem and remove the temp directory.
+on_exit() {
+ iptables -D OUTPUT -m bpf --object-pinned ${mnt_dir}/bpf_prog -j ACCEPT
+ umount ${mnt_dir}
+ rm -r ${mnt_dir}
+}
+
+trap on_exit EXIT
+mount -t bpf bpf ${mnt_dir}
+./per_socket_stats_example ${mnt_dir}/bpf_prog $1
diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c
new file mode 100644
index 000000000000..a3f8a3998e0a
--- /dev/null
+++ b/samples/bpf/sampleip_kern.c
@@ -0,0 +1,38 @@
+/* Copyright 2016 Netflix, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define MAX_IPS 8192
+
+/*
+ * Hash map from sampled instruction pointer (u64) to the number of times it
+ * was seen (u32). Holds at most MAX_IPS entries.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u64);
+ __type(value, u32);
+ __uint(max_entries, MAX_IPS);
+} ip_map SEC(".maps");
+
+/*
+ * perf_event program: count one sample for the instruction pointer taken
+ * from ctx->regs.
+ *
+ * An existing ip_map entry is incremented; otherwise a new entry with the
+ * value 1 is inserted. Always returns 0.
+ */
+SEC("perf_event")
+int do_sample(struct bpf_perf_event_data *ctx)
+{
+ u64 ip;
+ u32 *value, init_val = 1;
+
+ ip = PT_REGS_IP(&ctx->regs);
+ value = bpf_map_lookup_elem(&ip_map, &ip);
+ if (value)
+ *value += 1;
+ else
+ /* E2BIG not tested for this example only */
+ /* the return value of the insert is ignored */
+ bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST);
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
new file mode 100644
index 000000000000..9283f47844fb
--- /dev/null
+++ b/samples/bpf/sampleip_user.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sampleip: sample instruction pointer and frequency count in a BPF map.
+ *
+ * Copyright 2016 Netflix, Inc.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define DEFAULT_FREQ 99
+#define DEFAULT_SECS 5
+#define MAX_IPS 8192
+
+static int map_fd;
+static int nr_cpus;
+static long _text_addr;
+
+/* Print the command-line synopsis to stdout. */
+static void usage(void)
+{
+	printf("USAGE: sampleip [-F freq] [duration]\n"
+	       " -F freq # sample frequency (Hertz), default 99\n"
+	       " duration # sampling duration (seconds), default 5\n");
+}
+
+/*
+ * Open one CPU-clock software perf event per CPU index below nr_cpus and
+ * attach `prog` to each of them.
+ *
+ * `freq` is stored in sample_period with the attr's freq flag set.
+ * links[cpu] receives the link for that CPU. Returns 0 on success, or 1 as
+ * soon as opening or attaching fails; links stored before the failure are
+ * left for the caller to destroy.
+ */
+static int sampling_start(int freq, struct bpf_program *prog,
+			  struct bpf_link *links[])
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_CPU_CLOCK,
+		.sample_period = freq,
+		.freq = 1,
+		.inherit = 1,
+	};
+	int cpu;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		int fd = sys_perf_event_open(&attr, -1 /* pid */, cpu,
+					     -1 /* group_fd */, 0 /* flags */);
+
+		if (fd < 0) {
+			fprintf(stderr, "ERROR: Initializing perf sampling\n");
+			return 1;
+		}
+
+		links[cpu] = bpf_program__attach_perf_event(prog, fd);
+		if (!libbpf_get_error(links[cpu]))
+			continue;
+
+		/* attach failed: forget the error pointer and release the event fd */
+		fprintf(stderr, "ERROR: Attach perf event\n");
+		links[cpu] = NULL;
+		close(fd);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Destroy the per-CPU links stored by sampling_start().
+ *
+ * A NULL `links` array is accepted and ignored: main() reaches its cleanup
+ * label with links == NULL when the calloc() of the array fails.
+ */
+static void sampling_end(struct bpf_link *links[])
+{
+	int i;
+
+	if (!links)
+		return;
+
+	for (i = 0; i < nr_cpus; i++)
+		bpf_link__destroy(links[i]);
+}
+
+/* One sampled instruction pointer and how often it was seen. */
+struct ipcount {
+	__u64 ip;
+	__u32 count;
+};
+
+/* used for sorting */
+struct ipcount counts[MAX_IPS];
+
+/*
+ * qsort() comparator ordering struct ipcount entries by ascending count.
+ *
+ * The counts are compared explicitly rather than subtracted: the difference
+ * of two __u32 values does not fit in an int and would yield the wrong sign
+ * for large gaps.
+ */
+static int count_cmp(const void *p1, const void *p2)
+{
+	const struct ipcount *a = p1;
+	const struct ipcount *b = p2;
+
+	return (a->count > b->count) - (a->count < b->count);
+}
+
+/*
+ * Print the contents of the IP -> sample-count map behind `fd`, sorted by
+ * ascending count.
+ *
+ * Entries are copied into the global counts[] array (never more than
+ * MAX_IPS of them) and sorted with count_cmp(). Addresses above _text_addr
+ * are resolved with ksym_search(); all others are printed as "(user)".
+ */
+static void print_ip_map(int fd)
+{
+	struct ksym *sym;
+	__u64 key, next_key;
+	__u32 value;
+	int i, max;
+
+	printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT");
+
+	/* fetch IPs and counts; stop before counts[] could overflow */
+	key = 0, i = 0;
+	while (i < MAX_IPS &&
+	       bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+		key = next_key;
+		/* skip entries whose value cannot be read instead of storing garbage */
+		if (bpf_map_lookup_elem(fd, &next_key, &value) != 0)
+			continue;
+		counts[i].ip = next_key;
+		counts[i++].count = value;
+	}
+	max = i;
+
+	/* sort and print */
+	qsort(counts, max, sizeof(struct ipcount), count_cmp);
+	for (i = 0; i < max; i++) {
+		if (counts[i].ip > _text_addr) {
+			sym = ksym_search(counts[i].ip);
+			if (!sym) {
+				printf("ksym not found. Is kallsyms loaded?\n");
+				continue;
+			}
+
+			printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name,
+			       counts[i].count);
+		} else {
+			printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)",
+			       counts[i].count);
+		}
+	}
+
+	if (max == MAX_IPS) {
+		printf("WARNING: IP hash was full (max %d entries); ", max);
+		printf("may have dropped samples\n");
+	}
+}
+
+/*
+ * Handler installed by main() for SIGINT and SIGTERM: print the collected
+ * map contents and terminate the process with status 0.
+ *
+ * NOTE(review): printf() and exit() are called from signal context here;
+ * confirm that is acceptable for this sample.
+ */
+static void int_exit(int sig)
+{
+ printf("\n");
+ print_ip_map(map_fd);
+ exit(0);
+}
+
+/*
+ * sampleip entry point.
+ *
+ * Parses "[-F freq] [duration]", loads "<argv[0]>_kern.o", attaches its
+ * do_sample program to a perf event on every online CPU, sleeps for the
+ * requested duration and prints the collected ip_map.
+ *
+ * Returns 0 on success (and after printing usage for -h or an unknown
+ * option), 1 on bad arguments or a setup failure, 2 when kallsyms cannot
+ * be loaded and 3 when the "_text" symbol is missing.
+ */
+int main(int argc, char **argv)
+{
+	int opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS, error = 1;
+	struct bpf_object *obj = NULL;
+	struct bpf_program *prog;
+	struct bpf_link **links = NULL;
+	char filename[256];
+
+	/* process arguments */
+	while ((opt = getopt(argc, argv, "F:h")) != -1) {
+		switch (opt) {
+		case 'F':
+			freq = atoi(optarg);
+			break;
+		case 'h':
+		default:
+			usage();
+			return 0;
+		}
+	}
+	if (argc - optind == 1)
+		secs = atoi(argv[optind]);
+	/* negative values would become huge unsigned periods / sleep times */
+	if (freq <= 0 || secs <= 0) {
+		usage();
+		return 1;
+	}
+
+	/* initialize kernel symbol translation */
+	if (load_kallsyms()) {
+		fprintf(stderr, "ERROR: loading /proc/kallsyms\n");
+		return 2;
+	}
+
+	/* used to determine whether the address is kernel space */
+	_text_addr = ksym_get_addr("_text");
+	if (!_text_addr) {
+		fprintf(stderr, "ERROR: no '_text' in /proc/kallsyms\n");
+		return 3;
+	}
+
+	/* create perf FDs for each CPU */
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (nr_cpus <= 0) {
+		fprintf(stderr, "ERROR: querying number of online CPUs\n");
+		goto cleanup;
+	}
+	links = calloc(nr_cpus, sizeof(struct bpf_link *));
+	if (!links) {
+		fprintf(stderr, "ERROR: malloc of links\n");
+		goto cleanup;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		obj = NULL;
+		goto cleanup;
+	}
+
+	prog = bpf_object__find_program_by_name(obj, "do_sample");
+	if (!prog) {
+		fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+		goto cleanup;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd = bpf_object__find_map_fd_by_name(obj, "ip_map");
+	if (map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	/* do sampling */
+	printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n",
+	       freq, secs);
+	if (sampling_start(freq, prog, links) != 0)
+		goto cleanup;
+
+	sleep(secs);
+	error = 0;
+
+cleanup:
+	/* links is still NULL when we bailed out before or during its allocation */
+	if (links)
+		sampling_end(links);
+	/* output sample counts */
+	if (!error)
+		print_ip_map(map_fd);
+
+	free(links);
+	bpf_object__close(obj);
+	return error;
+}
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
new file mode 100644
index 000000000000..5b66f2401b96
--- /dev/null
+++ b/samples/bpf/sock_example.c
@@ -0,0 +1,111 @@
+/* eBPF example program:
+ * - creates arraymap in kernel with key 4 bytes and value 8 bytes
+ *
+ * - loads eBPF program:
+ * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
+ * *(u32*)(fp - 4) = r0;
+ * // assuming packet is IPv4, lookup ip->proto in a map
+ * value = bpf_map_lookup_elem(map_fd, fp - 4);
+ * if (value)
+ * (*(u64*)value) += 1;
+ *
+ * - attaches this program to loopback interface "lo" raw socket
+ *
+ * - every second user space reads map[tcp], map[udp], map[icmp] to see
+ * how many packets of given protocol were seen on "lo"
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <stddef.h>
+#include <bpf/bpf.h>
+#include "bpf_insn.h"
+#include "sock_example.h"
+#include "bpf_util.h"
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+/*
+ * Create a 256-entry array map, load a socket filter that increments
+ * map[ip protocol] for each packet, attach it to a raw socket bound to
+ * "lo", and print the TCP/UDP/ICMP counters once per second for 10 seconds.
+ *
+ * Returns 0 when the whole run completed and 1 when any setup step failed,
+ * so the process exit status reflects the failure.
+ */
+static int test_sock(void)
+{
+	int sock = -1, map_fd, prog_fd, i, key;
+	long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;
+	int ret = 1;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(key), sizeof(value),
+				256, NULL);
+	if (map_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		goto cleanup;
+	}
+
+	struct bpf_insn prog[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */),
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+		BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+		BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+		BPF_EXIT_INSN(),
+	};
+	size_t insns_cnt = ARRAY_SIZE(prog);
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		.log_buf = bpf_log_buf,
+		.log_size = BPF_LOG_BUF_SIZE,
+	);
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL",
+				prog, insns_cnt, &opts);
+	if (prog_fd < 0) {
+		printf("failed to load prog '%s'\n", strerror(errno));
+		goto cleanup;
+	}
+
+	/* open_raw_sock() prints its own diagnostic on failure */
+	sock = open_raw_sock("lo");
+	if (sock < 0)
+		goto cleanup;
+
+	if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+		       sizeof(prog_fd)) < 0) {
+		printf("setsockopt %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	for (i = 0; i < 10; i++) {
+		key = IPPROTO_TCP;
+		assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
+
+		key = IPPROTO_UDP;
+		assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0);
+
+		key = IPPROTO_ICMP;
+		assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0);
+
+		printf("TCP %lld UDP %lld ICMP %lld packets\n",
+		       tcp_cnt, udp_cnt, icmp_cnt);
+		sleep(1);
+	}
+	ret = 0;
+
+cleanup:
+	/* maps, programs, raw sockets will auto cleanup on process exit */
+	return ret;
+}
+
+/*
+ * Start a background ping to generate loopback traffic, run test_sock(),
+ * then close the ping stream and return test_sock()'s result as the exit
+ * status.
+ */
+int main(void)
+{
+	FILE *f;
+	int ret;
+
+	f = popen("ping -4 -c5 localhost", "r");
+	if (!f)
+		printf("failed to start ping '%s'\n", strerror(errno));
+
+	ret = test_sock();
+
+	/* pclose() waits for ping to finish, so no child is left unreaped */
+	if (f)
+		pclose(f);
+
+	return ret;
+}
diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h
new file mode 100644
index 000000000000..a27d7579bc73
--- /dev/null
+++ b/samples/bpf/sock_example.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdlib.h>
+#include <stdio.h>
+#include <linux/unistd.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <linux/if_ether.h>
+#include <net/if.h>
+#include <linux/if_packet.h>
+#include <arpa/inet.h>
+
+/*
+ * Open a non-blocking, close-on-exec AF_PACKET raw socket for all
+ * protocols and bind it to the interface called `name`.
+ *
+ * Returns the socket descriptor, or -1 after printing a diagnostic when
+ * the socket cannot be created, the interface does not exist, or bind()
+ * fails.
+ */
+static inline int open_raw_sock(const char *name)
+{
+	struct sockaddr_ll sll;
+	unsigned int ifindex;
+	int sock;
+
+	sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
+	if (sock < 0) {
+		printf("cannot create raw socket\n");
+		return -1;
+	}
+
+	/* if_nametoindex() returns 0 for an unknown interface; don't bind to index 0 */
+	ifindex = if_nametoindex(name);
+	if (!ifindex) {
+		printf("cannot find interface %s: %s\n", name, strerror(errno));
+		close(sock);
+		return -1;
+	}
+
+	memset(&sll, 0, sizeof(sll));
+	sll.sll_family = AF_PACKET;
+	sll.sll_ifindex = ifindex;
+	sll.sll_protocol = htons(ETH_P_ALL);
+	if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
+		printf("bind to %s: %s\n", name, strerror(errno));
+		close(sock);
+		return -1;
+	}
+
+	return sock;
+}
diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c
new file mode 100644
index 000000000000..431c956460ad
--- /dev/null
+++ b/samples/bpf/sockex1_kern.c
@@ -0,0 +1,30 @@
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+
+/*
+ * Array map with 256 entries of type long, keyed by u32. bpf_prog1() uses
+ * the IP protocol byte as the index and accumulates skb->len in the value.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, 256);
+} my_map SEC(".maps");
+
+/*
+ * Socket filter: for packets whose pkt_type is PACKET_OUTGOING, add
+ * skb->len to my_map[protocol].
+ *
+ * The protocol byte is read at the offset it has in an IPv4 header placed
+ * directly after the Ethernet header; the packet is not checked to
+ * actually be IPv4. Always returns 0.
+ */
+SEC("socket1")
+int bpf_prog1(struct __sk_buff *skb)
+{
+ int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
+ long *value;
+
+ if (skb->pkt_type != PACKET_OUTGOING)
+ return 0;
+
+ value = bpf_map_lookup_elem(&my_map, &index);
+ if (value)
+ __sync_fetch_and_add(value, skb->len);
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c
new file mode 100644
index 000000000000..9e8d39e245c1
--- /dev/null
+++ b/samples/bpf/sockex1_user.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "sock_example.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+
+/*
+ * Load "<argv[0]>_kern.o", attach its first program as a socket filter to
+ * a raw socket on "lo", start a ping, and print the per-protocol byte
+ * counters from "my_map" once per second for five seconds.
+ *
+ * Returns 0 on success and 1 when the object, program, map or socket
+ * cannot be set up.
+ */
+int main(int ac, char **argv)
+{
+	struct bpf_object *obj;
+	struct bpf_program *prog;
+	int map_fd, prog_fd;
+	char filename[256];
+	int i, sock, err;
+	FILE *f;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj))
+		return 1;
+
+	prog = bpf_object__next_program(obj, NULL);
+	if (!prog) {
+		fprintf(stderr, "ERROR: no program found in %s\n", filename);
+		return 1;
+	}
+	bpf_program__set_type(prog, BPF_PROG_TYPE_SOCKET_FILTER);
+
+	err = bpf_object__load(obj);
+	if (err)
+		return 1;
+
+	prog_fd = bpf_program__fd(prog);
+	map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+	if (map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		return 1;
+	}
+
+	/* open_raw_sock() prints its own diagnostic on failure */
+	sock = open_raw_sock("lo");
+	if (sock < 0)
+		return 1;
+
+	assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+			  sizeof(prog_fd)) == 0);
+
+	f = popen("ping -4 -c5 localhost", "r");
+	(void) f;
+
+	for (i = 0; i < 5; i++) {
+		long long tcp_cnt, udp_cnt, icmp_cnt;
+		int key;
+
+		key = IPPROTO_TCP;
+		assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
+
+		key = IPPROTO_UDP;
+		assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0);
+
+		key = IPPROTO_ICMP;
+		assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0);
+
+		printf("TCP %lld UDP %lld ICMP %lld bytes\n",
+		       tcp_cnt, udp_cnt, icmp_cnt);
+		sleep(1);
+	}
+
+	return 0;
+}
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c
new file mode 100644
index 000000000000..f93d9145ab8a
--- /dev/null
+++ b/samples/bpf/sockex2_kern.c
@@ -0,0 +1,222 @@
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/if.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/if_tunnel.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1FFF
+
+/*
+ * Layout of a VLAN tag; used with offsetof()/sizeof() by flow_dissector()
+ * to find the encapsulated protocol and to step over the tag.
+ */
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+/* Result of flow_dissector(): addresses, ports and protocol of one packet. */
+struct flow_key_record {
+ __be32 src; /* IPv4 saddr, or ipv6_addr_hash() of the IPv6 saddr */
+ __be32 dst; /* IPv4 daddr, or ipv6_addr_hash() of the IPv6 daddr */
+ union {
+ __be32 ports; /* 32-bit word loaded at the ports offset */
+ __be16 port16[2];
+ };
+ __u16 thoff; /* final value of nhoff in flow_dissector() */
+ __u8 ip_proto;
+};
+
+/*
+ * Offset added to nhoff by flow_dissector() before it loads the ports
+ * word: 4 for IPPROTO_AH and 0 for every other protocol.
+ */
+static inline int proto_ports_offset(__u64 proto)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_ESP:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ return 0;
+ case IPPROTO_AH:
+ return 4;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * Non-zero when the frag_off field of the IPv4 header at offset nhoff has
+ * IP_MF or any IP_OFFSET bit set.
+ */
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
+{
+ return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
+ & (IP_MF | IP_OFFSET);
+}
+
+/*
+ * Fold the 16 bytes starting at packet offset `off` into 32 bits by XORing
+ * the four words loaded from off, off + 4, off + 8 and off + 12.
+ */
+static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
+{
+ __u64 w0 = load_word(ctx, off);
+ __u64 w1 = load_word(ctx, off + 4);
+ __u64 w2 = load_word(ctx, off + 8);
+ __u64 w3 = load_word(ctx, off + 12);
+
+ return (__u32)(w0 ^ w1 ^ w2 ^ w3);
+}
+
+/*
+ * Parse the IPv4 header at offset nhoff.
+ *
+ * Stores the protocol in *ip_proto (0 for fragments), records the source
+ * and destination addresses in `flow` unless the protocol is IPPROTO_GRE,
+ * and returns the offset just past the header.
+ */
+static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
+ struct flow_key_record *flow)
+{
+ __u64 verlen;
+
+ if (unlikely(ip_is_fragment(skb, nhoff)))
+ *ip_proto = 0;
+ else
+ *ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
+
+ if (*ip_proto != IPPROTO_GRE) {
+ flow->src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
+ flow->dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
+ }
+
+ /* first header byte: 0x45 is special-cased as a 20-byte header */
+ verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
+ if (likely(verlen == 0x45))
+ nhoff += 20;
+ else
+ /* low nibble times 4 gives the header length in bytes */
+ nhoff += (verlen & 0xF) << 2;
+
+ return nhoff;
+}
+
+/*
+ * Parse the IPv6 header at offset nhoff.
+ *
+ * Stores nexthdr in *ip_proto, records 32-bit hashes of the source and
+ * destination addresses in `flow`, and returns the offset just past the
+ * fixed-size header.
+ */
+static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
+ struct flow_key_record *flow)
+{
+ *ip_proto = load_byte(skb,
+ nhoff + offsetof(struct ipv6hdr, nexthdr));
+ flow->src = ipv6_addr_hash(skb,
+ nhoff + offsetof(struct ipv6hdr, saddr));
+ flow->dst = ipv6_addr_hash(skb,
+ nhoff + offsetof(struct ipv6hdr, daddr));
+ nhoff += sizeof(struct ipv6hdr);
+
+ return nhoff;
+}
+
+/*
+ * Fill `flow` from the packet in `skb`.
+ *
+ * Walks the Ethernet header, an optional 802.1AD and an optional 802.1Q
+ * tag, then the IPv4 or IPv6 header. One level of GRE, IPIP or
+ * IPv6-in-IP encapsulation is followed. Returns false when the (outer or
+ * GRE-inner) protocol is neither IPv4 nor IPv6, true otherwise.
+ */
+static inline bool flow_dissector(struct __sk_buff *skb,
+ struct flow_key_record *flow)
+{
+ __u64 nhoff = ETH_HLEN;
+ __u64 ip_proto;
+ /* EtherType at byte offset 12 of the frame */
+ __u64 proto = load_half(skb, 12);
+ int poff;
+
+ if (proto == ETH_P_8021AD) {
+ proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+ h_vlan_encapsulated_proto));
+ nhoff += sizeof(struct vlan_hdr);
+ }
+
+ if (proto == ETH_P_8021Q) {
+ proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+ h_vlan_encapsulated_proto));
+ nhoff += sizeof(struct vlan_hdr);
+ }
+
+ if (likely(proto == ETH_P_IP))
+ nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+ else if (proto == ETH_P_IPV6)
+ nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+ else
+ return false;
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ };
+
+ __u64 gre_flags = load_half(skb,
+ nhoff + offsetof(struct gre_hdr, flags));
+ __u64 gre_proto = load_half(skb,
+ nhoff + offsetof(struct gre_hdr, proto));
+
+ /* headers with the version or routing flag set are not parsed further */
+ if (gre_flags & (GRE_VERSION|GRE_ROUTING))
+ break;
+
+ proto = gre_proto;
+ /* 4-byte base header plus 4 bytes per optional field present */
+ nhoff += 4;
+ if (gre_flags & GRE_CSUM)
+ nhoff += 4;
+ if (gre_flags & GRE_KEY)
+ nhoff += 4;
+ if (gre_flags & GRE_SEQ)
+ nhoff += 4;
+
+ if (proto == ETH_P_8021Q) {
+ proto = load_half(skb,
+ nhoff + offsetof(struct vlan_hdr,
+ h_vlan_encapsulated_proto));
+ nhoff += sizeof(struct vlan_hdr);
+ }
+
+ if (proto == ETH_P_IP)
+ nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+ else if (proto == ETH_P_IPV6)
+ nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+ else
+ return false;
+ break;
+ }
+ case IPPROTO_IPIP:
+ nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
+ break;
+ case IPPROTO_IPV6:
+ nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
+ break;
+ default:
+ break;
+ }
+
+ flow->ip_proto = ip_proto;
+ poff = proto_ports_offset(ip_proto);
+ /* proto_ports_offset() only returns 0 or 4, so this branch is always taken */
+ if (poff >= 0) {
+ nhoff += poff;
+ flow->ports = load_word(skb, nhoff);
+ }
+
+ flow->thoff = (__u16) nhoff;
+
+ return true;
+}
+
+/* Value type of hash_map: packet and byte totals for one key. */
+struct pair {
+ long packets;
+ long bytes;
+};
+
+/*
+ * Hash map from a __be32 key to struct pair, at most 1024 entries.
+ * bpf_prog2() uses the dissected flow's dst field as the key.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, __be32);
+ __type(value, struct pair);
+ __uint(max_entries, 1024);
+} hash_map SEC(".maps");
+
+/*
+ * Socket filter: account each dissectable packet in hash_map under its
+ * destination (flow.dst).
+ *
+ * An existing entry gets its packet count incremented and skb->len added
+ * to its byte count; otherwise a new {1, skb->len} entry is written.
+ * Always returns 0.
+ */
+SEC("socket2")
+int bpf_prog2(struct __sk_buff *skb)
+{
+ struct flow_key_record flow = {};
+ struct pair *value;
+ u32 key;
+
+ if (!flow_dissector(skb, &flow))
+ return 0;
+
+ key = flow.dst;
+ value = bpf_map_lookup_elem(&hash_map, &key);
+ if (value) {
+ __sync_fetch_and_add(&value->packets, 1);
+ __sync_fetch_and_add(&value->bytes, skb->len);
+ } else {
+ struct pair val = {1, skb->len};
+
+ bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
+ }
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
new file mode 100644
index 000000000000..2c18471336f0
--- /dev/null
+++ b/samples/bpf/sockex2_user.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "sock_example.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+
+/* User-space view of the values read from the "hash_map" map in main(). */
+struct pair {
+ __u64 packets;
+ __u64 bytes;
+};
+
+/*
+ * Load "<argv[0]>_kern.o", attach its first program as a socket filter to
+ * a raw socket on "lo", start a ping, and dump every entry of "hash_map"
+ * once per second for five seconds.
+ *
+ * Returns 0 on success and 1 when the object, program, map or socket
+ * cannot be set up.
+ */
+int main(int ac, char **argv)
+{
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int map_fd, prog_fd;
+	char filename[256];
+	int i, sock, err;
+	FILE *f;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj))
+		return 1;
+
+	prog = bpf_object__next_program(obj, NULL);
+	if (!prog) {
+		fprintf(stderr, "ERROR: no program found in %s\n", filename);
+		return 1;
+	}
+	bpf_program__set_type(prog, BPF_PROG_TYPE_SOCKET_FILTER);
+
+	err = bpf_object__load(obj);
+	if (err)
+		return 1;
+
+	prog_fd = bpf_program__fd(prog);
+	map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
+	if (map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		return 1;
+	}
+
+	/* open_raw_sock() prints its own diagnostic on failure */
+	sock = open_raw_sock("lo");
+	if (sock < 0)
+		return 1;
+
+	assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+			  sizeof(prog_fd)) == 0);
+
+	f = popen("ping -4 -c5 localhost", "r");
+	(void) f;
+
+	for (i = 0; i < 5; i++) {
+		int key = 0, next_key;
+		struct pair value;
+
+		while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
+			key = next_key;
+			/* skip entries whose value cannot be read instead of printing garbage */
+			if (bpf_map_lookup_elem(map_fd, &next_key, &value) != 0)
+				continue;
+			/* the counters are unsigned 64-bit: print them as such */
+			printf("ip %s bytes %llu packets %llu\n",
+			       inet_ntoa((struct in_addr){htonl(next_key)}),
+			       (unsigned long long)value.bytes,
+			       (unsigned long long)value.packets);
+		}
+		sleep(1);
+	}
+	return 0;
+}
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
new file mode 100644
index 000000000000..822c13242251
--- /dev/null
+++ b/samples/bpf/sockex3_kern.c
@@ -0,0 +1,304 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/if.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/if_tunnel.h>
+#include <uapi/linux/mpls.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1FFF
+
+#define PARSE_VLAN 1
+#define PARSE_MPLS 2
+#define PARSE_IP 3
+#define PARSE_IPV6 4
+
+/*
+ * Layout of a VLAN tag; used with offsetof()/sizeof() by bpf_func_vlan()
+ * to find the encapsulated protocol and to step over the tag.
+ */
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+/* Flow description built up across the tail-called parsers; key of hash_map. */
+struct flow_key_record {
+ __be32 src; /* IPv4 saddr, or ipv6_addr_hash() of the IPv6 saddr */
+ __be32 dst; /* IPv4 daddr, or ipv6_addr_hash() of the IPv6 daddr */
+ union {
+ __be32 ports; /* word loaded at the transport header for TCP/UDP */
+ __be16 port16[2];
+ };
+ __u32 ip_proto;
+};
+
+static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto);
+
+/*
+ * Non-zero when the frag_off field of the IPv4 header at offset nhoff has
+ * IP_MF or any IP_OFFSET bit set.
+ */
+static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
+{
+ return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
+ & (IP_MF | IP_OFFSET);
+}
+
+/*
+ * Fold the 16 bytes starting at packet offset `off` into 32 bits by XORing
+ * the four words loaded from off, off + 4, off + 8 and off + 12.
+ */
+static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
+{
+ __u64 w0 = load_word(ctx, off);
+ __u64 w1 = load_word(ctx, off + 4);
+ __u64 w2 = load_word(ctx, off + 8);
+ __u64 w3 = load_word(ctx, off + 12);
+
+ return (__u32)(w0 ^ w1 ^ w2 ^ w3);
+}
+
+/* Value type of percpu_map: the flow being assembled for the current packet. */
+struct globals {
+ struct flow_key_record flow;
+};
+
+/*
+ * Array map of 32 struct globals slots. this_cpu_globals() indexes it with
+ * the current processor id; its callers return early when the lookup
+ * yields NULL.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, __u32);
+ __type(value, struct globals);
+ __uint(max_entries, 32);
+} percpu_map SEC(".maps");
+
+/*
+ * Use a poor man's per_cpu until native support is ready: return the
+ * percpu_map slot whose index is the current processor id, or whatever the
+ * lookup returns when there is no such slot.
+ */
+static struct globals *this_cpu_globals(void)
+{
+ u32 key = bpf_get_smp_processor_id();
+
+ return bpf_map_lookup_elem(&percpu_map, &key);
+}
+
+/* some simple stats for user space consumption: per-flow packet and byte totals */
+struct pair {
+ __u64 packets;
+ __u64 bytes;
+};
+
+/*
+ * Hash map from struct flow_key_record to struct pair, at most 1024
+ * entries. Written by update_stats().
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct flow_key_record);
+ __type(value, struct pair);
+ __uint(max_entries, 1024);
+} hash_map SEC(".maps");
+
+/*
+ * Account the packet in hash_map under the flow stored in `g`.
+ *
+ * An existing entry gets its packet count incremented and skb->len added
+ * to its byte count; otherwise a new {1, skb->len} entry is written.
+ */
+static void update_stats(struct __sk_buff *skb, struct globals *g)
+{
+ /* the flow is copied to a local before being used as the map key */
+ struct flow_key_record key = g->flow;
+ struct pair *value;
+
+ value = bpf_map_lookup_elem(&hash_map, &key);
+ if (value) {
+ __sync_fetch_and_add(&value->packets, 1);
+ __sync_fetch_and_add(&value->bytes, skb->len);
+ } else {
+ struct pair val = {1, skb->len};
+
+ bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
+ }
+}
+
+/*
+ * Continue parsing after an IPv4/IPv6 header, at the offset stored in
+ * skb->cb[0].
+ *
+ * GRE, IPIP and IPv6-in-IP are handed back to parse_eth_proto() for the
+ * encapsulated protocol. For TCP, UDP and ICMP the flow in `g` is
+ * completed and update_stats() is called. Other protocols are ignored.
+ */
+static __always_inline void parse_ip_proto(struct __sk_buff *skb,
+ struct globals *g, __u32 ip_proto)
+{
+ __u32 nhoff = skb->cb[0];
+ int poff;
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ };
+
+ __u32 gre_flags = load_half(skb,
+ nhoff + offsetof(struct gre_hdr, flags));
+ __u32 gre_proto = load_half(skb,
+ nhoff + offsetof(struct gre_hdr, proto));
+
+ /* headers with the version or routing flag set are not parsed further */
+ if (gre_flags & (GRE_VERSION|GRE_ROUTING))
+ break;
+
+ /* 4-byte base header plus 4 bytes per optional field present */
+ nhoff += 4;
+ if (gre_flags & GRE_CSUM)
+ nhoff += 4;
+ if (gre_flags & GRE_KEY)
+ nhoff += 4;
+ if (gre_flags & GRE_SEQ)
+ nhoff += 4;
+
+ /* publish the new offset for the next parser */
+ skb->cb[0] = nhoff;
+ parse_eth_proto(skb, gre_proto);
+ break;
+ }
+ case IPPROTO_IPIP:
+ parse_eth_proto(skb, ETH_P_IP);
+ break;
+ case IPPROTO_IPV6:
+ parse_eth_proto(skb, ETH_P_IPV6);
+ break;
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ g->flow.ports = load_word(skb, nhoff);
+ /* fall through: TCP/UDP also record ip_proto and update the stats */
+ case IPPROTO_ICMP:
+ g->flow.ip_proto = ip_proto;
+ update_stats(skb, g);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * PARSE_IP entry of prog_array_init: parse the IPv4 header at skb->cb[0].
+ *
+ * Fragments are ignored. Source and destination addresses are recorded in
+ * the per-CPU flow unless the protocol is GRE, skb->cb[0] is advanced past
+ * the header, and parsing continues in parse_ip_proto(). Always returns 0.
+ */
+SEC("socket")
+int bpf_func_ip(struct __sk_buff *skb)
+{
+ struct globals *g = this_cpu_globals();
+ __u32 nhoff, verlen, ip_proto;
+
+ if (!g)
+ return 0;
+
+ nhoff = skb->cb[0];
+
+ if (unlikely(ip_is_fragment(skb, nhoff)))
+ return 0;
+
+ ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
+
+ if (ip_proto != IPPROTO_GRE) {
+ g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
+ g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
+ }
+
+ /* low nibble of the first header byte times 4 gives the header length */
+ verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
+ nhoff += (verlen & 0xF) << 2;
+
+ skb->cb[0] = nhoff;
+ parse_ip_proto(skb, g, ip_proto);
+ return 0;
+}
+
+/*
+ * PARSE_IPV6 entry of prog_array_init: parse the IPv6 header at
+ * skb->cb[0].
+ *
+ * Records 32-bit hashes of the source and destination addresses in the
+ * per-CPU flow, advances skb->cb[0] past the fixed-size header and
+ * continues in parse_ip_proto() with the nexthdr value. Always returns 0.
+ */
+SEC("socket")
+int bpf_func_ipv6(struct __sk_buff *skb)
+{
+ struct globals *g = this_cpu_globals();
+ __u32 nhoff, ip_proto;
+
+ if (!g)
+ return 0;
+
+ nhoff = skb->cb[0];
+
+ ip_proto = load_byte(skb,
+ nhoff + offsetof(struct ipv6hdr, nexthdr));
+ g->flow.src = ipv6_addr_hash(skb,
+ nhoff + offsetof(struct ipv6hdr, saddr));
+ g->flow.dst = ipv6_addr_hash(skb,
+ nhoff + offsetof(struct ipv6hdr, daddr));
+ nhoff += sizeof(struct ipv6hdr);
+
+ skb->cb[0] = nhoff;
+ parse_ip_proto(skb, g, ip_proto);
+ return 0;
+}
+
+/*
+ * PARSE_VLAN entry of prog_array_init: read the encapsulated protocol of
+ * the VLAN tag at skb->cb[0], advance skb->cb[0] past the tag and dispatch
+ * on that protocol. Always returns 0.
+ */
+SEC("socket")
+int bpf_func_vlan(struct __sk_buff *skb)
+{
+ __u32 nhoff, proto;
+
+ nhoff = skb->cb[0];
+
+ proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
+ h_vlan_encapsulated_proto));
+ nhoff += sizeof(struct vlan_hdr);
+ skb->cb[0] = nhoff;
+
+ parse_eth_proto(skb, proto);
+
+ return 0;
+}
+
+/*
+ * PARSE_MPLS entry of prog_array_init: consume one MPLS label stack entry
+ * at skb->cb[0] and dispatch on what follows.
+ *
+ * When the entry has MPLS_LS_S_MASK set, the next byte's high nibble
+ * selects the parser: 4 means IPv4, anything else is treated as IPv6.
+ * Otherwise another MPLS entry is parsed. Always returns 0.
+ */
+SEC("socket")
+int bpf_func_mpls(struct __sk_buff *skb)
+{
+	__u32 nhoff, label;
+
+	nhoff = skb->cb[0];
+
+	label = load_word(skb, nhoff);
+	nhoff += sizeof(struct mpls_label);
+	skb->cb[0] = nhoff;
+
+	if (label & MPLS_LS_S_MASK) {
+		__u8 verlen = load_byte(skb, nhoff);
+
+		/*
+		 * The mask keeps the high nibble in place, so an IPv4 header
+		 * (version 4) shows up as 0x40, never as plain 4.
+		 */
+		if ((verlen & 0xF0) == 0x40)
+			parse_eth_proto(skb, ETH_P_IP);
+		else
+			parse_eth_proto(skb, ETH_P_IPV6);
+	} else {
+		parse_eth_proto(skb, ETH_P_MPLS_UC);
+	}
+
+	return 0;
+}
+
+/*
+ * Program array used by parse_eth_proto() for tail calls. Its slots are
+ * statically initialised: each PARSE_* index holds the matching bpf_func_*
+ * program.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(max_entries, 8);
+ __array(values, u32 (void *));
+} prog_array_init SEC(".maps") = {
+ .values = {
+ [PARSE_VLAN] = (void *)&bpf_func_vlan,
+ [PARSE_IP] = (void *)&bpf_func_ip,
+ [PARSE_IPV6] = (void *)&bpf_func_ipv6,
+ [PARSE_MPLS] = (void *)&bpf_func_mpls,
+ },
+};
+
+/* Protocol dispatch routine. It tail-calls next BPF program depending
+ * on eth proto. Note, we could have used ...
+ *
+ * bpf_tail_call(skb, &prog_array_init, proto);
+ *
+ * ... but it would need large prog_array and cannot be optimised given
+ * the map key is not static.
+ *
+ * Protocols not listed in the switch below are ignored.
+ */
+static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto)
+{
+ switch (proto) {
+ case ETH_P_8021Q:
+ case ETH_P_8021AD:
+ bpf_tail_call(skb, &prog_array_init, PARSE_VLAN);
+ break;
+ case ETH_P_MPLS_UC:
+ case ETH_P_MPLS_MC:
+ bpf_tail_call(skb, &prog_array_init, PARSE_MPLS);
+ break;
+ case ETH_P_IP:
+ bpf_tail_call(skb, &prog_array_init, PARSE_IP);
+ break;
+ case ETH_P_IPV6:
+ bpf_tail_call(skb, &prog_array_init, PARSE_IPV6);
+ break;
+ }
+}
+
+/*
+ * Entry program (the one sockex3_user.c looks up by the name
+ * "main_prog"): store the offset just past the Ethernet header in
+ * skb->cb[0] and dispatch on the EtherType. Always returns 0.
+ */
+SEC("socket")
+int main_prog(struct __sk_buff *skb)
+{
+ __u32 nhoff = ETH_HLEN;
+ /* EtherType at byte offset 12 of the frame */
+ __u32 proto = load_half(skb, 12);
+
+ skb->cb[0] = nhoff;
+ parse_eth_proto(skb, proto);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
new file mode 100644
index 000000000000..56044acbd25d
--- /dev/null
+++ b/samples/bpf/sockex3_user.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <assert.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "sock_example.h"
+#include <unistd.h>
+#include <arpa/inet.h>
+
+/* Key type used by main() when walking "hash_map": source and destination
+ * address, the two ports (readable as one 32-bit word or as two 16-bit
+ * halves), and the IP protocol number.
+ */
+struct flow_key_record {
+ __be32 src;
+ __be32 dst;
+ union {
+ __be32 ports;
+ __be16 port16[2];
+ };
+ __u32 ip_proto;
+};
+
+/* Value type read from "hash_map" by main(): packet and byte counters. */
+struct pair {
+ __u64 packets;
+ __u64 bytes;
+};
+
+/* Load "<argv[0]>_kern.o", attach its "main_prog" program to a raw socket
+ * on "lo", start a traffic generator (ping when any argument is given,
+ * netperf otherwise) and dump the contents of "hash_map" once a second
+ * for five seconds.
+ *
+ * Always returns 0, including on failure paths.
+ */
+int main(int argc, char **argv)
+{
+	int i, sock = -1, main_prog_fd = -1, hash_map_fd;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	char filename[256];
+	FILE *f = NULL;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
+	if (hash_map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	/* find BPF main program; -1 marks "not found" because 0 is a
+	 * legitimate file descriptor value
+	 */
+	bpf_object__for_each_program(prog, obj) {
+		if (!strcmp(bpf_program__name(prog), "main_prog"))
+			main_prog_fd = bpf_program__fd(prog);
+	}
+
+	if (main_prog_fd < 0) {
+		fprintf(stderr, "ERROR: can't find main_prog\n");
+		goto cleanup;
+	}
+
+	sock = open_raw_sock("lo");
+	if (sock < 0) {
+		fprintf(stderr, "ERROR: opening raw socket on lo failed\n");
+		goto cleanup;
+	}
+
+	/* attach BPF program to socket; done outside assert() so the call
+	 * is not compiled away when NDEBUG is defined
+	 */
+	if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd,
+		       sizeof(main_prog_fd)) != 0) {
+		perror("ERROR: setsockopt(SO_ATTACH_BPF)");
+		goto cleanup;
+	}
+
+	if (argc > 1)
+		f = popen("ping -4 -c5 localhost", "r");
+	else
+		f = popen("netperf -l 4 localhost", "r");
+	if (!f)
+		fprintf(stderr, "WARNING: could not start traffic generator\n");
+
+	for (i = 0; i < 5; i++) {
+		struct flow_key_record key = {}, next_key;
+		struct pair value;
+
+		sleep(1);
+		printf("IP src.port -> dst.port bytes packets\n");
+		while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) {
+			key = next_key;
+			/* skip entries that cannot be read instead of
+			 * printing an uninitialised value
+			 */
+			if (bpf_map_lookup_elem(hash_map_fd, &next_key, &value))
+				continue;
+			printf("%s.%05d -> %s.%05d %12lld %12lld\n",
+			       inet_ntoa((struct in_addr){htonl(next_key.src)}),
+			       next_key.port16[0],
+			       inet_ntoa((struct in_addr){htonl(next_key.dst)}),
+			       next_key.port16[1],
+			       value.bytes, value.packets);
+		}
+	}
+
+cleanup:
+	/* pclose() also waits for the traffic generator to exit */
+	if (f)
+		pclose(f);
+	if (sock >= 0)
+		close(sock);
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/spintest.bpf.c b/samples/bpf/spintest.bpf.c
new file mode 100644
index 000000000000..cba5a9d50783
--- /dev/null
+++ b/samples/bpf/spintest.bpf.c
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016, Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127
+#endif
+
+/* Hash map (long -> long, up to 1024 entries) that PROG() handlers look up
+ * and update, keyed by the probed instruction pointer.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, long);
+ __uint(max_entries, 1024);
+} my_map SEC(".maps");
+/* Per-CPU hash map with long-sized keys and values (up to 1024 entries);
+ * PROG() handlers insert into it and immediately delete the same key.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(key_size, sizeof(long));
+ __uint(value_size, sizeof(long));
+ __uint(max_entries, 1024);
+} my_map2 SEC(".maps");
+
+/* Stack-trace map passed to bpf_get_stackid() by PROG() handlers; each
+ * value holds PERF_MAX_STACK_DEPTH u64 slots, with up to 10000 entries.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
+ __uint(max_entries, 10000);
+} stackmap SEC(".maps");
+
+/* PROG(foo) defines a probe handler named foo. The handler uses the probed
+ * instruction pointer as both key and value: it looks the key up in my_map,
+ * stores it in my_map and my_map2, deletes it from my_map2 again, and
+ * records a stack id in stackmap. The lookup result (val) is never read;
+ * the calls exist to exercise the map helpers. Always returns 0.
+ */
+#define PROG(foo) \
+int foo(struct pt_regs *ctx) \
+{ \
+ long v = PT_REGS_IP(ctx), *val; \
+\
+ val = bpf_map_lookup_elem(&my_map, &v); \
+ bpf_map_update_elem(&my_map, &v, &v, BPF_ANY); \
+ bpf_map_update_elem(&my_map2, &v, &v, BPF_ANY); \
+ bpf_map_delete_elem(&my_map2, &v); \
+ bpf_get_stackid(ctx, &stackmap, BPF_F_REUSE_STACKID); \
+ return 0; \
+}
+
+/* add kprobes to all possible *spin* functions */
+SEC("kprobe.multi/spin_*lock*")PROG(spin_lock)
+SEC("kprobe.multi/*_spin_on_owner")PROG(spin_on_owner)
+SEC("kprobe.multi/_raw_spin_*lock*")PROG(raw_spin_lock)
+
+/* and to inner bpf helpers */
+SEC("kprobe/htab_map_update_elem")PROG(p15)
+SEC("kprobe/__htab_percpu_map_update_elem")PROG(p16)
+SEC("kprobe/htab_map_alloc")PROG(p17)
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
new file mode 100644
index 000000000000..55971edb1088
--- /dev/null
+++ b/samples/bpf/spintest_user.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <assert.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "trace_helpers.h"
+
+/* Load "<argv[0]>.bpf.o", attach every program in it, then five times in a
+ * row: print the symbol name for each address found in "my_map", empty the
+ * map, and sleep for a second.
+ *
+ * Returns 2 when /proc/kallsyms cannot be processed, 0 otherwise
+ * (including on later failures).
+ */
+int main(int ac, char **argv)
+{
+	struct bpf_object *obj = NULL;
+	struct bpf_link *links[20];
+	const int max_links = sizeof(links) / sizeof(links[0]);
+	long key, next_key, value;
+	struct bpf_program *prog;
+	int map_fd, i, j = 0;
+	char filename[256];
+	struct ksym *sym;
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return 2;
+	}
+
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		obj = NULL;
+		goto cleanup;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+	if (map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		/* never write past the fixed-size links[] array */
+		if (j >= max_links) {
+			fprintf(stderr, "ERROR: more than %d programs in %s\n",
+				max_links, filename);
+			goto cleanup;
+		}
+		links[j] = bpf_program__attach(prog);
+		if (libbpf_get_error(links[j])) {
+			fprintf(stderr, "bpf_program__attach failed\n");
+			links[j] = NULL;
+			goto cleanup;
+		}
+		j++;
+	}
+
+	for (i = 0; i < 5; i++) {
+		key = 0;
+		printf("kprobing funcs:");
+		while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
+			key = next_key;
+			/* skip unreadable entries rather than asserting on
+			 * an uninitialised value
+			 */
+			if (bpf_map_lookup_elem(map_fd, &next_key, &value))
+				continue;
+			assert(next_key == value);
+			sym = ksym_search(value);
+			if (!sym) {
+				printf("ksym not found. Is kallsyms loaded?\n");
+				continue;
+			}
+
+			printf(" %s", sym->name);
+		}
+		if (key)
+			printf("\n");
+		key = 0;
+		while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0)
+			bpf_map_delete_elem(map_fd, &next_key);
+		sleep(1);
+	}
+
+cleanup:
+	/* links[0..j-1] are the successfully attached programs */
+	for (j--; j >= 0; j--)
+		bpf_link__destroy(links[j]);
+
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c
new file mode 100644
index 000000000000..a6e600f3d477
--- /dev/null
+++ b/samples/bpf/syscall_nrs.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <uapi/linux/unistd.h>
+#include <linux/kbuild.h>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmissing-prototypes"
+
+/* SYSNR(__NR_foo) expands to DEFINE(SYS__NR_foo, __NR_foo). */
+#define SYSNR(_NR) DEFINE(SYS ## _NR, _NR)
+
+/* Emits one SYSNR() entry per syscall number of interest: write and read
+ * always, mmap2 and mmap only where the architecture defines them.
+ * NOTE(review): COMMENT()/DEFINE() come from <linux/kbuild.h>; presumably
+ * this function is only compiled to generate a header, never called - confirm.
+ */
+void syscall_defines(void)
+{
+ COMMENT("Linux system call numbers.");
+ SYSNR(__NR_write);
+ SYSNR(__NR_read);
+#ifdef __NR_mmap2
+ SYSNR(__NR_mmap2);
+#endif
+#ifdef __NR_mmap
+ SYSNR(__NR_mmap);
+#endif
+
+}
+
+#pragma GCC diagnostic pop
diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c
new file mode 100644
index 000000000000..58fef969a60e
--- /dev/null
+++ b/samples/bpf/syscall_tp_kern.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+#include <uapi/linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/* Context type of the sys_enter_open handler; compiled out on aarch64
+ * together with that handler.
+ */
+#if !defined(__aarch64__)
+struct syscalls_enter_open_args {
+ unsigned long long unused;
+ long syscall_nr;
+ long filename_ptr;
+ long flags;
+ long mode;
+};
+#endif
+
+/* Context type shared by the sys_exit_open, sys_exit_openat and
+ * sys_exit_openat2 handlers.
+ */
+struct syscalls_exit_open_args {
+ unsigned long long unused;
+ long syscall_nr;
+ long ret;
+};
+
+/* Context type shared by the sys_enter_openat and sys_enter_openat2
+ * handlers.
+ */
+struct syscalls_enter_open_at_args {
+ unsigned long long unused;
+ long syscall_nr;
+ long long dfd;
+ long filename_ptr;
+ long flags;
+ long mode;
+};
+
+/* Single-slot array map counting hits of the sys_enter_open* handlers. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u32);
+ __uint(max_entries, 1);
+} enter_open_map SEC(".maps");
+
+/* Single-slot array map counting hits of the sys_exit_open* handlers. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, u32);
+ __uint(max_entries, 1);
+} exit_open_map SEC(".maps");
+
+/* Bump the counter stored under key 0 of @map, creating it with the value 1
+ * when the lookup finds nothing.
+ */
+static __always_inline void count(void *map)
+{
+	u32 slot = 0, first = 1;
+	u32 *counter = bpf_map_lookup_elem(map, &slot);
+
+	if (!counter) {
+		bpf_map_update_elem(map, &slot, &first, BPF_NOEXIST);
+		return;
+	}
+
+	*counter += 1;
+}
+
+/* sys_enter_open handler: counts the event in enter_open_map.
+ * Not built on aarch64.
+ */
+#if !defined(__aarch64__)
+SEC("tracepoint/syscalls/sys_enter_open")
+int trace_enter_open(struct syscalls_enter_open_args *ctx)
+{
+ count(&enter_open_map);
+ return 0;
+}
+#endif
+
+/* sys_enter_openat handler: counts the event in enter_open_map. */
+SEC("tracepoint/syscalls/sys_enter_openat")
+int trace_enter_open_at(struct syscalls_enter_open_at_args *ctx)
+{
+ count(&enter_open_map);
+ return 0;
+}
+
+/* sys_enter_openat2 handler: counts the event in enter_open_map. */
+SEC("tracepoint/syscalls/sys_enter_openat2")
+int trace_enter_open_at2(struct syscalls_enter_open_at_args *ctx)
+{
+ count(&enter_open_map);
+ return 0;
+}
+
+/* sys_exit_open handler: counts the event in exit_open_map.
+ * Not built on aarch64.
+ */
+#if !defined(__aarch64__)
+SEC("tracepoint/syscalls/sys_exit_open")
+int trace_enter_exit(struct syscalls_exit_open_args *ctx)
+{
+ count(&exit_open_map);
+ return 0;
+}
+#endif
+
+/* sys_exit_openat handler: counts the event in exit_open_map. */
+SEC("tracepoint/syscalls/sys_exit_openat")
+int trace_enter_exit_at(struct syscalls_exit_open_args *ctx)
+{
+ count(&exit_open_map);
+ return 0;
+}
+
+/* sys_exit_openat2 handler: counts the event in exit_open_map. */
+SEC("tracepoint/syscalls/sys_exit_openat2")
+int trace_enter_exit_at2(struct syscalls_exit_open_args *ctx)
+{
+ count(&exit_open_map);
+ return 0;
+}
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c
new file mode 100644
index 000000000000..7a09ac74fac0
--- /dev/null
+++ b/samples/bpf/syscall_tp_user.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/perf_event.h>
+#include <errno.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*.
+ * This requires kernel CONFIG_FTRACE_SYSCALLS to be set.
+ */
+
+/* Print the command-line synopsis for @cmd on stdout. */
+static void usage(const char *cmd)
+{
+	printf("USAGE: %s [-i nr_tests] [-h]\n", cmd);
+	fputs(" -i nr_tests # rounds of test to run\n"
+	      " -h # help\n", stdout);
+}
+
+/* Check that slot 0 of the map behind @map_id holds a non-zero counter,
+ * print it, and reset it to 0. Problems are reported on stderr; nothing is
+ * returned to the caller.
+ */
+static void verify_map(int map_id)
+{
+	__u32 key = 0;
+	__u32 val;
+
+	if (bpf_map_lookup_elem(map_id, &key, &val) != 0) {
+		fprintf(stderr, "map_lookup failed: %s\n", strerror(errno));
+		return;
+	}
+	if (val == 0) {
+		fprintf(stderr, "failed: map #%d returns value 0\n", map_id);
+		return;
+	}
+
+	/* val is unsigned, so print it with %u */
+	printf("verify map:%d val: %u\n", map_id, val);
+
+	val = 0;
+	if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) {
+		fprintf(stderr, "map_update failed: %s\n", strerror(errno));
+		return;
+	}
+}
+
+/* Open, load and attach @nr_tests independent copies of the BPF object in
+ * @filename, trigger an open() syscall, then check via verify_map() that
+ * the enter/exit counters of every copy were hit.
+ *
+ * Returns 0 when the whole sequence ran, 1 on any setup failure. Every
+ * link and object created here is released before returning.
+ */
+static int test(char *filename, int nr_tests)
+{
+	/* The arrays below are VLAs sized by nr_tests, so a non-positive
+	 * count must be rejected before they are declared.
+	 */
+	if (nr_tests <= 0) {
+		fprintf(stderr, "nr_tests must be a positive number\n");
+		return 1;
+	}
+
+	int map0_fds[nr_tests], map1_fds[nr_tests], fd, i, j = 0;
+	int nr_objs = 0, ret = 1;
+	struct bpf_link **links = NULL;
+	struct bpf_object *objs[nr_tests];
+	struct bpf_program *prog;
+
+	for (i = 0; i < nr_tests; i++) {
+		objs[i] = bpf_object__open_file(filename, NULL);
+		if (libbpf_get_error(objs[i])) {
+			fprintf(stderr, "opening BPF object file failed\n");
+			goto cleanup;
+		}
+		/* objs[0..nr_objs-1] must be closed on every exit path,
+		 * including failures later in this same iteration
+		 */
+		nr_objs++;
+
+		/* One-time initialization */
+		if (!links) {
+			int nr_progs = 0;
+
+			bpf_object__for_each_program(prog, objs[i])
+				nr_progs += 1;
+
+			links = calloc(nr_progs * nr_tests, sizeof(struct bpf_link *));
+
+			if (!links)
+				goto cleanup;
+		}
+
+		/* load BPF program */
+		if (bpf_object__load(objs[i])) {
+			fprintf(stderr, "loading BPF object file failed\n");
+			goto cleanup;
+		}
+
+		map0_fds[i] = bpf_object__find_map_fd_by_name(objs[i],
+							      "enter_open_map");
+		map1_fds[i] = bpf_object__find_map_fd_by_name(objs[i],
+							      "exit_open_map");
+		if (map0_fds[i] < 0 || map1_fds[i] < 0) {
+			fprintf(stderr, "finding a map in obj file failed\n");
+			goto cleanup;
+		}
+
+		bpf_object__for_each_program(prog, objs[i]) {
+			links[j] = bpf_program__attach(prog);
+			if (libbpf_get_error(links[j])) {
+				fprintf(stderr, "bpf_program__attach failed\n");
+				links[j] = NULL;
+				goto cleanup;
+			}
+			j++;
+		}
+		printf("prog #%d: map ids %d %d\n", i, map0_fds[i], map1_fds[i]);
+	}
+
+	/* current load_bpf_file has perf_event_open default pid = -1
+	 * and cpu = 0, which permits attached bpf execution on
+	 * all cpus for all pid's. bpf program execution ignores
+	 * cpu affinity.
+	 */
+	/* trigger some "open" operations */
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "open failed: %s\n", strerror(errno));
+		goto cleanup;
+	}
+	close(fd);
+
+	/* verify the map */
+	for (i = 0; i < nr_tests; i++) {
+		verify_map(map0_fds[i]);
+		verify_map(map1_fds[i]);
+	}
+	ret = 0;
+
+cleanup:
+	if (links) {
+		for (j--; j >= 0; j--)
+			bpf_link__destroy(links[j]);
+
+		free(links);
+	}
+
+	for (i = nr_objs - 1; i >= 0; i--)
+		bpf_object__close(objs[i]);
+	return ret;
+}
+
+/* Parse "-i nr_tests" / "-h", derive the object file name from argv[0] and
+ * hand both to test(). Any option other than -i prints the usage text and
+ * exits with 0.
+ */
+int main(int argc, char **argv)
+{
+	char filename[256];
+	int rounds = 1;
+	int c;
+
+	for (c = getopt(argc, argv, "i:h"); c != -1;
+	     c = getopt(argc, argv, "i:h")) {
+		if (c != 'i') {
+			usage(argv[0]);
+			return 0;
+		}
+		rounds = atoi(optarg);
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	return test(filename, rounds);
+}
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000000000000..186ac0a79c0a
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/* Empty kprobe program on blk_mq_start_request; it only needs to exist so
+ * userspace can attach to it. Always returns 0.
+ */
+SEC("kprobe/blk_mq_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ return 0;
+}
+
+/* Empty kretprobe program on __blk_account_io_done. Always returns 0. */
+SEC("kretprobe/__blk_account_io_done")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000000000000..1e61f2180470
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,423 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <linux/perf_event.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+static struct bpf_program *progs[2];
+static struct bpf_link *links[2];
+
+/* If @condition is true: print "FAIL: <function>:", print the current errno
+ * text via perror(), and make the ENCLOSING function return -1. Because of
+ * that hidden return, nothing acquired before the check is released.
+ */
+#define CHECK_PERROR_RET(condition) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ printf("FAIL: %s:\n", __func__); \
+ perror(" "); \
+ return -1; \
+ } \
+})
+
+/* If @condition is true, make the ENCLOSING function return -1 without
+ * printing anything. Like CHECK_PERROR_RET, this skips any cleanup code.
+ */
+#define CHECK_AND_RET(condition) ({ \
+ int __ret = !!(condition); \
+ if (__ret) \
+ return -1; \
+})
+
+/* Widen a pointer to the 64-bit integer form used in perf_event_attr. */
+static __u64 ptr_to_u64(void *ptr)
+{
+	unsigned long bits = (unsigned long)ptr;
+
+	return (__u64)bits;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+/* Read the PMU type number for @event_type (e.g. "kprobe") from sysfs.
+ *
+ * Returns the parsed number, or -1 after printing a diagnostic when the
+ * file cannot be opened, read or parsed.
+ */
+static int bpf_find_probe_type(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+	/* read() does not terminate the data; without this strtol() would
+	 * run into the leftover bytes of the path built above
+	 */
+	buf[ret] = '\0';
+
+	errno = 0;
+	ret = (int)strtol(buf, NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+/* Read the "retprobe" format file for @event_type from sysfs and return the
+ * number that follows its "config:" prefix.
+ *
+ * Returns the parsed number, or -1 after printing a diagnostic when the
+ * file cannot be opened, read or parsed.
+ */
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+	/* terminate what was actually read so strlen()/strtol() below do
+	 * not see the leftover bytes of the path built above
+	 */
+	buf[ret] = '\0';
+	CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+	errno = 0;
+	ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+/* Query the fd behind links[@link_idx] with bpf_task_fd_query() and check
+ * that it reports @fn_name, @expected_fd_type and a zero offset/address.
+ *
+ * Returns 0 on a match, -1 (after printing the mismatch) otherwise.
+ */
+static int test_debug_fs_kprobe(int link_idx, const char *fn_name,
+				__u32 expected_fd_type)
+{
+	__u64 off, addr;
+	__u32 name_len, id, type;
+	char name[256];
+	int fd;
+
+	fd = bpf_link__fd(links[link_idx]);
+	name_len = sizeof(name);
+	if (bpf_task_fd_query(getpid(), fd, 0, name, &name_len,
+			      &id, &type, &off, &addr) < 0) {
+		printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
+		       __func__, link_idx, fn_name);
+		perror(" :");
+		return -1;
+	}
+
+	if (!strcmp(name, fn_name) && type == expected_fd_type &&
+	    off == 0x0 && addr == 0x0)
+		return 0;
+
+	printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", link_idx);
+	printf("buf: %s, fd_type: %u, probe_offset: 0x%llx, probe_addr: 0x%llx\n",
+	       name, type, off, addr);
+	return -1;
+}
+
+/* Create a perf event of PMU type @event_type ("kprobe"/"uprobe") on either
+ * @name + @offset or, when @name is NULL, on @addr; attach progs[0] to it
+ * and fill the output parameters from bpf_task_fd_query().
+ *
+ * The temporary link is destroyed on every path after it was created.
+ * Returns 0 on success, -1 on failure.
+ */
+static int test_nondebug_fs_kuprobe_common(const char *event_type,
+	const char *name, __u64 offset, __u64 addr, bool is_return,
+	char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
+	__u64 *probe_offset, __u64 *probe_addr)
+{
+	int is_return_bit = bpf_get_retprobe_bit(event_type);
+	int type = bpf_find_probe_type(event_type);
+	struct perf_event_attr attr = {};
+	struct bpf_link *link = NULL;
+	int fd, err = -1;
+
+	if (type < 0 || is_return_bit < 0) {
+		printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
+			__func__, type, is_return_bit);
+		return err;
+	}
+
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	/* attr.config is 64 bits wide, so shift a 64-bit one */
+	if (is_return)
+		attr.config |= 1ULL << is_return_bit;
+
+	if (name) {
+		attr.config1 = ptr_to_u64((void *)name);
+		attr.config2 = offset;
+	} else {
+		attr.config1 = 0;
+		attr.config2 = addr;
+	}
+	attr.size = sizeof(attr);
+	attr.type = type;
+
+	fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+	link = bpf_program__attach_perf_event(progs[0], fd);
+	if (libbpf_get_error(link)) {
+		printf("ERROR: bpf_program__attach_perf_event failed\n");
+		link = NULL;
+		close(fd);
+		goto cleanup;
+	}
+
+	/* Same diagnostics as CHECK_PERROR_RET, but leave through cleanup
+	 * so the link created above is not leaked.
+	 */
+	if (bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
+			      prog_id, fd_type, probe_offset, probe_addr) < 0) {
+		printf("FAIL: %s:\n", __func__);
+		perror(" ");
+		goto cleanup;
+	}
+	err = 0;
+
+cleanup:
+	bpf_link__destroy(link);
+	return err;
+}
+
+/* Run test_nondebug_fs_kuprobe_common() and validate what it reported:
+ * the fd type must equal @expected_ret_fd_type for return probes and
+ * @expected_fd_type otherwise; for a named probe the name and offset must
+ * round-trip, for an address probe the returned length must be 0 and the
+ * address must round-trip.
+ *
+ * Returns 0 when everything matches, -1 otherwise.
+ */
+static int test_nondebug_fs_probe(const char *event_type, const char *name,
+				  __u64 offset, __u64 addr, bool is_return,
+				  __u32 expected_fd_type,
+				  __u32 expected_ret_fd_type,
+				  char *buf, __u32 buf_len)
+{
+	__u64 got_offset, got_addr;
+	__u32 id, got_type, want_type;
+
+	if (test_nondebug_fs_kuprobe_common(event_type, name, offset, addr,
+					    is_return, buf, &buf_len, &id,
+					    &got_type, &got_offset,
+					    &got_addr) < 0) {
+		printf("FAIL: %s, for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
+		       __func__, name ? name : "", offset, addr, is_return);
+		perror(" :");
+		return -1;
+	}
+
+	want_type = is_return ? expected_ret_fd_type : expected_fd_type;
+	if (got_type != want_type) {
+		printf("FAIL: %s, incorrect fd_type %u\n",
+		       __func__, got_type);
+		return -1;
+	}
+
+	if (!name) {
+		/* address-based probe */
+		if (buf_len != 0) {
+			printf("FAIL: %s, incorrect buf %p\n",
+			       __func__, buf);
+			return -1;
+		}
+
+		if (got_addr != addr) {
+			printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
+			       __func__, got_addr);
+			return -1;
+		}
+		return 0;
+	}
+
+	/* name-based probe */
+	if (strcmp(name, buf) != 0) {
+		printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+		return -1;
+	}
+	if (got_offset != offset) {
+		printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
+		       __func__, got_offset);
+		return -1;
+	}
+	return 0;
+}
+
+static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
+{
+ char buf[256], event_alias[sizeof("test_1234567890")];
+ const char *event_type = "uprobe";
+ struct perf_event_attr attr = {};
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ int err = -1, res, kfd, efd;
+ struct bpf_link *link;
+ ssize_t bytes;
+
+ snprintf(buf, sizeof(buf), "/sys/kernel/tracing/%s_events",
+ event_type);
+ kfd = open(buf, O_WRONLY | O_TRUNC, 0);
+ CHECK_PERROR_RET(kfd < 0);
+
+ res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
+ CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias));
+
+ res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
+ is_return ? 'r' : 'p', event_type, event_alias,
+ binary_path, offset);
+ CHECK_PERROR_RET(res < 0 || res >= sizeof(buf));
+ CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0);
+
+ close(kfd);
+ kfd = -1;
+
+ snprintf(buf, sizeof(buf), "/sys/kernel/tracing/events/%ss/%s/id",
+ event_type, event_alias);
+ efd = open(buf, O_RDONLY, 0);
+ CHECK_PERROR_RET(efd < 0);
+
+ bytes = read(efd, buf, sizeof(buf));
+ CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf));
+ close(efd);
+ buf[bytes] = '\0';
+
+ attr.config = strtol(buf, NULL, 0);
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+
+ kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+ link = bpf_program__attach_perf_event(progs[0], kfd);
+ if (libbpf_get_error(link)) {
+ printf("ERROR: bpf_program__attach_perf_event failed\n");
+ link = NULL;
+ close(kfd);
+ goto cleanup;
+ }
+
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len,
+ &prog_id, &fd_type, &probe_offset,
+ &probe_addr);
+ if (err < 0) {
+ printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
+ perror(" :");
+ return -1;
+ }
+ if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) ||
+ (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) {
+ printf("FAIL: %s, incorrect fd_type %u\n", __func__,
+ fd_type);
+ return -1;
+ }
+ if (strcmp(binary_path, buf) != 0) {
+ printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+ return -1;
+ }
+ if (probe_offset != offset) {
+ printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
+ probe_offset);
+ return -1;
+ }
+ err = 0;
+
+cleanup:
+ bpf_link__destroy(link);
+ return err;
+}
+
+/* Load "<argv[0]>_kern.o", attach its two programs, and run the
+ * bpf_task_fd_query() checks: tracefs-style kprobes, perf-PMU kprobes by
+ * name and by address, perf-PMU uprobes, and tracefs uprobes.
+ *
+ * The first failing check stops the run. All links and the object are
+ * released through the cleanup label on every path after the object was
+ * opened. Returns 0 when every check passed, -1 otherwise.
+ */
+int main(int argc, char **argv)
+{
+	extern char __executable_start;
+	char filename[256], buf[256];
+	__u64 uprobe_file_offset;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int i = 0, err = -1;
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return err;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return err;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		/* progs[] and links[] have a fixed number of slots */
+		if (i >= (int)(sizeof(links) / sizeof(links[0]))) {
+			fprintf(stderr, "ERROR: too many programs in %s\n",
+				filename);
+			goto cleanup;
+		}
+		progs[i] = prog;
+		links[i] = bpf_program__attach(progs[i]);
+		if (libbpf_get_error(links[i])) {
+			fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+			links[i] = NULL;
+			goto cleanup;
+		}
+		i++;
+	}
+
+	/* test two functions in the corresponding *_kern.c file */
+	if (test_debug_fs_kprobe(0, "blk_mq_start_request",
+				 BPF_FD_TYPE_KPROBE))
+		goto cleanup;
+	if (test_debug_fs_kprobe(1, "__blk_account_io_done",
+				 BPF_FD_TYPE_KRETPROBE))
+		goto cleanup;
+
+	/* test nondebug fs kprobe */
+	if (test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+				   false, BPF_FD_TYPE_KPROBE,
+				   BPF_FD_TYPE_KRETPROBE,
+				   buf, sizeof(buf)))
+		goto cleanup;
+#ifdef __x86_64__
+	/* set a kprobe on "bpf_check + 0x5", which is x64 specific */
+	if (test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
+				   false, BPF_FD_TYPE_KPROBE,
+				   BPF_FD_TYPE_KRETPROBE,
+				   buf, sizeof(buf)))
+		goto cleanup;
+#endif
+	if (test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+				   true, BPF_FD_TYPE_KPROBE,
+				   BPF_FD_TYPE_KRETPROBE,
+				   buf, sizeof(buf)))
+		goto cleanup;
+	if (test_nondebug_fs_probe("kprobe", NULL, 0x0,
+				   ksym_get_addr("bpf_check"), false,
+				   BPF_FD_TYPE_KPROBE,
+				   BPF_FD_TYPE_KRETPROBE,
+				   buf, sizeof(buf)))
+		goto cleanup;
+	if (test_nondebug_fs_probe("kprobe", NULL, 0x0,
+				   ksym_get_addr("bpf_check"), false,
+				   BPF_FD_TYPE_KPROBE,
+				   BPF_FD_TYPE_KRETPROBE,
+				   NULL, 0))
+		goto cleanup;
+	if (test_nondebug_fs_probe("kprobe", NULL, 0x0,
+				   ksym_get_addr("bpf_check"), true,
+				   BPF_FD_TYPE_KPROBE,
+				   BPF_FD_TYPE_KRETPROBE,
+				   buf, sizeof(buf)))
+		goto cleanup;
+	if (test_nondebug_fs_probe("kprobe", NULL, 0x0,
+				   ksym_get_addr("bpf_check"), true,
+				   BPF_FD_TYPE_KPROBE,
+				   BPF_FD_TYPE_KRETPROBE,
+				   0, 0))
+		goto cleanup;
+
+	/* test nondebug fs uprobe */
+	/* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
+	 * and the default linker script, which defines __executable_start as
+	 * the start of the .text section. The calculation could be different
+	 * on different systems with different compilers. The right way is
+	 * to parse the ELF file. We took a shortcut here.
+	 */
+	uprobe_file_offset = (unsigned long)main - (unsigned long)&__executable_start;
+	if (test_nondebug_fs_probe("uprobe", (char *)argv[0],
+				   uprobe_file_offset, 0x0, false,
+				   BPF_FD_TYPE_UPROBE,
+				   BPF_FD_TYPE_URETPROBE,
+				   buf, sizeof(buf)))
+		goto cleanup;
+	if (test_nondebug_fs_probe("uprobe", (char *)argv[0],
+				   uprobe_file_offset, 0x0, true,
+				   BPF_FD_TYPE_UPROBE,
+				   BPF_FD_TYPE_URETPROBE,
+				   buf, sizeof(buf)))
+		goto cleanup;
+
+	/* test debug fs uprobe */
+	if (test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+				 false))
+		goto cleanup;
+	if (test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+				 true))
+		goto cleanup;
+	err = 0;
+
+cleanup:
+	for (i--; i >= 0; i--)
+		bpf_link__destroy(links[i]);
+
+	bpf_object__close(obj);
+	return err;
+}
diff --git a/samples/bpf/tc_l2_redirect.sh b/samples/bpf/tc_l2_redirect.sh
new file mode 100755
index 000000000000..a28a8fc99dbe
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+[[ -z $TC ]] && TC='tc'
+[[ -z $IP ]] && IP='ip'
+
+REDIRECT_USER='./tc_l2_redirect'
+REDIRECT_BPF='./tc_l2_redirect_kern.o'
+
+RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter)
+IPV6_DISABLED=$(< /proc/sys/net/ipv6/conf/all/disable_ipv6)
+IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding)
+
+# Build the shared test topology: namespaces ns1 and ns2, veth pairs
+# ve1<->vens1 and ve2<->vens2, addresses and default routes inside the
+# namespaces, the ipt2/ip6t2 tunnel devices and the drop_non_tun_vip filter
+# in ns2, and the l2_to_iptun_ingress_forward filter on ve2.
+# $1: tunnel type; "ipip" routes 10.1.1.0/24 via ipt2 in ns2, anything else
+# routes 10.1.1.0/24 and 2401:db01::/64 via ip6t2.
+function config_common {
+ local tun_type=$1
+
+ $IP netns add ns1
+ $IP netns add ns2
+ $IP link add ve1 type veth peer name vens1
+ $IP link add ve2 type veth peer name vens2
+ $IP link set dev ve1 up
+ $IP link set dev ve2 up
+ $IP link set dev ve1 mtu 1500
+ $IP link set dev ve2 mtu 1500
+ $IP link set dev vens1 netns ns1
+ $IP link set dev vens2 netns ns2
+
+ $IP -n ns1 link set dev lo up
+ $IP -n ns1 link set dev vens1 up
+ $IP -n ns1 addr add 10.1.1.101/24 dev vens1
+ $IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad
+ $IP -n ns1 route add default via 10.1.1.1 dev vens1
+ $IP -n ns1 route add default via 2401:db01::1 dev vens1
+
+ $IP -n ns2 link set dev lo up
+ $IP -n ns2 link set dev vens2 up
+ $IP -n ns2 addr add 10.2.1.102/24 dev vens2
+ $IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad
+ $IP -n ns2 addr add 10.10.1.102 dev lo
+ $IP -n ns2 addr add 2401:face::66/64 dev lo nodad
+ $IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1
+ $IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1
+ $IP -n ns2 link set dev ipt2 up
+ $IP -n ns2 link set dev ip6t2 up
+ $IP netns exec ns2 $TC qdisc add dev vens2 clsact
+ $IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip
+ if [[ $tun_type == "ipip" ]]; then
+ $IP -n ns2 route add 10.1.1.0/24 dev ipt2
+ $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0
+ else
+ $IP -n ns2 route add 10.1.1.0/24 dev ip6t2
+ $IP -n ns2 route add 2401:db01::/64 dev ip6t2
+ $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0
+ fi
+
+ $IP addr add 10.1.1.1/24 dev ve1
+ $IP addr add 2401:db01::1/64 dev ve1 nodad
+ $IP addr add 10.2.1.1/24 dev ve2
+ $IP addr add 2401:db02::1/64 dev ve2 nodad
+
+ $TC qdisc add dev ve2 clsact
+ $TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward
+
+ sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ sysctl -q -w net.ipv6.conf.all.forwarding=1
+ sysctl -q -w net.ipv6.conf.all.disable_ipv6=0
+}
+
+function cleanup {
+ set +e
+ [[ -z $DEBUG ]] || set +x
+ $IP netns delete ns1 >& /dev/null
+ $IP netns delete ns2 >& /dev/null
+ $IP link del ve1 >& /dev/null
+ $IP link del ve2 >& /dev/null
+ $IP link del ipt >& /dev/null
+ $IP link del ip6t >& /dev/null
+ sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER
+ sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING
+ sysctl -q -w net.ipv6.conf.all.disable_ipv6=$IPV6_DISABLED
+ rm -f /sys/fs/bpf/tc/globals/tun_iface
+ [[ -z $DEBUG ]] || set -x
+ set -e
+}
+
+# IPIP redirect test.
+# $1: direction. "egress" attaches l2_to_iptun_ingress_redirect to the egress
+# hook of ve2 and also pings from the host; any other value attaches it to
+# the ingress hook of ve1.
+# The ifindex of the "ipt" device is stored in the pinned tun_iface map via
+# $REDIRECT_USER before pinging 10.10.1.102 from ns1. Prints "OK" at the end.
+function l2_to_ipip {
+ echo -n "l2_to_ipip $1: "
+
+ local dir=$1
+
+ config_common ipip
+
+ $IP link add ipt type ipip external
+ $IP link set dev ipt up
+ sysctl -q -w net.ipv4.conf.ipt.rp_filter=0
+ sysctl -q -w net.ipv4.conf.ipt.forwarding=1
+
+ if [[ $dir == "egress" ]]; then
+ $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
+ $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
+ sysctl -q -w net.ipv4.conf.ve1.forwarding=1
+ else
+ $TC qdisc add dev ve1 clsact
+ $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
+ fi
+
+ $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex)
+
+ $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
+
+ if [[ $dir == "egress" ]]; then
+ # test direct egress to ve2 (i.e. not forwarding from
+ # ve1 to ve2).
+ ping -c1 10.10.1.102 >& /dev/null
+ fi
+
+ cleanup
+
+ echo "OK"
+}
+
+# ip6tnl redirect test; same structure as the IPIP test but using the "ip6t"
+# device and the l2_to_ip6tun_ingress_redirect section, and pinging both
+# 10.10.1.102 and 2401:face::66.
+# $1: direction. "egress" attaches the filter to the egress hook of ve2 and
+# also pings from the host; any other value attaches it to the ingress hook
+# of ve1. Prints "OK" at the end.
+function l2_to_ip6tnl {
+ echo -n "l2_to_ip6tnl $1: "
+
+ local dir=$1
+
+ config_common ip6tnl
+
+ $IP link add ip6t type ip6tnl mode any external
+ $IP link set dev ip6t up
+ sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0
+ sysctl -q -w net.ipv4.conf.ip6t.forwarding=1
+
+ if [[ $dir == "egress" ]]; then
+ $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
+ $IP route add 2401:face::/64 via 2401:db02::66 dev ve2
+ $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
+ sysctl -q -w net.ipv4.conf.ve1.forwarding=1
+ else
+ $TC qdisc add dev ve1 clsact
+ $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
+ fi
+
+ $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex)
+
+ $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
+ $IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null
+
+ if [[ $dir == "egress" ]]; then
+ # test direct egress to ve2 (i.e. not forwarding from
+ # ve1 to ve2).
+ ping -c1 10.10.1.102 >& /dev/null
+ ping -6 -c1 2401:face::66 >& /dev/null
+ fi
+
+ cleanup
+
+ echo "OK"
+}
+
+# Driver: start from a clean state, then run every selected test function in
+# every selected direction.
+# $1 (optional): space-separated test function names; default is both tests.
+# $2 (optional): space-separated directions; default is "ingress egress".
+cleanup
+test_names="l2_to_ipip l2_to_ip6tnl"
+test_dirs="ingress egress"
+if [[ $# -ge 2 ]]; then
+ test_names=$1
+ test_dirs=$2
+elif [[ $# -ge 1 ]]; then
+ test_names=$1
+fi
+
+for t in $test_names; do
+ for d in $test_dirs; do
+ $t $d
+ done
+done
diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c
new file mode 100644
index 000000000000..b19fa9b88fe0
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect_kern.c
@@ -0,0 +1,231 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/filter.h>
+#include <uapi/linux/pkt_cls.h>
+#include <net/ipv6.h>
+#include <bpf/bpf_helpers.h>
+
+#define _htonl __builtin_bswap32
+
+#define PIN_GLOBAL_NS 2
+/* Map description placed in the "maps" ELF section (see tun_iface below).
+ * NOTE(review): presumably mirrors the layout the tc loader expects for
+ * pinned maps - confirm against the loader before changing field order.
+ */
+struct bpf_elf_map {
+ __u32 type;
+ __u32 size_key;
+ __u32 size_value;
+ __u32 max_elem;
+ __u32 flags;
+ __u32 id;
+ __u32 pinning;
+};
+
+/* copy of 'struct ethhdr' without __packed */
+/* Ethernet header: destination MAC, source MAC, then the EtherType
+ * (compared against htons(ETH_P_*) by the programs in this file).
+ */
+struct eth_hdr {
+ unsigned char h_dest[ETH_ALEN];
+ unsigned char h_source[ETH_ALEN];
+ unsigned short h_proto;
+};
+
+/* One-element int -> int array holding the ifindex of the tunnel device.
+ * The programs below read slot 0; it is pinned with PIN_GLOBAL_NS so it can
+ * be updated from outside the BPF program.
+ */
+struct bpf_elf_map SEC("maps") tun_iface = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .size_key = sizeof(int),
+ .size_value = sizeof(int),
+ .pinning = PIN_GLOBAL_NS,
+ .max_elem = 1,
+};
+
+/*
+ * Tell whether a destination address belongs to the sample's VIP range.
+ *
+ * @eth_proto: EtherType in network byte order.
+ * @daddr:     for IPv4 the full destination address, for IPv6 the first
+ *             32-bit word of the destination address (network byte order).
+ *
+ * IPv4 matches 10.10.1.0/24 (0x0a0a0100 under a /24 mask); IPv6 matches when
+ * the leading word equals 0x2401face. Any other EtherType never matches.
+ */
+static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr)
+{
+	if (eth_proto == htons(ETH_P_IPV6))
+		return daddr == _htonl(0x2401face);
+
+	if (eth_proto != htons(ETH_P_IP))
+		return false;
+
+	return (daddr & _htonl(0xffffff00)) == _htonl(0x0a0a0100);
+}
+
+/*
+ * Hand already-encapsulated packets to the tunnel device's ingress path.
+ *
+ * The tunnel ifindex is read from tun_iface[0]. IPv4 packets whose protocol
+ * is IPIP, and IPv6 packets whose next header is IPIP or IPv6, are logged via
+ * bpf_trace_printk() and redirected with BPF_F_INGRESS. Truncated packets, a
+ * missing map entry, and all other traffic return TC_ACT_OK.
+ */
+SEC("l2_to_iptun_ingress_forward")
+int _l2_to_iptun_ingress_forward(struct __sk_buff *skb)
+{
+ void *data = (void *)(long)skb->data;
+ struct eth_hdr *eth = data;
+ void *data_end = (void *)(long)skb->data_end;
+ int key = 0, *ifindex;
+
+ /* The Ethernet header must lie inside the packet before it is read. */
+ if (data + sizeof(*eth) > data_end)
+ return TC_ACT_OK;
+
+ ifindex = bpf_map_lookup_elem(&tun_iface, &key);
+ if (!ifindex)
+ return TC_ACT_OK;
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ char fmt4[] = "ingress forward to ifindex:%d daddr4:%x\n";
+ struct iphdr *iph = data + sizeof(*eth);
+
+ /* Bounds-check the IPv4 header before touching its fields. */
+ if (data + sizeof(*eth) + sizeof(*iph) > data_end)
+ return TC_ACT_OK;
+
+ if (iph->protocol != IPPROTO_IPIP)
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt4, sizeof(fmt4), *ifindex,
+ _htonl(iph->daddr));
+ return bpf_redirect(*ifindex, BPF_F_INGRESS);
+ } else if (eth->h_proto == htons(ETH_P_IPV6)) {
+ char fmt6[] = "ingress forward to ifindex:%d daddr6:%x::%x\n";
+ struct ipv6hdr *ip6h = data + sizeof(*eth);
+
+ /* Bounds-check the IPv6 header before touching its fields. */
+ if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
+ return TC_ACT_OK;
+
+ if (ip6h->nexthdr != IPPROTO_IPIP &&
+ ip6h->nexthdr != IPPROTO_IPV6)
+ return TC_ACT_OK;
+
+ /* Only the first and last 32-bit words of the address are logged. */
+ bpf_trace_printk(fmt6, sizeof(fmt6), *ifindex,
+ _htonl(ip6h->daddr.s6_addr32[0]),
+ _htonl(ip6h->daddr.s6_addr32[3]));
+ return bpf_redirect(*ifindex, BPF_F_INGRESS);
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Redirect IPv4 packets addressed to the VIP range into the IPv4 tunnel.
+ *
+ * The tunnel ifindex is read from tun_iface[0]. For an IPv4 packet whose
+ * destination satisfies is_vip_addr(), a tunnel key (id 10000, ttl 64,
+ * remote 10.2.1.102) is attached and the packet is redirected to the tunnel
+ * device. Truncated packets, a missing map entry, non-IPv4 traffic and
+ * non-VIP destinations return TC_ACT_OK.
+ */
+SEC("l2_to_iptun_ingress_redirect")
+int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key tkey = {};
+	void *data = (void *)(long)skb->data;
+	struct eth_hdr *eth = data;
+	void *data_end = (void *)(long)skb->data_end;
+	int key = 0, *ifindex;
+
+	if (data + sizeof(*eth) > data_end)
+		return TC_ACT_OK;
+
+	ifindex = bpf_map_lookup_elem(&tun_iface, &key);
+	if (!ifindex)
+		return TC_ACT_OK;
+
+	if (eth->h_proto == htons(ETH_P_IP)) {
+		char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n";
+		struct iphdr *iph = data + sizeof(*eth);
+
+		/* The header must be bounds-checked before any field of it
+		 * is read; iph->daddr is therefore only accessed below.
+		 */
+		if (data + sizeof(*eth) + sizeof(*iph) > data_end)
+			return TC_ACT_OK;
+
+		if (!is_vip_addr(eth->h_proto, iph->daddr))
+			return TC_ACT_OK;
+
+		bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr),
+				 *ifindex);
+	} else {
+		return TC_ACT_OK;
+	}
+
+	tkey.tunnel_id = 10000;
+	tkey.tunnel_ttl = 64;
+	tkey.remote_ipv4 = 0x0a020166; /* 10.2.1.102 */
+	bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0);
+	return bpf_redirect(*ifindex, 0);
+}
+
+/*
+ * Redirect IPv4 and IPv6 packets addressed to the VIP range into the IPv6
+ * tunnel.
+ *
+ * The tunnel ifindex is read from tun_iface[0]. When the destination passes
+ * is_vip_addr(), a tunnel key (id 10000, ttl 64, remote 2401:db02::66) is
+ * attached with BPF_F_TUNINFO_IPV6 and the packet is redirected to the
+ * tunnel device. Everything else returns TC_ACT_OK.
+ */
+SEC("l2_to_ip6tun_ingress_redirect")
+int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key tkey = {};
+ void *data = (void *)(long)skb->data;
+ struct eth_hdr *eth = data;
+ void *data_end = (void *)(long)skb->data_end;
+ int key = 0, *ifindex;
+
+ /* The Ethernet header must lie inside the packet before it is read. */
+ if (data + sizeof(*eth) > data_end)
+ return TC_ACT_OK;
+
+ ifindex = bpf_map_lookup_elem(&tun_iface, &key);
+ if (!ifindex)
+ return TC_ACT_OK;
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n";
+ struct iphdr *iph = data + sizeof(*eth);
+
+ /* Bounds-check the IPv4 header before reading daddr. */
+ if (data + sizeof(*eth) + sizeof(*iph) > data_end)
+ return TC_ACT_OK;
+
+ if (!is_vip_addr(eth->h_proto, iph->daddr))
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr),
+ *ifindex);
+ } else if (eth->h_proto == htons(ETH_P_IPV6)) {
+ char fmt6[] = "e/ingress redirect daddr6:%x to ifindex:%d\n";
+ struct ipv6hdr *ip6h = data + sizeof(*eth);
+
+ /* Bounds-check the IPv6 header before reading daddr. */
+ if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
+ return TC_ACT_OK;
+
+ /* Only the first 32-bit word of the address is matched and logged. */
+ if (!is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0]))
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt6, sizeof(fmt6),
+ _htonl(ip6h->daddr.s6_addr32[0]), *ifindex);
+ } else {
+ return TC_ACT_OK;
+ }
+
+ tkey.tunnel_id = 10000;
+ tkey.tunnel_ttl = 64;
+ /* 2401:db02:0:0:0:0:0:66 */
+ tkey.remote_ipv6[0] = _htonl(0x2401db02);
+ tkey.remote_ipv6[1] = 0;
+ tkey.remote_ipv6[2] = 0;
+ tkey.remote_ipv6[3] = _htonl(0x00000066);
+ bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), BPF_F_TUNINFO_IPV6);
+ return bpf_redirect(*ifindex, 0);
+}
+
+/*
+ * Drop packets whose destination is in the VIP range.
+ *
+ * Returns TC_ACT_SHOT for IPv4/IPv6 packets whose destination satisfies
+ * is_vip_addr(); truncated packets and every other packet get TC_ACT_OK.
+ */
+SEC("drop_non_tun_vip")
+int _drop_non_tun_vip(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	void *data_end = (void *)(long)skb->data_end;
+	struct eth_hdr *eth = data;
+	void *l3 = data + sizeof(*eth);
+	__be32 dst;
+
+	if (l3 > data_end)
+		return TC_ACT_OK;
+
+	if (eth->h_proto == htons(ETH_P_IP)) {
+		struct iphdr *ip4 = l3;
+
+		if (l3 + sizeof(*ip4) > data_end)
+			return TC_ACT_OK;
+		dst = ip4->daddr;
+	} else if (eth->h_proto == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6 = l3;
+
+		if (l3 + sizeof(*ip6) > data_end)
+			return TC_ACT_OK;
+		/* Only the leading word of the IPv6 address is matched. */
+		dst = ip6->daddr.s6_addr32[0];
+	} else {
+		return TC_ACT_OK;
+	}
+
+	return is_vip_addr(eth->h_proto, dst) ? TC_ACT_SHOT : TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c
new file mode 100644
index 000000000000..d11a6e1e9912
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect_user.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Facebook
+ */
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include <bpf/bpf.h>
+
+/* Print the command-line synopsis to stdout. */
+static void usage(void)
+{
+	static const char *const help[] = {
+		"Usage: tc_l2_ipip_redirect [...]\n",
+		" -U <file> Update an already pinned BPF array\n",
+		" -i <ifindex> Interface index\n",
+		" -h Display this help\n",
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(help) / sizeof(help[0]); i++)
+		fputs(help[i], stdout);
+}
+
+/*
+ * Store an interface index in slot 0 of an already pinned BPF array map.
+ *
+ *   -U <file>     path of the pinned map
+ *   -i <ifindex>  non-negative decimal interface index to store
+ *   -h            print usage and exit successfully
+ *
+ * Returns 0 on success (or -h); otherwise -1, or the non-zero status of
+ * bpf_map_update_elem() when the update itself fails.
+ */
+int main(int argc, char **argv)
+{
+	const char *pinned_file = NULL;
+	int ifindex = -1;
+	int array_key = 0;
+	int array_fd = -1;
+	int ret = -1;
+	int opt;
+
+	/* Only the options that are actually handled are accepted; "-h" is
+	 * advertised by usage() and therefore recognised here.
+	 */
+	while ((opt = getopt(argc, argv, "U:i:h")) != -1) {
+		switch (opt) {
+		case 'U':
+			pinned_file = optarg;
+			break;
+		case 'i': {
+			char *end;
+			long val;
+
+			/* strtol() lets us reject non-numeric or out-of-range
+			 * input instead of silently storing ifindex 0.
+			 */
+			errno = 0;
+			val = strtol(optarg, &end, 10);
+			if (errno || end == optarg || *end != '\0' ||
+			    val < 0 || val > 0x7fffffffL) {
+				fprintf(stderr, "invalid ifindex: %s\n",
+					optarg);
+				usage();
+				goto out;
+			}
+			ifindex = (int)val;
+			break;
+		}
+		case 'h':
+			usage();
+			ret = 0;
+			goto out;
+		default:
+			usage();
+			goto out;
+		}
+	}
+
+	if (ifindex < 0 || !pinned_file) {
+		usage();
+		goto out;
+	}
+
+	array_fd = bpf_obj_get(pinned_file);
+	if (array_fd < 0) {
+		fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
+			pinned_file, strerror(errno), errno);
+		goto out;
+	}
+
+	/* Write the ifindex into element 0 of the pinned array. */
+	ret = bpf_map_update_elem(array_fd, &array_key, &ifindex, 0);
+	if (ret) {
+		perror("bpf_map_update_elem");
+		goto out;
+	}
+
+out:
+	if (array_fd != -1)
+		close(array_fd);
+	return ret;
+}
diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c
new file mode 100644
index 000000000000..e9356130f84e
--- /dev/null
+++ b/samples/bpf/tcbpf1_kern.c
@@ -0,0 +1,91 @@
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/filter.h>
+#include <uapi/linux/pkt_cls.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+
+/* compiler workaround */
+#define _htonl __builtin_bswap32
+
+/* Overwrite the first ETH_ALEN bytes of the packet (offset 0) with @mac.
+ * NOTE(review): the trailing flags argument is 1 - presumably
+ * BPF_F_RECOMPUTE_CSUM; confirm against the helper documentation.
+ */
+static inline void set_dst_mac(struct __sk_buff *skb, char *mac)
+{
+ bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1);
+}
+
+#define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check))
+#define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos))
+
+/* Replace the IPv4 TOS byte with @new_tos.
+ * The IP header checksum at IP_CSUM_OFF is patched first using the old and
+ * new values, then the new byte is stored at TOS_OFF.
+ */
+static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos)
+{
+ __u8 old_tos = load_byte(skb, TOS_OFF);
+
+ /* Size argument 2: the byte is fed to the helper as a 16-bit quantity. */
+ bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2);
+ bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0);
+}
+
+#define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, check))
+#define IP_SRC_OFF (ETH_HLEN + offsetof(struct iphdr, saddr))
+
+#define IS_PSEUDO 0x10
+
+/* Replace the IPv4 source address with @new_ip.
+ * Both the TCP checksum (flagged IS_PSEUDO, since the address is part of the
+ * pseudo header) and the IP header checksum are patched before the new
+ * address is written at IP_SRC_OFF.
+ */
+static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip)
+{
+ __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF));
+
+ bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip));
+ bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
+ bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0);
+}
+
+#define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest))
+/* Replace the TCP destination port with @new_port.
+ * The TCP checksum is patched with the old and new values, then the new
+ * port is stored at TCP_DPORT_OFF. The offsets assume an IPv4 header of
+ * exactly sizeof(struct iphdr) bytes (see TCP_DPORT_OFF / TCP_CSUM_OFF).
+ */
+static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port)
+{
+ __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF));
+
+ bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port));
+ bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0);
+}
+
+/*
+ * Classifier that rewrites TCP packets.
+ *
+ * Reads the IPv4 protocol byte (the offset assumes an Ethernet frame carrying
+ * IPv4); for TCP it sets TOS to 8, the source address to the constant
+ * 0xA010101 and the destination port to 5001 via the set_*() helpers.
+ * Always returns 0.
+ */
+SEC("classifier")
+int bpf_prog1(struct __sk_buff *skb)
+{
+	__u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
+
+	if (proto == IPPROTO_TCP) {
+		set_ip_tos(skb, 8);
+		set_tcp_ip_src(skb, 0xA010101);
+		set_tcp_dest_port(skb, 5001);
+	}
+
+	return 0;
+}
+/* Redirect the packet to the interface whose index is one above the
+ * current one, with flags 0.
+ */
+SEC("redirect_xmit")
+int _redirect_xmit(struct __sk_buff *skb)
+{
+	__u32 target = skb->ifindex + 1;
+
+	return bpf_redirect(target, 0);
+}
+/* Redirect the packet to the interface whose index is one above the
+ * current one, with flags 1 (presumably BPF_F_INGRESS - confirm).
+ */
+SEC("redirect_recv")
+int _redirect_recv(struct __sk_buff *skb)
+{
+	__u32 target = skb->ifindex + 1;
+
+	return bpf_redirect(target, 1);
+}
+/* Send a clone of the packet to the interface whose index is one above the
+ * current one (flags 0), then drop the original with TC_ACT_SHOT.
+ */
+SEC("clone_redirect_xmit")
+int _clone_redirect_xmit(struct __sk_buff *skb)
+{
+	__u32 target = skb->ifindex + 1;
+
+	bpf_clone_redirect(skb, target, 0);
+
+	return TC_ACT_SHOT;
+}
+/* Send a clone of the packet to the interface whose index is one above the
+ * current one with flags 1 (presumably BPF_F_INGRESS - confirm), then drop
+ * the original with TC_ACT_SHOT.
+ */
+SEC("clone_redirect_recv")
+int _clone_redirect_recv(struct __sk_buff *skb)
+{
+	__u32 target = skb->ifindex + 1;
+
+	bpf_clone_redirect(skb, target, 1);
+
+	return TC_ACT_SHOT;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c
new file mode 100644
index 000000000000..822b0742b815
--- /dev/null
+++ b/samples/bpf/tcp_basertt_kern.c
@@ -0,0 +1,71 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set base_rtt to 80us when host is running TCP-NV and
+ * both hosts are in the same datacenter (as determined by IPv6 prefix).
+ *
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define DEBUG 1
+
+/*
+ * sock_ops program: reply 80 (base RTT in us) for BPF_SOCK_OPS_BASE_RTT when
+ * the connection is IPv6, both ends share the first 5.5 bytes of their
+ * address and the socket's congestion control is "nv".
+ *
+ * If the TCP_CONGESTION lookup fails, its error code is the reply; in every
+ * other case the reply is -1. The program itself always returns 1.
+ */
+SEC("sockops")
+int bpf_basertt(struct bpf_sock_ops *skops)
+{
+	char cong[20];
+	char nv[] = "nv";
+	int rv = -1, n;
+	int op;
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	/* Same datacenter <=> first word equal and top 12 bits of the
+	 * second word equal (4 + 1.5 bytes).
+	 */
+	if (skops->family == AF_INET6 &&
+	    skops->local_ip6[0] == skops->remote_ip6[0] &&
+	    (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+	    (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000) &&
+	    op == BPF_SOCK_OPS_BASE_RTT) {
+		n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION,
+				   cong, sizeof(cong));
+		if (n)
+			rv = n;
+		else if (!__builtin_memcmp(cong, nv, sizeof(nv)))
+			rv = 80;	/* Set base_rtt to 80us */
+	}
+
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme
new file mode 100644
index 000000000000..78e247f62108
--- /dev/null
+++ b/samples/bpf/tcp_bpf.readme
@@ -0,0 +1,28 @@
+This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops)
+programs. These programs attach to a cgroupv2. The following commands create
+a cgroupv2 and attach a bash shell to the group.
+
+ mkdir -p /tmp/cgroupv2
+ mount -t cgroup2 none /tmp/cgroupv2
+ mkdir -p /tmp/cgroupv2/foo
+ bash
+ echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+
+Anything that runs under this shell belongs to the foo cgroupv2. To load
+(attach) one of the tcp_*_kern.o programs:
+
+ bpftool prog load tcp_basertt_kern.o /sys/fs/bpf/tcp_prog
+ bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
+ bpftool prog tracelog
+
+"bpftool prog tracelog" will continue to run printing the BPF log buffer.
+The tcp_*_kern.o programs use special print functions to print logging
+information (if enabled by the ifdef).
+
+If using netperf/netserver to create traffic, you need to run them under the
+cgroupv2 to which the BPF programs are attached (i.e. under bash shell
+attached to the cgroupv2).
+
+To remove (detach) a socket_ops BPF program from a cgroupv2:
+
+ bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c
new file mode 100644
index 000000000000..6a80d08952ad
--- /dev/null
+++ b/samples/bpf/tcp_bufs_kern.c
@@ -0,0 +1,81 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets and send
+ * and receive buffers to 1.5MB. This would usually be done after
+ * doing appropriate checks that indicate the hosts are far enough
+ * away (i.e. large RTT).
+ *
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define DEBUG 1
+
+/*
+ * sock_ops program: reply with an initial receive window of 40 packets for
+ * BPF_SOCK_OPS_RWND_INIT and set SO_SNDBUF/SO_RCVBUF to 1500000 bytes on
+ * active (TCP_CONNECT_CB) and passive (PASSIVE_ESTABLISHED_CB) connections.
+ *
+ * For the setsockopt cases the reply is the sum of both helper return
+ * values; unknown ops reply -1. The program itself always returns 1.
+ */
+SEC("sockops")
+int bpf_bufs(struct bpf_sock_ops *skops)
+{
+	int bufsize = 1500000;
+	int rwnd_init = 40;
+	int rv = 0;
+	int op;
+
+	/* For testing purposes, only execute the rest of the BPF program
+	 * if at least one of the port numbers is 55601.
+	 */
+	if (bpf_ntohl(skops->remote_port) != 55601 &&
+	    skops->local_port != 55601) {
+		skops->reply = -1;
+		return 1;
+	}
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	/* Log the requested op, like the sibling samples do; nothing has
+	 * been computed yet, so there is no return value to report here.
+	 */
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	/* Usually there would be a check to ensure the hosts are far
+	 * from each other so it makes sense to increase buffer sizes
+	 */
+	switch (op) {
+	case BPF_SOCK_OPS_RWND_INIT:
+		rv = rwnd_init;
+		break;
+	case BPF_SOCK_OPS_TCP_CONNECT_CB:
+		/* Set sndbuf and rcvbuf of active connections */
+		rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+				    sizeof(bufsize));
+		rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+				     &bufsize, sizeof(bufsize));
+		break;
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		/* Nothing to do */
+		break;
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		/* Set sndbuf and rcvbuf of passive connections */
+		rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+				    sizeof(bufsize));
+		rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+				     &bufsize, sizeof(bufsize));
+		break;
+	default:
+		rv = -1;
+	}
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c
new file mode 100644
index 000000000000..e88bd9ab0695
--- /dev/null
+++ b/samples/bpf/tcp_clamp_kern.c
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp
+ * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within
+ * the same datacenter. For this example, we assume they are within the same
+ * datacenter when the first 5.5 bytes of their IPv6 addresses are the same.
+ *
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define DEBUG 1
+
+/*
+ * sock_ops program for connections inside one datacenter (IPv6, first 5.5
+ * address bytes equal):
+ *   - TIMEOUT_INIT:            reply 10 (SYN / SYN-ACK RTO)
+ *   - TCP_CONNECT_CB:          set SO_SNDBUF/SO_RCVBUF to 150000
+ *   - ACTIVE_ESTABLISHED_CB:   clamp snd_cwnd to 100 packets
+ *   - PASSIVE_ESTABLISHED_CB:  clamp snd_cwnd and set both buffer sizes
+ * Where several helpers run, the reply is the sum of their return values.
+ * Any other op, or hosts outside the datacenter, reply -1. The program
+ * itself always returns 1.
+ */
+SEC("sockops")
+int bpf_clamp(struct bpf_sock_ops *skops)
+{
+	int bufsize = 150000;
+	int to_init = 10;
+	int clamp = 100;
+	int rv = 0;
+	int op;
+
+	/* For testing purposes, only execute the rest of the BPF program
+	 * if at least one of the port numbers is 55601.
+	 */
+	if (bpf_ntohl(skops->remote_port) != 55601 &&
+	    skops->local_port != 55601) {
+		skops->reply = -1;
+		/* Return 1 like every other exit path of this program and
+		 * like the sibling sockops samples do on this early exit.
+		 */
+		return 1;
+	}
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	/* Check that both hosts are within same datacenter. For this example
+	 * it is the case when the first 5.5 bytes of their IPv6 addresses are
+	 * the same.
+	 */
+	if (skops->family == AF_INET6 &&
+	    skops->local_ip6[0] == skops->remote_ip6[0] &&
+	    (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+	    (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+		switch (op) {
+		case BPF_SOCK_OPS_TIMEOUT_INIT:
+			rv = to_init;
+			break;
+		case BPF_SOCK_OPS_TCP_CONNECT_CB:
+			/* Set sndbuf and rcvbuf of active connections */
+			rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF,
+					    &bufsize, sizeof(bufsize));
+			rv += bpf_setsockopt(skops, SOL_SOCKET,
+					     SO_RCVBUF, &bufsize,
+					     sizeof(bufsize));
+			break;
+		case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+			rv = bpf_setsockopt(skops, SOL_TCP,
+					    TCP_BPF_SNDCWND_CLAMP,
+					    &clamp, sizeof(clamp));
+			break;
+		case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+			/* Clamp snd_cwnd, then set sndbuf and rcvbuf of
+			 * passive connections
+			 */
+			rv = bpf_setsockopt(skops, SOL_TCP,
+					    TCP_BPF_SNDCWND_CLAMP,
+					    &clamp, sizeof(clamp));
+			rv += bpf_setsockopt(skops, SOL_SOCKET,
+					     SO_SNDBUF, &bufsize,
+					     sizeof(bufsize));
+			rv += bpf_setsockopt(skops, SOL_SOCKET,
+					     SO_RCVBUF, &bufsize,
+					     sizeof(bufsize));
+			break;
+		default:
+			rv = -1;
+		}
+	} else {
+		rv = -1;
+	}
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c
new file mode 100644
index 000000000000..339415eac477
--- /dev/null
+++ b/samples/bpf/tcp_cong_kern.c
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set congestion control to dctcp when both hosts are
+ * in the same datacenter (as determined by IPv6 prefix).
+ *
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define DEBUG 1
+
+/*
+ * sock_ops program: for IPv6 connections whose endpoints share the first
+ * 5.5 address bytes, reply 1 to NEEDS_ECN and switch the congestion control
+ * to "dctcp" once the connection is established (active or passive); the
+ * reply is then the bpf_setsockopt() result. Everything else replies -1.
+ * The program itself always returns 1.
+ */
+SEC("sockops")
+int bpf_cong(struct bpf_sock_ops *skops)
+{
+	char cong[] = "dctcp";
+	int rv = -1;
+	int op;
+
+	/* Testing filter: bail out unless one of the ports is 55601. */
+	if (bpf_ntohl(skops->remote_port) != 55601 &&
+	    skops->local_port != 55601) {
+		skops->reply = -1;
+		return 1;
+	}
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	/* Same datacenter <=> first word equal and top 12 bits of the
+	 * second word equal (4 + 1.5 bytes).
+	 */
+	if (skops->family == AF_INET6 &&
+	    skops->local_ip6[0] == skops->remote_ip6[0] &&
+	    (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+	    (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+		if (op == BPF_SOCK_OPS_NEEDS_ECN)
+			rv = 1;
+		else if (op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
+			 op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
+			rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+					    cong, sizeof(cong));
+	}
+
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c
new file mode 100644
index 000000000000..e80d3afd24bd
--- /dev/null
+++ b/samples/bpf/tcp_dumpstats_kern.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Refer to samples/bpf/tcp_bpf.readme for the instructions on
+ * how to run this sample program.
+ */
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define INTERVAL 1000000000ULL
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Socket-local storage map (BPF_MAP_TYPE_SK_STORAGE, no preallocation) with
+ * an int key type and a __u64 value; _sockops() keeps the earliest time of
+ * the next stats dump for each socket in it.
+ */
+struct {
+ __u32 type;
+ __u32 map_flags;
+ int *key;
+ __u64 *value;
+} bpf_next_dump SEC(".maps") = {
+ .type = BPF_MAP_TYPE_SK_STORAGE,
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+
+/*
+ * sock_ops program that prints a few TCP counters per socket, at most once
+ * per INTERVAL.
+ *
+ * On TCP_CONNECT_CB it enables the RTT callback for the socket. On each
+ * RTT_CB it looks up (creating if needed) the socket's next-dump time,
+ * and if that time has passed prints dsack_dups, delivered, delivered_ce and
+ * icsk_retransmits, then moves the next-dump time INTERVAL ahead of now.
+ * Always returns 1.
+ */
+SEC("sockops")
+int _sockops(struct bpf_sock_ops *ctx)
+{
+ struct bpf_tcp_sock *tcp_sk;
+ struct bpf_sock *sk;
+ __u64 *next_dump;
+ __u64 now;
+
+ switch (ctx->op) {
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ /* Ask for RTT callbacks on this socket from now on. */
+ bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
+ return 1;
+ case BPF_SOCK_OPS_RTT_CB:
+ break;
+ default:
+ return 1;
+ }
+
+ sk = ctx->sk;
+ if (!sk)
+ return 1;
+
+ /* Fetch the per-socket slot, creating it on first use. */
+ next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!next_dump)
+ return 1;
+
+ /* Rate limit: skip until the stored deadline has been reached. */
+ now = bpf_ktime_get_ns();
+ if (now < *next_dump)
+ return 1;
+
+ tcp_sk = bpf_tcp_sock(sk);
+ if (!tcp_sk)
+ return 1;
+
+ *next_dump = now + INTERVAL;
+
+ bpf_printk("dsack_dups=%u delivered=%u\n",
+ tcp_sk->dsack_dups, tcp_sk->delivered);
+ bpf_printk("delivered_ce=%u icsk_retransmits=%u\n",
+ tcp_sk->delivered_ce, tcp_sk->icsk_retransmits);
+
+ return 1;
+}
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c
new file mode 100644
index 000000000000..d1444557358e
--- /dev/null
+++ b/samples/bpf/tcp_iw_kern.c
@@ -0,0 +1,83 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial congestion window and initial receive
+ * window to 40 packets and send and receive buffers to 1.5MB. This
+ * would usually be done after doing appropriate checks that indicate
+ * the hosts are far enough away (i.e. large RTT).
+ *
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define DEBUG 1
+
+/*
+ * sock_ops program: reply with an initial receive window of 40 packets for
+ * RWND_INIT, set SO_SNDBUF/SO_RCVBUF to 1500000 on TCP_CONNECT_CB and
+ * PASSIVE_ESTABLISHED_CB, and set the initial congestion window (TCP_BPF_IW)
+ * to 40 on ACTIVE_ESTABLISHED_CB. Where two helpers run, the reply is the
+ * sum of their return values; unknown ops reply -1. Always returns 1.
+ */
+SEC("sockops")
+int bpf_iw(struct bpf_sock_ops *skops)
+{
+ int bufsize = 1500000;
+ int rwnd_init = 40;
+ int iw = 40;
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute the rest of the BPF program
+ * if at least one of the port numbers is 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 &&
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Usually there would be a check to ensure the hosts are far
+ * from each other so it makes sense to increase buffer sizes
+ */
+ switch (op) {
+ case BPF_SOCK_OPS_RWND_INIT:
+ rv = rwnd_init;
+ break;
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ /* Set sndbuf and rcvbuf of active connections */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ /* Set the initial congestion window of active connections */
+ rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw,
+ sizeof(iw));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ /* Set sndbuf and rcvbuf of passive connections */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
+ break;
+ default:
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c
new file mode 100644
index 000000000000..223d9c23b10c
--- /dev/null
+++ b/samples/bpf/tcp_rwnd_kern.c
@@ -0,0 +1,64 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets when using IPv6
+ * and the first 5.5 bytes of the IPv6 addresses are not the same (in this
+ * example that means both hosts are not in the same datacenter).
+ *
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define DEBUG 1
+
+/*
+ * sock_ops program: reply with an initial receive window of 40 packets for
+ * BPF_SOCK_OPS_RWND_INIT on IPv6 connections whose endpoints differ within
+ * the first 5.5 bytes of their addresses (i.e. different datacenters in this
+ * example). Every other case replies -1. The program always returns 1.
+ */
+SEC("sockops")
+int bpf_rwnd(struct bpf_sock_ops *skops)
+{
+	int rv = -1;
+	int op;
+
+	/* For testing purposes, only execute the rest of the BPF program
+	 * if at least one of the port numbers is 55601.
+	 */
+	if (bpf_ntohl(skops->remote_port) != 55601 &&
+	    skops->local_port != 55601) {
+		skops->reply = -1;
+		return 1;
+	}
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	/* Check for RWND_INIT operation and IPv6 addresses */
+	if (op == BPF_SOCK_OPS_RWND_INIT &&
+	    skops->family == AF_INET6) {
+
+		/* If the first 5.5 bytes of the IPv6 address are not the same
+		 * then both hosts are not in the same datacenter
+		 * so use a larger initial advertised window (40 packets).
+		 *
+		 * 5.5 bytes = the whole first word plus the top 12 bits of
+		 * the second word, hence the 0xfff00000 mask (the same mask
+		 * the sibling samples use for this prefix test).
+		 */
+		if (skops->local_ip6[0] != skops->remote_ip6[0] ||
+		    (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) !=
+		    (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000))
+			rv = 40;
+	}
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c
new file mode 100644
index 000000000000..d58004eef124
--- /dev/null
+++ b/samples/bpf/tcp_synrto_kern.c
@@ -0,0 +1,64 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses
+ * and the first 5.5 bytes of the IPv6 addresses are the same (in this example
+ * that means both hosts are in the same datacenter).
+ *
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define DEBUG 1
+
+/*
+ * sock_ops program: reply 10 (ms) as the SYN / SYN-ACK RTO for
+ * BPF_SOCK_OPS_TIMEOUT_INIT on IPv6 connections whose endpoints share the
+ * first 5.5 bytes of their addresses. Every other case replies -1.
+ * The program always returns 1.
+ */
+SEC("sockops")
+int bpf_synrto(struct bpf_sock_ops *skops)
+{
+	int rv = -1;
+	int op;
+
+	/* Testing filter: bail out unless one of the ports is 55601. */
+	if (bpf_ntohl(skops->remote_port) != 55601 &&
+	    skops->local_port != 55601) {
+		skops->reply = -1;
+		return 1;
+	}
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	/* TIMEOUT_INIT on IPv6 with a shared 5.5-byte prefix (first word
+	 * plus top 12 bits of the second) means same datacenter: use a
+	 * 10ms RTO.
+	 */
+	if (op == BPF_SOCK_OPS_TIMEOUT_INIT &&
+	    skops->family == AF_INET6 &&
+	    skops->local_ip6[0] == skops->remote_ip6[0] &&
+	    (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+	    (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000))
+		rv = 10;
+
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c
new file mode 100644
index 000000000000..953fedc79ce1
--- /dev/null
+++ b/samples/bpf/tcp_tos_reflect_kern.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Facebook
+ *
+ * BPF program to automatically reflect TOS option from received syn packet
+ *
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <linux/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define DEBUG 1
+
+/*
+ * sock_ops program that reflects the TOS / traffic class of the received SYN
+ * onto the accepted socket.
+ *
+ * On TCP_LISTEN_CB it enables TCP_SAVE_SYN on the listener. On
+ * PASSIVE_ESTABLISHED_CB it reads the saved SYN's IP header via
+ * TCP_SAVED_SYN and, when the TOS (IPv4) or traffic class (IPv6) is
+ * non-zero, applies it with IP_TOS / IPV6_TCLASS. Unknown ops reply -1.
+ * The program always returns 1.
+ * NOTE(review): the function name is shared with tcp_basertt_kern.c and does
+ * not describe this program; it is kept because it is the program's symbol.
+ */
+SEC("sockops")
+int bpf_basertt(struct bpf_sock_ops *skops)
+{
+ /* Sized for the larger of the two headers that may be copied in. */
+ char header[sizeof(struct ipv6hdr)];
+ struct ipv6hdr *hdr6;
+ struct iphdr *hdr;
+ int hdr_size = 0;
+ int save_syn = 1;
+ int tos = 0;
+ int rv = 0;
+ int op;
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+ switch (op) {
+ case BPF_SOCK_OPS_TCP_LISTEN_CB:
+ rv = bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
+ &save_syn, sizeof(save_syn));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ /* Any family other than AF_INET is treated as IPv6 here. */
+ if (skops->family == AF_INET)
+ hdr_size = sizeof(struct iphdr);
+ else
+ hdr_size = sizeof(struct ipv6hdr);
+ rv = bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN,
+ header, hdr_size);
+ if (!rv) {
+ if (skops->family == AF_INET) {
+ hdr = (struct iphdr *) header;
+ tos = hdr->tos;
+ if (tos != 0)
+ bpf_setsockopt(skops, SOL_IP, IP_TOS,
+ &tos, sizeof(tos));
+ } else {
+ hdr6 = (struct ipv6hdr *) header;
+ /* Traffic class = priority nibble followed by the
+ * high nibble of the first flow-label byte.
+ */
+ tos = ((hdr6->priority) << 4 |
+ (hdr6->flow_lbl[0]) >> 4);
+ if (tos)
+ bpf_setsockopt(skops, SOL_IPV6,
+ IPV6_TCLASS,
+ &tos, sizeof(tos));
+ }
+ /* The result of the setsockopt calls above is not reported. */
+ rv = 0;
+ }
+ break;
+ default:
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_cls_bpf.sh b/samples/bpf/test_cls_bpf.sh
new file mode 100755
index 000000000000..aaddd67b37ff
--- /dev/null
+++ b/samples/bpf/test_cls_bpf.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Blast packets at $IFC with the pktgen netif_receive benchmark, then read
+# the "dropped" counter from the qdisc statistics.  A counter of "0," means
+# the filter dropped nothing, which is reported as FAIL.
+function pktgen {
+	local dropped
+
+	../pktgen/pktgen_bench_xmit_mode_netif_receive.sh -i $IFC -s 64 \
+		-m 90:e2:ba:ff:ff:ff -d 192.168.0.1 -t 4
+	dropped=$(tc -s qdisc show dev $IFC | tail -3 | awk '/drop/{print $7}')
+	if [[ $dropped == "0," ]]; then
+		echo "FAIL"
+		return
+	fi
+	echo "Successfully filtered " $dropped " packets"
+}
+
+# Attach section $2 of BPF object $1 as an ingress classifier on $IFC.
+# When the attach succeeds, run the pktgen check; the clsact qdisc is removed
+# again in either case.
+function test {
+	echo -n "Loading bpf program '$2'... "
+	tc qdisc add dev $IFC clsact
+	if tc filter add dev $IFC ingress bpf da obj $1 sec $2; then
+		echo "ok"
+		pktgen
+	else
+		echo "FAIL"
+	fi
+	tc qdisc del dev $IFC clsact
+}
+
+IFC=test_veth
+
+ip link add name $IFC type veth peer name pair_$IFC
+ip link set $IFC up
+ip link set pair_$IFC up
+
+test ./parse_simple.o simple
+test ./parse_varlen.o varlen
+test ./parse_ldabs.o ldabs
+ip link del dev $IFC
diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c
new file mode 100644
index 000000000000..1c161276d57b
--- /dev/null
+++ b/samples/bpf/test_lru_dist.c
@@ -0,0 +1,534 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2016 Facebook
+ */
+#define _GNU_SOURCE
+#include <linux/types.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sched.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <bpf/bpf.h>
+#include "bpf_util.h"
+
+/* Smaller of two values. Each argument may be evaluated twice. */
+#define min(a, b) ((a) < (b) ? (a) : (b))
+/* Fallback for toolchains whose headers did not already provide offsetof. */
+#ifndef offsetof
+# define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
+#endif
+/*
+ * Given a pointer to @member inside an object of @type, return a pointer to
+ * the enclosing object.  The temporary __mptr makes the compiler check that
+ * @ptr really has the member's type.
+ */
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/* Number of possible CPUs; set once in main() from bpf_num_possible_cpus(). */
+static int nr_cpus;
+/* Key sequence loaded from the distribution file by read_keys(). */
+static unsigned long long *dist_keys;
+/* Number of entries in dist_keys. */
+static unsigned int dist_key_counts;
+
+/*
+ * Circular doubly-linked list link.  An empty list is a head whose next and
+ * prev both point at itself (see INIT_LIST_HEAD()).
+ */
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+/* Turn @list into an empty list: both links point back at @list itself. */
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->prev = list->next = list;
+}
+
+/*
+ * Link @new between two entries that are known to be adjacent:
+ * @prev <-> @new <-> @next.
+ */
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	/* Forward links first ... */
+	new->next = next;
+	next->prev = new;
+	/* ... then the backward side. */
+	prev->next = new;
+	new->prev = prev;
+}
+
+/* Insert @new right after @head, i.e. at the front of the list. */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	struct list_head *first = head->next;
+
+	__list_add(new, head, first);
+}
+
+/* Unlink whatever sits between @prev and @next by joining the two. */
+static inline void __list_del(struct list_head *prev, struct list_head *next)
+{
+	prev->next = next;
+	next->prev = prev;
+}
+
+/* Remove @entry from its list; @entry's own links are left untouched. */
+static inline void __list_del_entry(struct list_head *entry)
+{
+	struct list_head *before = entry->prev;
+	struct list_head *after = entry->next;
+
+	__list_del(before, after);
+}
+
+/* Detach @list from wherever it is and re-insert it right after @head. */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+	__list_del(list->prev, list->next);
+	__list_add(list, head, head->next);
+}
+
+/* Object of @type that embeds the list link @ptr as its field @member. */
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/*
+ * Last object on the list headed by @ptr (the one reached through
+ * head->prev).  Not meaningful on an empty list, where prev is the head.
+ */
+#define list_last_entry(ptr, type, member) \
+ list_entry((ptr)->prev, type, member)
+
+/* One resident entry of the reference ("perfect") LRU. */
+struct pfect_lru_node {
+ /* Link in pfect_lru.list; entries are moved to the front when used. */
+ struct list_head list;
+ /* Key currently held by this node. */
+ unsigned long long key;
+};
+
+/*
+ * Userspace reference LRU used to compare against the BPF LRU map.
+ * Recency is tracked with a linked list; key -> node lookups go through a
+ * BPF hash map whose value is the node pointer (NULL once evicted).
+ */
+struct pfect_lru {
+ /* Recency list; most recently used entry is first. */
+ struct list_head list;
+ /* Backing array of lru_size nodes, handed out in order. */
+ struct pfect_lru_node *free_nodes;
+ /* Number of nodes handed out from free_nodes so far. */
+ unsigned int cur_size;
+ /* Capacity of the LRU. */
+ unsigned int lru_size;
+ /* Keys seen for the first time. */
+ unsigned int nr_unique;
+ /* Lookups that did not find a resident node. */
+ unsigned int nr_misses;
+ /* Total lookups performed. */
+ unsigned int total;
+ /* fd of the key -> node-pointer hash map. */
+ int map_fd;
+};
+
+/*
+ * Initialise @lru as an empty reference LRU holding at most @lru_size keys.
+ *
+ * @nr_possible_elems sizes the hash map used for key -> node lookup; it must
+ * cover every distinct key that will ever be inserted because evicted keys
+ * stay in the map with a NULL value.
+ *
+ * Aborts (assert) if the map or the node array cannot be created.
+ */
+static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size,
+			   unsigned int nr_possible_elems)
+{
+	lru->map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL,
+				     sizeof(unsigned long long),
+				     sizeof(struct pfect_lru_node *),
+				     nr_possible_elems, NULL);
+	/* libbpf reports failure as a negative errno, not necessarily -1. */
+	assert(lru->map_fd >= 0);
+
+	lru->free_nodes = calloc(lru_size, sizeof(*lru->free_nodes));
+	assert(lru->free_nodes);
+
+	INIT_LIST_HEAD(&lru->list);
+	lru->cur_size = 0;
+	lru->lru_size = lru_size;
+	lru->nr_unique = lru->nr_misses = lru->total = 0;
+}
+
+/* Release what pfect_lru_init() acquired: the node array and the map fd. */
+static void pfect_lru_destroy(struct pfect_lru *lru)
+{
+	int fd = lru->map_fd;
+
+	close(fd);
+	free(lru->free_nodes);
+}
+
+/*
+ * Access @key in the reference LRU, inserting it if it is not resident.
+ *
+ * Returns 1 if the key had been seen before (resident hit, or previously
+ * evicted) and 0 if this is its first appearance.  Counters in @lru are
+ * updated: total always, nr_misses whenever the key was not resident, and
+ * nr_unique on first appearance.
+ */
+static int pfect_lru_lookup_or_insert(struct pfect_lru *lru,
+				      unsigned long long key)
+{
+	struct pfect_lru_node *node = NULL;
+	int seen = 0;
+	int err;
+
+	lru->total++;
+	if (!bpf_map_lookup_elem(lru->map_fd, &key, &node)) {
+		if (node) {
+			/* Resident: just refresh its recency. */
+			list_move(&node->list, &lru->list);
+			return 1;
+		}
+		/* In the map with a NULL node: seen before, since evicted. */
+		seen = 1;
+	}
+
+	if (lru->cur_size < lru->lru_size) {
+		node = &lru->free_nodes[lru->cur_size++];
+		INIT_LIST_HEAD(&node->list);
+	} else {
+		struct pfect_lru_node *null_node = NULL;
+
+		/* Full: recycle the least recently used node and mark its
+		 * old key as evicted in the lookup map.
+		 */
+		node = list_last_entry(&lru->list,
+				       struct pfect_lru_node,
+				       list);
+		bpf_map_update_elem(lru->map_fd, &node->key, &null_node, BPF_EXIST);
+	}
+
+	node->key = key;
+	list_move(&node->list, &lru->list);
+
+	lru->nr_misses++;
+	if (!seen)
+		lru->nr_unique++;
+
+	/* Keep the update outside assert() so it still runs with NDEBUG. */
+	err = bpf_map_update_elem(lru->map_fd, &key, &node,
+				  seen ? BPF_EXIST : BPF_NOEXIST);
+	assert(!err);
+	(void)err;
+
+	return seen;
+}
+
+/*
+ * Load the key distribution from @dist_file: one decimal key per line.
+ *
+ * On return *@keys points to a malloc()ed array owned by the caller and the
+ * return value is the number of keys stored in it (0 for an empty file).
+ * Aborts (assert) on any I/O or allocation failure.
+ */
+static unsigned int read_keys(const char *dist_file,
+			      unsigned long long **keys)
+{
+	struct stat fst;
+	unsigned long long *retkeys;
+	unsigned int counts = 0;
+	ssize_t nread;
+	size_t len, i;
+	int dist_fd;
+	char *b, *l;
+	int err;
+
+	dist_fd = open(dist_file, O_RDONLY);
+	assert(dist_fd != -1);
+
+	/* Calls with side effects stay outside assert() so that they are
+	 * not compiled out with NDEBUG.
+	 */
+	err = fstat(dist_fd, &fst);
+	assert(err == 0);
+	(void)err;
+	len = fst.st_size;
+
+	/* One extra byte: strtok() below needs a NUL-terminated string. */
+	b = malloc(len + 1);
+	assert(b);
+
+	nread = read(dist_fd, b, len);
+	assert(nread >= 0 && (size_t)nread == len);
+	(void)nread;
+	close(dist_fd);
+	b[len] = '\0';
+
+	for (i = 0; i < len; i++) {
+		if (b[i] == '\n')
+			counts++;
+	}
+	counts++; /* in case the last line has no \n */
+
+	retkeys = malloc(counts * sizeof(unsigned long long));
+	assert(retkeys);
+
+	counts = 0;
+	for (l = strtok(b, "\n"); l; l = strtok(NULL, "\n"))
+		retkeys[counts++] = strtoull(l, NULL, 10);
+	free(b);
+
+	*keys = retkeys;
+
+	return counts;
+}
+
+/*
+ * Create a BPF map of @map_type with 8-byte keys and 8-byte values.
+ *
+ * @map_flags: BPF_F_* creation flags.
+ * @size:      max_entries for the map.
+ *
+ * Returns the map fd, or -1 after printing the error.  Any negative value
+ * from libbpf is normalised to -1 because callers test for exactly -1.
+ */
+static int create_map(int map_type, int map_flags, unsigned int size)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		.map_flags = map_flags,
+	);
+	int map_fd;
+
+	map_fd = bpf_map_create(map_type, NULL, sizeof(unsigned long long),
+				sizeof(unsigned long long), size, &opts);
+
+	if (map_fd < 0) {
+		perror("bpf_map_create");
+		return -1;
+	}
+
+	return map_fd;
+}
+
+/*
+ * Pin @pid to the first CPU, starting at @next_to_try, that
+ * sched_setaffinity() accepts.
+ *
+ * Returns the index following the last CPU tried, so the caller can pass it
+ * back in for the next task, or -1 if @next_to_try already equals nr_cpus.
+ */
+static int sched_next_online(int pid, int next_to_try)
+{
+	cpu_set_t mask;
+	int cpu = next_to_try;
+
+	if (cpu == nr_cpus)
+		return -1;
+
+	while (cpu < nr_cpus) {
+		CPU_ZERO(&mask);
+		CPU_SET(cpu, &mask);
+		cpu++;
+		if (sched_setaffinity(pid, sizeof(mask), &mask) == 0)
+			return cpu;
+	}
+
+	return cpu;
+}
+
+/*
+ * Fork @tasks children, pin each to its own CPU where possible, and run
+ * @fn(task_index, @data) in every child.  Returns once all children have
+ * exited; aborts (assert) if any child fails or exits non-zero.
+ */
+static void run_parallel(unsigned int tasks, void (*fn)(int i, void *data),
+			 void *data)
+{
+	int next_sched_cpu = 0;
+	unsigned int i;
+	pid_t *pid;
+
+	if (!tasks)
+		return;
+
+	/* Heap array instead of a VLA: the count comes from the command line. */
+	pid = calloc(tasks, sizeof(*pid));
+	assert(pid);
+
+	for (i = 0; i < tasks; i++) {
+		pid[i] = fork();
+		if (pid[i] == 0) {
+			next_sched_cpu = sched_next_online(0, next_sched_cpu);
+			fn((int)i, data);
+			exit(0);
+		} else if (pid[i] == -1) {
+			printf("couldn't spawn #%d process\n", (int)i);
+			exit(1);
+		}
+		/* It is mostly redundant and just allows the parent
+		 * process to update next_sched_cpu for the next child
+		 * process
+		 */
+		next_sched_cpu = sched_next_online(pid[i], next_sched_cpu);
+	}
+	for (i = 0; i < tasks; i++) {
+		int status;
+		pid_t reaped;
+
+		/* Reap outside assert() so it still happens with NDEBUG. */
+		reaped = waitpid(pid[i], &status, 0);
+		assert(reaped == pid[i]);
+		assert(status == 0);
+		(void)reaped;
+	}
+
+	free(pid);
+}
+
+/*
+ * Child body for test_parallel_lru_dist(): replay the key distribution
+ * against both the reference LRU and the BPF LRU map and print how many
+ * misses each one had.
+ *
+ * @task: index of this child; used to give every task a disjoint key range.
+ * @data: int[2] = { BPF LRU map fd, per-task LRU size }.
+ */
+static void do_test_lru_dist(int task, void *data)
+{
+	const int *args = data;
+	unsigned int nr_misses = 0;
+	struct pfect_lru pfect_lru;
+	unsigned long long key, value = 1234;
+	unsigned int i;
+
+	int lru_map_fd = args[0];
+	unsigned int lru_size = args[1];
+	/* Widen before multiplying so the offset cannot wrap at 32 bits. */
+	unsigned long long key_offset =
+		(unsigned long long)task * dist_key_counts;
+
+	pfect_lru_init(&pfect_lru, lru_size, dist_key_counts);
+
+	for (i = 0; i < dist_key_counts; i++) {
+		key = dist_keys[i] + key_offset;
+
+		pfect_lru_lookup_or_insert(&pfect_lru, key);
+
+		/* Present in the BPF LRU: a hit, nothing to insert. */
+		if (!bpf_map_lookup_elem(lru_map_fd, &key, &value))
+			continue;
+
+		if (bpf_map_update_elem(lru_map_fd, &key, &value, BPF_NOEXIST)) {
+			printf("bpf_map_update_elem(lru_map_fd, %llu): errno:%d\n",
+			       key, errno);
+			assert(0);
+		}
+
+		nr_misses++;
+	}
+
+	printf("    task:%d BPF LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
+	       task, pfect_lru.nr_unique, dist_key_counts, nr_misses,
+	       dist_key_counts);
+	printf("    task:%d Perfect LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
+	       task, pfect_lru.nr_unique, pfect_lru.total,
+	       pfect_lru.nr_misses, pfect_lru.total);
+
+	pfect_lru_destroy(&pfect_lru);
+	close(lru_map_fd);
+}
+
+/*
+ * Create one LRU map shared by @nr_tasks children and let each child replay
+ * the key distribution (see do_test_lru_dist()).  The map holds @lru_size
+ * entries per CPU with BPF_F_NO_COMMON_LRU, per task otherwise.
+ */
+static void test_parallel_lru_dist(int map_type, int map_flags,
+				   int nr_tasks, unsigned int lru_size)
+{
+	unsigned int max_entries;
+	int child_data[2];
+	int lru_map_fd;
+
+	printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
+	       map_flags);
+
+	max_entries = (map_flags & BPF_F_NO_COMMON_LRU) ?
+		      nr_cpus * lru_size : nr_tasks * lru_size;
+	lru_map_fd = create_map(map_type, map_flags, max_entries);
+	assert(lru_map_fd != -1);
+
+	child_data[1] = lru_size;
+	child_data[0] = lru_map_fd;
+
+	run_parallel(nr_tasks, do_test_lru_dist, child_data);
+
+	close(lru_map_fd);
+}
+
+/*
+ * Insert keys 1..1000 into a 900-entry LRU map (900 per CPU with
+ * BPF_F_NO_COMMON_LRU) while repeatedly looking up keys 101..900 to keep
+ * them active, then report how many keys from the old-unused (1..100),
+ * active (101..900) and new-unused (901..1000) ranges were evicted.
+ */
+static void test_lru_loss0(int map_type, int map_flags)
+{
+	unsigned long long key, value[nr_cpus];
+	unsigned int old_unused_losses = 0;
+	unsigned int new_unused_losses = 0;
+	unsigned int used_losses = 0;
+	int map_fd;
+	int err;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	/* Calls with side effects stay outside assert() (NDEBUG-safe). */
+	err = sched_next_online(0, 0);
+	assert(err != -1);
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		map_fd = create_map(map_type, map_flags, 900 * nr_cpus);
+	else
+		map_fd = create_map(map_type, map_flags, 900);
+
+	assert(map_fd != -1);
+
+	value[0] = 1234;
+
+	for (key = 1; key <= 1000; key++) {
+		/* Must be as wide as the map key (8 bytes): the kernel
+		 * copies key_size bytes from the pointer it is given.
+		 */
+		unsigned long long start_key, end_key;
+
+		err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST);
+		assert(err == 0);
+
+		start_key = 101;
+		end_key = min(key, 900);
+
+		while (start_key <= end_key) {
+			bpf_map_lookup_elem(map_fd, &start_key, value);
+			start_key++;
+		}
+	}
+	(void)err;
+
+	for (key = 1; key <= 1000; key++) {
+		if (bpf_map_lookup_elem(map_fd, &key, value)) {
+			if (key <= 100)
+				old_unused_losses++;
+			else if (key <= 900)
+				used_losses++;
+			else
+				new_unused_losses++;
+		}
+	}
+
+	close(map_fd);
+
+	printf("older-elem-losses:%u(/100) active-elem-losses:%u(/800) "
+	       "newer-elem-losses:%u(/100)\n",
+	       old_unused_losses, used_losses, new_unused_losses);
+}
+
+/*
+ * Insert keys 1..1000 into an LRU map sized for 1000 entries (1000 per CPU
+ * with BPF_F_NO_COMMON_LRU) and report how many of them can no longer be
+ * found afterwards.
+ */
+static void test_lru_loss1(int map_type, int map_flags)
+{
+	unsigned long long key, value[nr_cpus];
+	int map_fd;
+	unsigned int nr_losses = 0;
+	int err;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	/* Calls with side effects stay outside assert() (NDEBUG-safe). */
+	err = sched_next_online(0, 0);
+	assert(err != -1);
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		map_fd = create_map(map_type, map_flags, 1000 * nr_cpus);
+	else
+		map_fd = create_map(map_type, map_flags, 1000);
+
+	assert(map_fd != -1);
+
+	value[0] = 1234;
+
+	for (key = 1; key <= 1000; key++) {
+		err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST);
+		assert(!err);
+	}
+	(void)err;
+
+	for (key = 1; key <= 1000; key++) {
+		if (bpf_map_lookup_elem(map_fd, &key, value))
+			nr_losses++;
+	}
+
+	close(map_fd);
+
+	printf("nr_losses:%u(/1000)\n", nr_losses);
+}
+
+/*
+ * Child body for test_parallel_lru_loss().
+ *
+ * Inserts 1000 "stable" keys, then performs 100000 random operations (about
+ * nine in ten are lookups of a stable key, the rest insert a fresh key) and
+ * finally prints how many stable keys were evicted.
+ *
+ * @task: index of this child; selects a key range disjoint from other tasks.
+ * @data: pointer to the shared map fd (int).
+ */
+static void do_test_parallel_lru_loss(int task, void *data)
+{
+	const unsigned int nr_stable_elems = 1000;
+	const unsigned int nr_repeats = 100000;
+
+	int map_fd = *(int *)data;
+	unsigned long long stable_base;
+	unsigned long long key, value[nr_cpus];
+	unsigned long long next_ins_key;
+	unsigned int nr_losses = 0;
+	unsigned int i;
+	int err;
+
+	/* Widen before multiplying so the base cannot wrap at 32 bits. */
+	stable_base = (unsigned long long)task * nr_repeats * 2 + 1;
+	next_ins_key = stable_base;
+	value[0] = 1234;
+	for (i = 0; i < nr_stable_elems; i++) {
+		/* Update kept outside assert() so it survives NDEBUG. */
+		err = bpf_map_update_elem(map_fd, &next_ins_key, value,
+					  BPF_NOEXIST);
+		assert(err == 0);
+		(void)err;
+		next_ins_key++;
+	}
+
+	for (i = 0; i < nr_repeats; i++) {
+		int rn;
+
+		rn = rand();
+
+		if (rn % 10) {
+			key = rn % nr_stable_elems + stable_base;
+			bpf_map_lookup_elem(map_fd, &key, value);
+		} else {
+			bpf_map_update_elem(map_fd, &next_ins_key, value,
+					    BPF_NOEXIST);
+			next_ins_key++;
+		}
+	}
+
+	key = stable_base;
+	for (i = 0; i < nr_stable_elems; i++) {
+		if (bpf_map_lookup_elem(map_fd, &key, value))
+			nr_losses++;
+		key++;
+	}
+
+	printf("    task:%d nr_losses:%u\n", task, nr_losses);
+}
+
+/*
+ * Run do_test_parallel_lru_loss() in @nr_tasks children sharing one LRU map
+ * that is sized 20% above the stable working set of 1000 keys, per CPU with
+ * BPF_F_NO_COMMON_LRU and per task otherwise.
+ */
+static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks)
+{
+	/* Give 20% more than the active working set */
+	const int per_unit = 1000 + 200;
+	int units;
+	int map_fd;
+
+	printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
+	       map_flags);
+
+	units = (map_flags & BPF_F_NO_COMMON_LRU) ? nr_cpus : nr_tasks;
+	map_fd = create_map(map_type, map_flags, units * per_unit);
+
+	assert(map_fd != -1);
+
+	run_parallel(nr_tasks, do_test_parallel_lru_loss, &map_fd);
+
+	close(map_fd);
+}
+
+/*
+ * Usage: test_lru_dist <dist-file> <lru-size> <nr-tasks>
+ *
+ * Runs every LRU test once with the common LRU list and once with
+ * BPF_F_NO_COMMON_LRU.  <nr-tasks> is capped at the number of possible
+ * CPUs.  Returns 0 on success and -1 on bad arguments or an empty key file.
+ */
+int main(int argc, char **argv)
+{
+	int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
+	const char *dist_file;
+	int nr_tasks = 1;
+	int lru_size;
+	unsigned int f;
+
+	if (argc < 4) {
+		printf("Usage: %s <dist-file> <lru-size> <nr-tasks>\n",
+		       argv[0]);
+		return -1;
+	}
+
+	dist_file = argv[1];
+	lru_size = atoi(argv[2]);
+	nr_tasks = atoi(argv[3]);
+
+	/* Both values end up as unsigned sizes and counts further down;
+	 * zero or negative input would create empty maps or huge arrays.
+	 */
+	if (lru_size <= 0 || nr_tasks <= 0) {
+		printf("<lru-size> and <nr-tasks> must be positive integers\n");
+		return -1;
+	}
+
+	setbuf(stdout, NULL);
+
+	srand(time(NULL));
+
+	nr_cpus = bpf_num_possible_cpus();
+	assert(nr_cpus != -1);
+	printf("nr_cpus:%d\n\n", nr_cpus);
+
+	nr_tasks = min(nr_tasks, nr_cpus);
+
+	dist_key_counts = read_keys(dist_file, &dist_keys);
+	if (!dist_key_counts) {
+		printf("%s has no key\n", dist_file);
+		free(dist_keys);
+		return -1;
+	}
+
+	for (f = 0; f < ARRAY_SIZE(map_flags); f++) {
+		test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
+		test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
+		test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
+				       nr_tasks);
+		test_parallel_lru_dist(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
+				       nr_tasks, lru_size);
+		printf("\n");
+	}
+
+	free(dist_keys);
+
+	return 0;
+}
diff --git a/samples/bpf/test_lwt_bpf.c b/samples/bpf/test_lwt_bpf.c
new file mode 100644
index 000000000000..9a13dbb81847
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.c
@@ -0,0 +1,245 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include "vmlinux.h"
+#include "net_shared.h"
+#include <bpf/bpf_helpers.h>
+#include <string.h>
+
+/*
+ * printf-style tracing helper.  The format string is copied into a local
+ * array and passed to bpf_trace_printk() together with its size.
+ */
+# define printk(fmt, ...) \
+ ({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+ })
+
+#define CB_MAGIC 1234
+
+/* Test: Pass all packets through */
+/* Does no work and emits no trace output; always returns BPF_OK. */
+SEC("nop")
+int do_nop(struct __sk_buff *skb)
+{
+ return BPF_OK;
+}
+
+/* Test: Verify context information can be accessed */
+/*
+ * Writes CB_MAGIC into cb[0], then traces two lines per packet: first
+ * len/hash/protocol, then cb[0]/ingress_ifindex/ifindex.  Always BPF_OK.
+ */
+SEC("test_ctx")
+int do_test_ctx(struct __sk_buff *skb)
+{
+ skb->cb[0] = CB_MAGIC;
+ printk("len %d hash %d protocol %d", skb->len, skb->hash,
+ skb->protocol);
+ printk("cb %d ingress_ifindex %d ifindex %d", skb->cb[0],
+ skb->ingress_ifindex, skb->ifindex);
+
+ return BPF_OK;
+}
+
+/* Test: Ensure skb->cb[] buffer is cleared */
+/*
+ * Traces cb[0..4] in hex on two lines without writing them first, so the
+ * output shows whatever the program was handed.  Always BPF_OK.
+ */
+SEC("test_cb")
+int do_test_cb(struct __sk_buff *skb)
+{
+ printk("cb0: %x cb1: %x cb2: %x", skb->cb[0], skb->cb[1],
+ skb->cb[2]);
+ printk("cb3: %x cb4: %x", skb->cb[3], skb->cb[4]);
+
+ return BPF_OK;
+}
+
+/* Test: Verify skb data can be read */
+/*
+ * Treats the start of the packet data as an IPv4 header and traces its
+ * source and destination addresses.  Returns BPF_DROP when the packet is
+ * shorter than an IPv4 header, BPF_OK otherwise.
+ */
+SEC("test_data")
+int do_test_data(struct __sk_buff *skb)
+{
+ void *data = (void *)(long)skb->data;
+ void *data_end = (void *)(long)skb->data_end;
+ struct iphdr *iph = data;
+
+ /* Bounds check before dereferencing iph. */
+ if (data + sizeof(*iph) > data_end) {
+ printk("packet truncated");
+ return BPF_DROP;
+ }
+
+ printk("src: %x dst: %x", iph->saddr, iph->daddr);
+
+ return BPF_OK;
+}
+
+/* Field offsets within the IPv4 header. */
+#define IP_CSUM_OFF offsetof(struct iphdr, check)
+#define IP_DST_OFF offsetof(struct iphdr, daddr)
+#define IP_SRC_OFF offsetof(struct iphdr, saddr)
+#define IP_PROTO_OFF offsetof(struct iphdr, protocol)
+/* Checksum field offsets within the TCP / UDP header itself. */
+#define TCP_CSUM_OFF offsetof(struct tcphdr, check)
+#define UDP_CSUM_OFF offsetof(struct udphdr, check)
+/* NOTE(review): presumably mirrors the helper's pseudo-header flag - confirm. */
+#define IS_PSEUDO 0x10
+
+/*
+ * Replace an IPv4 address in the packet and fix up the checksums.
+ *
+ * @old_ip:   address currently in the header (as read from the packet).
+ * @new_ip:   address to write.
+ * @rw_daddr: non-zero rewrites the destination address, zero the source.
+ *
+ * Returns BPF_OK on success, BPF_DROP if any helper fails (the failure is
+ * traced).
+ */
+static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip,
+			  uint32_t new_ip, int rw_daddr)
+{
+	int ret, off = 0, flags = IS_PSEUDO;
+	uint8_t proto;
+
+	ret = bpf_skb_load_bytes(skb, IP_PROTO_OFF, &proto, 1);
+	if (ret < 0) {
+		printk("bpf_skb_load_bytes failed: %d", ret);
+		return BPF_DROP;
+	}
+
+	/* NOTE(review): these offsets are relative to the start of the L4
+	 * header but are passed on unchanged as packet offsets - confirm
+	 * whether the IP header length needs to be added.
+	 */
+	switch (proto) {
+	case IPPROTO_TCP:
+		off = TCP_CSUM_OFF;
+		break;
+
+	case IPPROTO_UDP:
+		off = UDP_CSUM_OFF;
+		flags |= BPF_F_MARK_MANGLED_0;
+		break;
+
+	case IPPROTO_ICMPV6:
+		off = offsetof(struct icmp6hdr, icmp6_cksum);
+		break;
+	}
+
+	/* off stays 0 for protocols without a checksum handled here. */
+	if (off) {
+		ret = bpf_l4_csum_replace(skb, off, old_ip, new_ip,
+					  flags | sizeof(new_ip));
+		if (ret < 0) {
+			printk("bpf_l4_csum_replace failed: %d", ret);
+			return BPF_DROP;
+		}
+	}
+
+	ret = bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
+	if (ret < 0) {
+		printk("bpf_l3_csum_replace failed: %d", ret);
+		return BPF_DROP;
+	}
+
+	if (rw_daddr)
+		ret = bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0);
+	else
+		ret = bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0);
+
+	if (ret < 0) {
+		printk("bpf_skb_store_bytes() failed: %d", ret);
+		return BPF_DROP;
+	}
+
+	return BPF_OK;
+}
+
+/* Test: Verify skb data can be modified */
+/*
+ * Packets whose destination address equals 0x2fea8c0 get it rewritten to
+ * 0x3fea8c0 via rewrite(); everything else passes untouched.
+ * NOTE(review): on a little-endian host these look like 192.168.254.2 and
+ * 192.168.254.3 in network byte order - confirm.
+ */
+SEC("test_rewrite")
+int do_test_rewrite(struct __sk_buff *skb)
+{
+	uint32_t new_ip = 0x3fea8c0;
+	uint32_t old_ip;
+	int ret;
+
+	ret = bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, 4);
+	if (ret < 0) {
+		printk("bpf_skb_load_bytes failed: %d", ret);
+		return BPF_DROP;
+	}
+
+	/* Not the address under test: leave the packet alone. */
+	if (old_ip != 0x2fea8c0)
+		return BPF_OK;
+
+	printk("out: rewriting from %x to %x", old_ip, new_ip);
+	return rewrite(skb, old_ip, new_ip, 1);
+}
+
+/*
+ * Grow the packet head by one Ethernet header, fill that header from the
+ * build-time SRC_MAC / DST_MAC constants with EtherType IPv4, and redirect
+ * the packet to DST_IFINDEX.
+ *
+ * Returns the result of bpf_redirect(), or BPF_DROP if making room for or
+ * writing the header fails (the failure is traced).
+ */
+static inline int __do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+	uint64_t smac = SRC_MAC, dmac = DST_MAC;
+	int ret, ifindex = DST_IFINDEX;
+	struct ethhdr ehdr;
+
+	ret = bpf_skb_change_head(skb, 14, 0);
+	if (ret < 0) {
+		printk("skb_change_head() failed: %d", ret);
+		/* Without the extra room the store below would overwrite
+		 * the start of the existing packet.
+		 */
+		return BPF_DROP;
+	}
+
+	ehdr.h_proto = bpf_htons(ETH_P_IP);
+	/* Only the low 6 bytes of the 64-bit constants are the address. */
+	memcpy(&ehdr.h_source, &smac, 6);
+	memcpy(&ehdr.h_dest, &dmac, 6);
+
+	ret = bpf_skb_store_bytes(skb, 0, &ehdr, sizeof(ehdr), 0);
+	if (ret < 0) {
+		printk("skb_store_bytes() failed: %d", ret);
+		return BPF_DROP;
+	}
+
+	return bpf_redirect(ifindex, 0);
+}
+
+/* Push an L2 header and redirect, without tracing the success case. */
+SEC("push_ll_and_redirect_silent")
+int do_push_ll_and_redirect_silent(struct __sk_buff *skb)
+{
+ return __do_push_ll_and_redirect(skb);
+}
+
+/*
+ * Push an L2 header and redirect; traces "redirected to <ifindex>" whenever
+ * the helper chain returned a non-negative value.
+ */
+SEC("push_ll_and_redirect")
+int do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+	int ifindex = DST_IFINDEX;
+	int ret = __do_push_ll_and_redirect(skb);
+
+	if (ret < 0)
+		return ret;
+
+	printk("redirected to %d", ifindex);
+	return ret;
+}
+
+/*
+ * Overwrite the first 96 bytes of the packet with 0xFF, eight bytes at a
+ * time (12 stores at offsets 0, 8, ..., 88).  Helper return values are
+ * ignored.
+ */
+static inline void __fill_garbage(struct __sk_buff *skb)
+{
+ uint64_t f = 0xFFFFFFFFFFFFFFFF;
+
+ bpf_skb_store_bytes(skb, 0, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 8, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 16, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 24, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 32, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 40, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 48, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 56, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 64, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 72, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 80, &f, sizeof(f), 0);
+ bpf_skb_store_bytes(skb, 88, &f, sizeof(f), 0);
+}
+
+/* Corrupt the first 96 bytes of the packet, trace it, and let it pass. */
+SEC("fill_garbage")
+int do_fill_garbage(struct __sk_buff *skb)
+{
+ __fill_garbage(skb);
+ printk("Set initial 96 bytes of header to FF");
+ return BPF_OK;
+}
+
+/*
+ * Corrupt the first 96 bytes of the packet and redirect it to DST_IFINDEX
+ * without pushing an L2 header first.
+ */
+SEC("fill_garbage_and_redirect")
+int do_fill_garbage_and_redirect(struct __sk_buff *skb)
+{
+ int ifindex = DST_IFINDEX;
+ __fill_garbage(skb);
+ printk("redirected to %d", ifindex);
+ return bpf_redirect(ifindex, 0);
+}
+
+/* Drop all packets */
+/* Traces the numeric value of BPF_DROP for every packet, then drops it. */
+SEC("drop_all")
+int do_drop_all(struct __sk_buff *skb)
+{
+ printk("dropping with: %d", BPF_DROP);
+ return BPF_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh
new file mode 100755
index 000000000000..148e2df6cdce
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.sh
@@ -0,0 +1,405 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Uncomment to see generated bytecode
+#VERBOSE=verbose
+
+NS1=lwt_ns1
+NS2=lwt_ns2
+VETH0=tst_lwt1a
+VETH1=tst_lwt1b
+VETH2=tst_lwt2a
+VETH3=tst_lwt2b
+IPVETH0="192.168.254.1"
+IPVETH1="192.168.254.2"
+IPVETH1b="192.168.254.3"
+
+IPVETH2="192.168.111.1"
+IPVETH3="192.168.111.2"
+
+IP_LOCAL="192.168.99.1"
+
+PROG_SRC="test_lwt_bpf.c"
+BPF_PROG="test_lwt_bpf.o"
+TRACE_ROOT=/sys/kernel/tracing
+CONTEXT_INFO=$(cat ${TRACE_ROOT}/trace_options | grep context)
+
+# Print the MAC address of device $1 (looked up inside namespace $2 when
+# given) as one hex constant with the bytes in reverse order, e.g.
+# aa:bb:cc:dd:ee:ff -> 0xffeeddccbbaa.
+function lookup_mac()
+{
+	set +x
+	local netns_cmd=""
+
+	if [ -n "$2" ]; then
+		netns_cmd="ip netns exec $2"
+	fi
+	MAC=$($netns_cmd ip link show $1 | grep ether | awk '{print $2}')
+	MAC="${MAC//:/}"
+	echo "0x${MAC:10:2}${MAC:8:2}${MAC:6:2}${MAC:4:2}${MAC:2:2}${MAC:0:2}"
+	set -x
+}
+
+function cleanup {
+ set +ex
+ rm $BPF_PROG 2> /dev/null
+ ip link del $VETH0 2> /dev/null
+ ip link del $VETH1 2> /dev/null
+ ip link del $VETH2 2> /dev/null
+ ip link del $VETH3 2> /dev/null
+ ip netns exec $NS1 killall netserver
+ ip netns delete $NS1 2> /dev/null
+ ip netns delete $NS2 2> /dev/null
+ set -ex
+}
+
+# setup_one_veth NS HOST_DEV PEER_DEV HOST_IP PEER_IP [EXTRA_PEER_IP]
+# Create namespace NS and a veth pair; HOST_DEV stays in the current
+# namespace with HOST_IP/24, PEER_DEV moves into NS with PEER_IP/24 and,
+# when given, EXTRA_PEER_IP/32.
+function setup_one_veth {
+	local ns=$1 host_dev=$2 peer_dev=$3
+	local host_ip=$4 peer_ip=$5 extra_ip=$6
+
+	ip netns add $ns
+	ip link add $host_dev type veth peer name $peer_dev
+	ip link set dev $host_dev up
+	ip addr add $host_ip/24 dev $host_dev
+	ip link set $peer_dev netns $ns
+	ip netns exec $ns ip link set dev $peer_dev up
+	ip netns exec $ns ip addr add $peer_ip/24 dev $peer_dev
+
+	if [ -n "$extra_ip" ]; then
+		ip netns exec $ns ip addr add $extra_ip/32 dev $peer_dev
+	fi
+}
+
+# Print the current trace buffer without its '#' header lines.
+function get_trace {
+	set +x
+	grep -v '^#' ${TRACE_ROOT}/trace
+	set -x
+}
+
+function cleanup_routes {
+ ip route del ${IPVETH1}/32 dev $VETH0 2> /dev/null || true
+ ip route del table local local ${IP_LOCAL}/32 dev lo 2> /dev/null || true
+}
+
+function install_test {
+ cleanup_routes
+ cp /dev/null ${TRACE_ROOT}/trace
+
+ OPTS="encap bpf headroom 14 $1 obj $BPF_PROG section $2 $VERBOSE"
+
+ if [ "$1" == "in" ]; then
+ ip route add table local local ${IP_LOCAL}/32 $OPTS dev lo
+ else
+ ip route add ${IPVETH1}/32 $OPTS dev $VETH0
+ fi
+}
+
+function remove_prog {
+ if [ "$1" == "in" ]; then
+ ip route del table local local ${IP_LOCAL}/32 dev lo
+ else
+ ip route del ${IPVETH1}/32 dev $VETH0
+ fi
+}
+
+function filter_trace {
+ # Add newline to allow starting EXPECT= variables on newline
+ NL=$'\n'
+ echo "${NL}$*" | sed -e 's/bpf_trace_printk: //g'
+}
+
+function expect_fail {
+ set +x
+ echo "FAIL:"
+ echo "Expected: $1"
+ echo "Got: $2"
+ set -x
+ exit 1
+}
+
+function match_trace {
+ set +x
+ RET=0
+ TRACE=$1
+ EXPECT=$2
+ GOT="$(filter_trace "$TRACE")"
+
+ [ "$GOT" != "$EXPECT" ] && {
+ expect_fail "$EXPECT" "$GOT"
+ RET=1
+ }
+ set -x
+ return $RET
+}
+
+function test_start {
+ set +x
+ echo "----------------------------------------------------------------"
+ echo "Starting test: $*"
+ echo "----------------------------------------------------------------"
+ set -x
+}
+
+function failure {
+ get_trace
+ echo "FAIL: $*"
+ exit 1
+}
+
+# Run test_ctx on the xmit hook: three pings must succeed and each must
+# produce the two context lines, with ifindex equal to $DST_IFINDEX.
+function test_ctx_xmit {
+ test_start "test_ctx on lwt xmit"
+ install_test xmit test_ctx
+ ping -c 3 $IPVETH1 || {
+ failure "test_ctx xmit: packets are dropped"
+ }
+ match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX" || exit 1
+ remove_prog xmit
+}
+
+# Run test_ctx on the out hook: three pings must succeed and each must
+# produce the two context lines, with ifindex reported as 0.
+function test_ctx_out {
+ test_start "test_ctx on lwt out"
+ install_test out test_ctx
+ ping -c 3 $IPVETH1 || {
+ failure "test_ctx out: packets are dropped"
+ }
+ match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex 0
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex 0
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex 0" || exit 1
+ remove_prog out
+}
+
+# Run test_ctx on the in hook against $IP_LOCAL: three pings must succeed and
+# the trace must contain six pairs of context lines.
+function test_ctx_in {
+	test_start "test_ctx on lwt in"
+	install_test in test_ctx
+	ping -c 3 $IP_LOCAL || {
+		failure "test_ctx in: packets are dropped"
+	}
+	# We will see both request & reply packets as the packets will
+	# be from $IP_LOCAL => $IP_LOCAL
+	match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1" || exit 1
+	remove_prog in
+}
+
+# test_data HOOK
+# Run test_data on HOOK: three pings to $IPVETH1 must succeed and each must
+# trace the expected source/destination address pair.
+function test_data {
+ test_start "test_data on lwt $1"
+ install_test $1 test_data
+ ping -c 3 $IPVETH1 || {
+ failure "test_data ${1}: packets are dropped"
+ }
+ match_trace "$(get_trace)" "
+src: 1fea8c0 dst: 2fea8c0
+src: 1fea8c0 dst: 2fea8c0
+src: 1fea8c0 dst: 2fea8c0" || exit 1
+ remove_prog $1
+}
+
+# Run test_data on the in hook against $IP_LOCAL: three pings must succeed
+# and the trace must contain six address lines.
+function test_data_in {
+ test_start "test_data on lwt in"
+ install_test in test_data
+ ping -c 3 $IP_LOCAL || {
+ failure "test_data in: packets are dropped"
+ }
+ # We will see both request & reply packets as the packets will
+ # be from $IP_LOCAL => $IP_LOCAL
+ match_trace "$(get_trace)" "
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0" || exit 1
+ remove_prog in
+}
+
+# test_cb HOOK
+# Run test_cb on HOOK: three pings to $IPVETH1 must succeed and every packet
+# must report cb[0..4] as zero.
+function test_cb {
+ test_start "test_cb on lwt $1"
+ install_test $1 test_cb
+ ping -c 3 $IPVETH1 || {
+ failure "test_cb ${1}: packets are dropped"
+ }
+ match_trace "$(get_trace)" "
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0" || exit 1
+ remove_prog $1
+}
+
+# Run test_cb on the in hook against $IP_LOCAL: three pings must succeed and
+# all six traced packets must report cb[0..4] as zero.
+function test_cb_in {
+ test_start "test_cb on lwt in"
+ install_test in test_cb
+ ping -c 3 $IP_LOCAL || {
+ failure "test_cb in: packets are dropped"
+ }
+ # We will see both request & reply packets as the packets will
+ # be from $IP_LOCAL => $IP_LOCAL
+ match_trace "$(get_trace)" "
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0" || exit 1
+ remove_prog in
+}
+
+# test_drop_all HOOK
+# Run drop_all on HOOK: ping to $IPVETH1 must fail and three drops must be
+# traced.
+function test_drop_all {
+ test_start "test_drop_all on lwt $1"
+ install_test $1 drop_all
+ ping -c 3 $IPVETH1 && {
+ failure "test_drop_all ${1}: Unexpected success of ping"
+ }
+ match_trace "$(get_trace)" "
+dropping with: 2
+dropping with: 2
+dropping with: 2" || exit 1
+ remove_prog $1
+}
+
+# Run drop_all on the in hook: ping to $IP_LOCAL must fail and three drops
+# must be traced.
+function test_drop_all_in {
+ test_start "test_drop_all on lwt in"
+ install_test in drop_all
+ ping -c 3 $IP_LOCAL && {
+ failure "test_drop_all in: Unexpected success of ping"
+ }
+ match_trace "$(get_trace)" "
+dropping with: 2
+dropping with: 2
+dropping with: 2" || exit 1
+ remove_prog in
+}
+
+# Run push_ll_and_redirect on the xmit hook: pings must still succeed and
+# each packet must be traced as redirected to $DST_IFINDEX.
+function test_push_ll_and_redirect {
+ test_start "test_push_ll_and_redirect on lwt xmit"
+ install_test xmit push_ll_and_redirect
+ ping -c 3 $IPVETH1 || {
+ failure "Redirected packets appear to be dropped"
+ }
+ match_trace "$(get_trace)" "
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX" || exit 1
+ remove_prog xmit
+}
+
+# Run fill_garbage_and_redirect on the xmit hook: the packets are redirected
+# without a valid L2 header, so ping must fail while the redirects are still
+# traced.
+function test_no_l2_and_redirect {
+ test_start "test_no_l2_and_redirect on lwt xmit"
+ install_test xmit fill_garbage_and_redirect
+ ping -c 3 $IPVETH1 && {
+ failure "Unexpected success despite lack of L2 header"
+ }
+ match_trace "$(get_trace)" "
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX" || exit 1
+ remove_prog xmit
+}
+
+# Run test_rewrite on the xmit hook: pings to $IPVETH1 must still succeed
+# and every packet must be traced as rewritten.
+function test_rewrite {
+	test_start "test_rewrite on lwt xmit"
+	install_test xmit test_rewrite
+	ping -c 3 $IPVETH1 || {
+		failure "Rewritten packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" "
+out: rewriting from 2fea8c0 to 3fea8c0
+out: rewriting from 2fea8c0 to 3fea8c0
+out: rewriting from 2fea8c0 to 3fea8c0" || exit 1
+	# Remove with the same hook name the program was installed on.
+	remove_prog xmit
+}
+
+# Run fill_garbage on the xmit hook: the corrupted packets must not get a
+# reply, and every packet must be traced as overwritten.
+function test_fill_garbage {
+	test_start "test_fill_garbage on lwt xmit"
+	install_test xmit fill_garbage
+	ping -c 3 $IPVETH1 && {
+		failure "test_fill_garbage: Unexpected success of ping"
+	}
+	match_trace "$(get_trace)" "
+Set initial 96 bytes of header to FF
+Set initial 96 bytes of header to FF
+Set initial 96 bytes of header to FF" || exit 1
+	remove_prog xmit
+}
+
+# Run a netperf TCP_STREAM through the nop program on the xmit hook; the
+# transfer must succeed and the trace must stay empty.
+function test_netperf_nop {
+ test_start "test_netperf_nop on lwt xmit"
+ install_test xmit nop
+ netperf -H $IPVETH1 -t TCP_STREAM || {
+ failure "packets appear to be dropped"
+ }
+ match_trace "$(get_trace)" ""|| exit 1
+ remove_prog xmit
+}
+
+# Run a netperf TCP_STREAM through the silent push-L2-and-redirect program on
+# the xmit hook; the transfer must succeed and the trace must stay empty.
+function test_netperf_redirect {
+ test_start "test_netperf_redirect on lwt xmit"
+ install_test xmit push_ll_and_redirect_silent
+ netperf -H $IPVETH1 -t TCP_STREAM || {
+ failure "Rewritten packets appear to be dropped"
+ }
+ match_trace "$(get_trace)" ""|| exit 1
+ remove_prog xmit
+}
+
+cleanup
+setup_one_veth $NS1 $VETH0 $VETH1 $IPVETH0 $IPVETH1 $IPVETH1b
+setup_one_veth $NS2 $VETH2 $VETH3 $IPVETH2 $IPVETH3
+ip netns exec $NS1 netserver
+echo 1 > ${TRACE_ROOT}/tracing_on
+echo nocontext-info > ${TRACE_ROOT}/trace_options
+
+DST_MAC=$(lookup_mac $VETH1 $NS1)
+SRC_MAC=$(lookup_mac $VETH0)
+DST_IFINDEX=$(cat /sys/class/net/$VETH0/ifindex)
+
+CLANG_OPTS="-O2 --target=bpf -I ../include/"
+CLANG_OPTS+=" -DSRC_MAC=$SRC_MAC -DDST_MAC=$DST_MAC -DDST_IFINDEX=$DST_IFINDEX"
+clang $CLANG_OPTS -c $PROG_SRC -o $BPF_PROG
+
+test_ctx_xmit
+test_ctx_out
+test_ctx_in
+test_data "xmit"
+test_data "out"
+test_data_in
+test_cb "xmit"
+test_cb "out"
+test_cb_in
+test_drop_all "xmit"
+test_drop_all "out"
+test_drop_all_in
+test_rewrite
+test_push_ll_and_redirect
+test_no_l2_and_redirect
+test_fill_garbage
+test_netperf_nop
+test_netperf_redirect
+
+cleanup
+echo 0 > ${TRACE_ROOT}/tracing_on
+echo $CONTEXT_INFO > ${TRACE_ROOT}/trace_options
+exit 0
diff --git a/samples/bpf/test_map_in_map.bpf.c b/samples/bpf/test_map_in_map.bpf.c
new file mode 100644
index 000000000000..9f030f9c4e1b
--- /dev/null
+++ b/samples/bpf/test_map_in_map.bpf.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#define MAX_NR_PORTS 65536
+
+#define EINVAL 22
+#define ENOENT 2
+
+/* map #0 */
+/* Inner array map: port -> result, one slot per possible port. */
+struct inner_a {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, int);
+ __uint(max_entries, MAX_NR_PORTS);
+} port_a SEC(".maps");
+
+/* map #1 */
+/* Inner hash map: port -> result, single entry. */
+struct inner_h {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, int);
+ __uint(max_entries, 1);
+} port_h SEC(".maps");
+
+/* map #2 */
+/* Result of the regular (pointer-based) inner-map lookup, stored at key 0. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, int);
+ __uint(max_entries, 1);
+} reg_result_h SEC(".maps");
+
+/* map #3 */
+/* Result of the lookup made against the named inner map, stored at key 0. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, int);
+ __uint(max_entries, 1);
+} inline_result_h SEC(".maps");
+
+/* map #4 */ /* Test case #0 */
+/* Outer array of maps, keyed by port. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, MAX_NR_PORTS);
+ __uint(key_size, sizeof(u32));
+ __array(values, struct inner_a); /* use inner_a as inner map */
+} a_of_port_a SEC(".maps");
+
+/* map #5 */ /* Test case #1 */
+/* Outer hash of maps holding an array inner map. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(u32));
+ __array(values, struct inner_a); /* use inner_a as inner map */
+} h_of_port_a SEC(".maps");
+
+/* map #6 */ /* Test case #2 */
+/* Outer hash of maps holding a hash inner map. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(u32));
+ __array(values, struct inner_h); /* use inner_h as inner map */
+} h_of_port_h SEC(".maps");
+
+/*
+ * Look up @port in @inner_map through the pointer obtained from the outer
+ * map.  Returns the stored value, or -ENOENT when there is no entry.
+ */
+static __always_inline int do_reg_lookup(void *inner_map, u32 port)
+{
+	int *val = bpf_map_lookup_elem(inner_map, &port);
+
+	if (!val)
+		return -ENOENT;
+	return *val;
+}
+
+/*
+ * Look up @port directly in port_a, but only if @inner_map is port_a.
+ * Returns the stored value, -EINVAL for any other inner map, or -ENOENT
+ * when there is no entry.
+ */
+static __always_inline int do_inline_array_lookup(void *inner_map, u32 port)
+{
+	int *val;
+
+	if (inner_map == &port_a) {
+		val = bpf_map_lookup_elem(&port_a, &port);
+		if (val)
+			return *val;
+		return -ENOENT;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Look up @port directly in port_h, but only if @inner_map is port_h.
+ * Returns the stored value, -EINVAL for any other inner map, or -ENOENT
+ * when there is no entry.
+ */
+static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port)
+{
+	int *val;
+
+	if (inner_map == &port_h) {
+		val = bpf_map_lookup_elem(&port_h, &port);
+		if (val)
+			return *val;
+		return -ENOENT;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Hook on connect(2).  Only IPv6 connects whose address starts with the
+ * 16-bit words 0xdead, 0xbeef are examined; the last address word selects
+ * the test case (0: array of arrays, 1: hash of arrays, 2: hash of hashes)
+ * and the port is used as the lookup key.
+ *
+ * The value found via the pointer-based lookup is written to reg_result_h
+ * and the value found via the named-map lookup to inline_result_h, both at
+ * key 0.  On failure both receive the helper error or the source line
+ * number of the failing check.  Always returns 0.
+ */
+SEC("ksyscall/connect")
+int BPF_KSYSCALL(trace_sys_connect, unsigned int fd, struct sockaddr_in6 *in6, int addrlen)
+{
+	u16 test_case, port, dst6[8];
+	int ret, inline_ret, ret_key = 0;
+	u32 port_key;
+	void *outer_map, *inner_map;
+
+	if (addrlen != sizeof(*in6))
+		return 0;
+
+	ret = bpf_probe_read_user(dst6, sizeof(dst6), &in6->sin6_addr);
+	if (ret) {
+		inline_ret = ret;
+		goto done;
+	}
+
+	/* Not one of the test's magic destination addresses: ignore. */
+	if (dst6[0] != 0xdead || dst6[1] != 0xbeef)
+		return 0;
+
+	test_case = dst6[7];
+
+	ret = bpf_probe_read_user(&port, sizeof(port), &in6->sin6_port);
+	if (ret) {
+		inline_ret = ret;
+		goto done;
+	}
+
+	port_key = port;
+
+	if (test_case == 0) {
+		outer_map = &a_of_port_a;
+	} else if (test_case == 1) {
+		outer_map = &h_of_port_a;
+	} else if (test_case == 2) {
+		outer_map = &h_of_port_h;
+	} else {
+		ret = __LINE__;
+		inline_ret = ret;
+		goto done;
+	}
+
+	inner_map = bpf_map_lookup_elem(outer_map, &port_key);
+	if (!inner_map) {
+		ret = __LINE__;
+		inline_ret = ret;
+		goto done;
+	}
+
+	ret = do_reg_lookup(inner_map, port_key);
+
+	/* Test cases 0 and 1 use the array inner map, case 2 the hash one. */
+	if (test_case == 0 || test_case == 1)
+		inline_ret = do_inline_array_lookup(inner_map, port_key);
+	else
+		inline_ret = do_inline_hash_lookup(inner_map, port_key);
+
+done:
+	bpf_map_update_elem(&reg_result_h, &ret_key, &ret, BPF_ANY);
+	bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c
new file mode 100644
index 000000000000..55dca43f3723
--- /dev/null
+++ b/samples/bpf/test_map_in_map_user.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2017 Facebook
+ */
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_util.h"
+
+/* File descriptors of the BPF maps; filled in by main() by map name. */
+static int map_fd[7];
+
+/* Named accessors for the map_fd[] slots. */
+#define PORT_A (map_fd[0])
+#define PORT_H (map_fd[1])
+#define REG_RESULT_H (map_fd[2])
+#define INLINE_RESULT_H (map_fd[3])
+#define A_OF_PORT_A (map_fd[4]) /* Test case #0 */
+#define H_OF_PORT_A (map_fd[5]) /* Test case #1 */
+#define H_OF_PORT_H (map_fd[6]) /* Test case #2 */
+
+/* Printed per test; the array index is also the test case number. */
+static const char * const test_names[] = {
+ "Array of Array",
+ "Hash of Array",
+ "Hash of Hash",
+};
+
+#define NR_TESTS ARRAY_SIZE(test_names)
+
+/*
+ * Assert that the value stored under @key in @map_in_map_fd equals the
+ * map id reported for @inner_map_fd.
+ */
+static void check_map_id(int inner_map_fd, int map_in_map_fd, uint32_t key)
+{
+	struct bpf_map_info inner_info = {};
+	uint32_t len = sizeof(inner_info);
+	int stored_id;
+	int err;
+
+	err = bpf_map_get_info_by_fd(inner_map_fd, &inner_info, &len);
+	assert(!err);
+
+	err = bpf_map_lookup_elem(map_in_map_fd, &key, &stored_id);
+	assert(!err);
+	assert(stored_id == inner_info.id);
+}
+
+/*
+ * Store @magic_result under @port_key in both inner maps, then insert the
+ * inner maps into the three outer maps under the same key and verify the
+ * map id each outer map reports back.
+ */
+static void populate_map(uint32_t port_key, int magic_result)
+{
+	const struct {
+		int outer_fd;
+		int *inner_fd;
+		uint64_t flags;
+	} outer[] = {
+		{ A_OF_PORT_A, &PORT_A, BPF_ANY },
+		{ H_OF_PORT_A, &PORT_A, BPF_NOEXIST },
+		{ H_OF_PORT_H, &PORT_H, BPF_NOEXIST },
+	};
+	unsigned int n;
+	int err;
+
+	err = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY);
+	assert(!err);
+
+	err = bpf_map_update_elem(PORT_H, &port_key, &magic_result,
+				  BPF_NOEXIST);
+	assert(!err);
+
+	for (n = 0; n < sizeof(outer) / sizeof(outer[0]); n++) {
+		err = bpf_map_update_elem(outer[n].outer_fd, &port_key,
+					  outer[n].inner_fd, outer[n].flags);
+		assert(!err);
+		check_map_id(*outer[n].inner_fd, outer[n].outer_fd, port_key);
+	}
+}
+
+/*
+ * Populate the maps with a random port key, then for every test case
+ * issue a connect() the BPF program recognises and check that both the
+ * regular and the inline lookup returned the magic value. Exits with
+ * status 1 on the first mismatch.
+ */
+static void test_map_in_map(void)
+{
+	struct sockaddr_in6 dst = { .sin6_family = AF_INET6 };
+	int expected = 0xfaceb00c;
+	uint32_t zero_key = 0;
+	uint32_t port;
+	int reg_val, inline_val;
+	int err;
+	int t;
+
+	port = rand() & 0x00FF;
+	populate_map(port, expected);
+
+	dst.sin6_addr.s6_addr16[0] = 0xdead;
+	dst.sin6_addr.s6_addr16[1] = 0xbeef;
+	dst.sin6_port = port;
+
+	for (t = 0; t < NR_TESTS; t++) {
+		printf("%s: ", test_names[t]);
+
+		/* the last address word carries the test case number */
+		dst.sin6_addr.s6_addr16[7] = t;
+		err = connect(-1, (struct sockaddr *)&dst, sizeof(dst));
+		assert(err == -1 && errno == EBADF);
+
+		err = bpf_map_lookup_elem(REG_RESULT_H, &zero_key, &reg_val);
+		assert(!err);
+
+		err = bpf_map_lookup_elem(INLINE_RESULT_H, &zero_key,
+					  &inline_val);
+		assert(!err);
+
+		if (reg_val == expected && inline_val == expected) {
+			bpf_map_delete_elem(REG_RESULT_H, &zero_key);
+			bpf_map_delete_elem(INLINE_RESULT_H, &zero_key);
+
+			printf("Pass\n");
+			continue;
+		}
+
+		printf("Error. result:%d inline_result:%d\n",
+		       reg_val, inline_val);
+		exit(1);
+	}
+}
+
+/*
+ * Open and load "<argv[0]>.bpf.o", resolve the seven maps by name, attach
+ * trace_sys_connect and run the map-in-map test. Returns 0 on every path.
+ */
+int main(int argc, char **argv)
+{
+	/* same order as the map_fd[] slots */
+	static const char * const map_names[] = {
+		"port_a",
+		"port_h",
+		"reg_result_h",
+		"inline_result_h",
+		"a_of_port_a",
+		"h_of_port_a",
+		"h_of_port_h",
+	};
+	struct bpf_program *prog;
+	struct bpf_link *link = NULL;
+	struct bpf_object *obj;
+	char filename[256];
+	int missing = 0;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	prog = bpf_object__find_program_by_name(obj, "trace_sys_connect");
+	if (prog == NULL) {
+		printf("finding a prog in obj file failed\n");
+		goto cleanup;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj) != 0) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	/* resolve every map first, then report a single failure */
+	for (i = 0; i < (int)(sizeof(map_names) / sizeof(map_names[0])); i++) {
+		map_fd[i] = bpf_object__find_map_fd_by_name(obj, map_names[i]);
+		if (map_fd[i] < 0)
+			missing = 1;
+	}
+	if (missing) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	link = bpf_program__attach(prog);
+	if (libbpf_get_error(link)) {
+		fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+		link = NULL;
+		goto cleanup;
+	}
+
+	test_map_in_map();
+
+cleanup:
+	bpf_link__destroy(link);
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
new file mode 100644
index 000000000000..0bba5fcd7d24
--- /dev/null
+++ b/samples/bpf/trace_event_kern.c
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <uapi/linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/* Key of the counts map: task comm plus kernel and user stack ids. */
+struct key_t {
+ char comm[TASK_COMM_LEN];
+ u32 kernstack;
+ u32 userstack;
+};
+
+/* Number of samples seen per (comm, kernel stack, user stack). */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct key_t);
+ __type(value, u64);
+ __uint(max_entries, 10000);
+} counts SEC(".maps");
+
+/* Stack traces, each holding up to PERF_MAX_STACK_DEPTH u64 entries. */
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
+ __uint(max_entries, 10000);
+} stackmap SEC(".maps");
+
+/* Flags passed to bpf_get_stackid() for the kernel and the user stack. */
+#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
+#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)
+
+/*
+ * perf_event program: for each sample with a period of at least 10000,
+ * record the current comm and the kernel/user stack ids and bump the
+ * matching entry in counts. Also prints the perf event's enabled/running
+ * values and, when non-zero, the sampled address through
+ * bpf_trace_printk(). Always returns 0.
+ */
+SEC("perf_event")
+int bpf_prog1(struct bpf_perf_event_data *ctx)
+{
+ char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
+ char time_fmt2[] = "Get Time Failed, ErrCode: %d";
+ char addr_fmt[] = "Address recorded on event: %llx";
+ char fmt[] = "CPU-%d period %lld ip %llx";
+ u32 cpu = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value value_buf;
+ struct key_t key;
+ u64 *val, one = 1;
+ int ret;
+
+ if (ctx->sample_period < 10000)
+ /* ignore warmup */
+ return 0;
+ bpf_get_current_comm(&key.comm, sizeof(key.comm));
+ key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
+ key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS);
+ /* both stack ids negative: nothing to count, just log the sample */
+ if ((int)key.kernstack < 0 && (int)key.userstack < 0) {
+ bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period,
+ PT_REGS_IP(&ctx->regs));
+ return 0;
+ }
+
+ ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
+ if (!ret)
+ bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
+ else
+ bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
+
+ if (ctx->addr != 0)
+ bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr);
+
+ /* increment an existing counter or create it with a count of one */
+ val = bpf_map_lookup_elem(&counts, &key);
+ if (val)
+ (*val)++;
+ else
+ bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
new file mode 100644
index 000000000000..9664749bf618
--- /dev/null
+++ b/samples/bpf/trace_event_user.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Facebook
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/resource.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define SAMPLE_FREQ 50
+
+/* Result of fork() in main(); 0 until then. Passed to kill() by err_exit(). */
+static int pid;
+/* counts, stackmap */
+static int map_fd[2];
+/* Program "bpf_prog1" looked up in main(); attached by the test helpers. */
+struct bpf_program *prog;
+/* Set by print_ksym(), reset and checked by print_stacks(). */
+static bool sys_read_seen, sys_write_seen;
+
+/*
+ * Print the kernel symbol found for @addr followed by ';' and record
+ * whether a symbol containing "sys_read" or "sys_write" was seen.
+ * A zero @addr is ignored; an address with no symbol prints a hint about
+ * kallsyms instead.
+ */
+static void print_ksym(__u64 addr)
+{
+	struct ksym *sym;
+
+	if (!addr)
+		return;
+	sym = ksym_search(addr);
+	if (!sym) {
+		printf("ksym not found. Is kallsyms loaded?\n");
+		return;
+	}
+
+	printf("%s;", sym->name);
+	/*
+	 * strstr() returns non-NULL on a match, so the flags must be set
+	 * when it succeeds; negating it marked every non-matching symbol
+	 * as "seen" and made the check in print_stacks() pass trivially.
+	 */
+	if (strstr(sym->name, "sys_read"))
+		sys_read_seen = true;
+	else if (strstr(sym->name, "sys_write"))
+		sys_write_seen = true;
+}
+
+/* Print @addr in hex followed by ';'; a zero address prints nothing. */
+static void print_addr(__u64 addr)
+{
+	if (addr)
+		printf("%llx;", addr);
+}
+
+#define TASK_COMM_LEN 16
+
+/* Key read back from the counts map; kernstack/userstack index the stack map. */
+struct key_t {
+ char comm[TASK_COMM_LEN];
+ __u32 kernstack;
+ __u32 userstack;
+};
+
+/*
+ * Print one counts entry as "<count> <comm>;<kernel syms>-;<user addrs>".
+ * A stack that cannot be read from the stack map prints as "---;".
+ * Entries with a count below 6 end in '\r', the others in '\n'.
+ * Warns once about stack map collisions and reports entries where both
+ * stack ids are negative.
+ */
+static void print_stack(struct key_t *key, __u64 count)
+{
+	__u64 frames[PERF_MAX_STACK_DEPTH] = {};
+	static bool warned;
+	int depth;
+
+	printf("%3lld %s;", count, key->comm);
+
+	/* kernel stack, walked from the last slot down to slot 0 */
+	if (bpf_map_lookup_elem(map_fd[1], &key->kernstack, frames) == 0) {
+		depth = PERF_MAX_STACK_DEPTH;
+		while (depth-- > 0)
+			print_ksym(frames[depth]);
+	} else {
+		printf("---;");
+	}
+
+	printf("-;");
+
+	/* user stack, same order, printed as raw addresses */
+	if (bpf_map_lookup_elem(map_fd[1], &key->userstack, frames) == 0) {
+		depth = PERF_MAX_STACK_DEPTH;
+		while (depth-- > 0)
+			print_addr(frames[depth]);
+	} else {
+		printf("---;");
+	}
+
+	putchar(count < 6 ? '\r' : '\n');
+
+	if (!warned && key->kernstack == -EEXIST) {
+		printf("stackmap collisions seen. Consider increasing size\n");
+		warned = true;
+	} else if ((int)key->kernstack < 0 && (int)key->userstack < 0) {
+		printf("err stackid %d %d\n", key->kernstack, key->userstack);
+	}
+}
+
+/*
+ * Kill the child forked by main(), if there is one, and exit with @err.
+ * main() also installs this as the SIGINT/SIGTERM handler.
+ */
+static void err_exit(int err)
+{
+	/*
+	 * pid is 0 before fork() (and in the child) and -1 when fork()
+	 * failed. kill(0, ...) signals the whole process group and
+	 * kill(-1, ...) every process we may signal, so only a real child
+	 * pid is ever passed on.
+	 */
+	if (pid > 0)
+		kill(pid, SIGKILL);
+	exit(err);
+}
+
+/*
+ * Print and delete every entry of the counts map, verify that both a
+ * read and a write syscall symbol showed up in the kernel stacks (exits
+ * through err_exit() otherwise), then empty the stack map.
+ */
+static void print_stacks(void)
+{
+	struct key_t key = {}, next_key;
+	__u64 value;
+	__u32 stackid = 0, next_id;
+	int error = 1, fd = map_fd[0], stack_map = map_fd[1];
+
+	sys_read_seen = sys_write_seen = false;
+	while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+		/* print only when the count was read; value is unset otherwise */
+		if (bpf_map_lookup_elem(fd, &next_key, &value) == 0)
+			print_stack(&next_key, value);
+		bpf_map_delete_elem(fd, &next_key);
+		key = next_key;
+	}
+	printf("\n");
+	if (!sys_read_seen || !sys_write_seen) {
+		printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n");
+		err_exit(error);
+	}
+
+	/* clear stack map */
+	while (bpf_map_get_next_key(stack_map, &stackid, &next_id) == 0) {
+		bpf_map_delete_elem(stack_map, &next_id);
+		stackid = next_id;
+	}
+}
+
+/*
+ * Run a dd from /dev/zero to /dev/null to produce read/write syscalls.
+ * Returns 0 unless system() itself reports a negative result, in which
+ * case a message is printed and -1 is returned.
+ */
+static inline int generate_load(void)
+{
+	int status = system("dd if=/dev/zero of=/dev/null count=5000k status=none");
+
+	if (status >= 0)
+		return 0;
+
+	printf("failed to generate some load with dd: %s\n", strerror(errno));
+	return -1;
+}
+
+/*
+ * Open the perf event described by @attr on every online CPU, attach the
+ * BPF program to each, generate load and print the collected stacks.
+ * All links created so far are destroyed on the way out; any failure
+ * ends the process through err_exit().
+ */
+static void test_perf_event_all_cpu(struct perf_event_attr *attr)
+{
+	int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	struct bpf_link **links;
+	int error = 1;
+	int pmu_fd;
+	int i;
+
+	links = calloc(nr_cpus, sizeof(struct bpf_link *));
+	if (links == NULL) {
+		printf("malloc of links failed\n");
+		goto err;
+	}
+
+	/* system wide perf event, no need to inherit */
+	attr->inherit = 0;
+
+	/* open perf_event on all cpus */
+	for (i = 0; i < nr_cpus; i++) {
+		pmu_fd = sys_perf_event_open(attr, -1, i, -1, 0);
+		if (pmu_fd < 0) {
+			printf("sys_perf_event_open failed\n");
+			goto all_cpu_err;
+		}
+
+		links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
+		if (!libbpf_get_error(links[i]))
+			continue;
+
+		printf("bpf_program__attach_perf_event failed\n");
+		links[i] = NULL;
+		close(pmu_fd);
+		goto all_cpu_err;
+	}
+
+	if (generate_load() < 0)
+		goto all_cpu_err;
+
+	print_stacks();
+	error = 0;
+all_cpu_err:
+	/* i is one past the last CPU that got a link */
+	while (--i >= 0)
+		bpf_link__destroy(links[i]);
+err:
+	free(links);
+	if (error)
+		err_exit(error);
+}
+
+/*
+ * Open the perf event described by @attr bound to the current task,
+ * attach the BPF program, generate load and print the collected stacks.
+ * Any failure ends the process through err_exit().
+ */
+static void test_perf_event_task(struct perf_event_attr *attr)
+{
+	struct bpf_link *link = NULL;
+	int error = 1;
+	int pmu_fd;
+
+	/* Per-task event: inherit is enabled so the "dd ..." child is traced
+	 * too. With inherit set, the time values read on the BPF side are
+	 * expected to fail.
+	 */
+	attr->inherit = 1;
+
+	/* open task bound event */
+	pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
+	if (pmu_fd < 0) {
+		printf("sys_perf_event_open failed\n");
+		goto err;
+	}
+
+	link = bpf_program__attach_perf_event(prog, pmu_fd);
+	if (libbpf_get_error(link)) {
+		printf("bpf_program__attach_perf_event failed\n");
+		link = NULL;
+		close(pmu_fd);
+		goto err;
+	}
+
+	if (generate_load() >= 0) {
+		print_stacks();
+		error = 0;
+	}
+err:
+	bpf_link__destroy(link);
+	if (error)
+		err_exit(error);
+}
+
+/*
+ * Run the all-CPU and the per-task test for each perf event
+ * configuration in turn, printing a banner before each pair and
+ * "*** PASS ***" after the last one.
+ */
+static void test_bpf_perf_event(void)
+{
+	struct {
+		const char *banner;
+		struct perf_event_attr attr;
+	} cases[] = {
+		{
+			.banner = "Test HW_CPU_CYCLES\n",
+			.attr = {
+				.sample_freq = SAMPLE_FREQ,
+				.freq = 1,
+				.type = PERF_TYPE_HARDWARE,
+				.config = PERF_COUNT_HW_CPU_CYCLES,
+			},
+		},
+		{
+			.banner = "Test SW_CPU_CLOCK\n",
+			.attr = {
+				.sample_freq = SAMPLE_FREQ,
+				.freq = 1,
+				.type = PERF_TYPE_SOFTWARE,
+				.config = PERF_COUNT_SW_CPU_CLOCK,
+			},
+		},
+		{
+			.banner = "Test HW_CACHE_L1D\n",
+			.attr = {
+				.sample_freq = SAMPLE_FREQ,
+				.freq = 1,
+				.type = PERF_TYPE_HW_CACHE,
+				.config =
+					PERF_COUNT_HW_CACHE_L1D |
+					(PERF_COUNT_HW_CACHE_OP_READ << 8) |
+					(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
+			},
+		},
+		{
+			.banner = "Test HW_CACHE_BPU\n",
+			.attr = {
+				.sample_freq = SAMPLE_FREQ,
+				.freq = 1,
+				.type = PERF_TYPE_HW_CACHE,
+				.config =
+					PERF_COUNT_HW_CACHE_BPU |
+					(PERF_COUNT_HW_CACHE_OP_READ << 8) |
+					(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
+			},
+		},
+		{
+			.banner = "Test Instruction Retired\n",
+			.attr = {
+				.sample_freq = SAMPLE_FREQ,
+				.freq = 1,
+				.type = PERF_TYPE_RAW,
+				/* Intel Instruction Retired */
+				.config = 0xc0,
+			},
+		},
+		{
+			.banner = "Test Lock Load\n",
+			.attr = {
+				.sample_freq = SAMPLE_FREQ,
+				.freq = 1,
+				.type = PERF_TYPE_RAW,
+				/* Intel MEM_UOPS_RETIRED.LOCK_LOADS */
+				.config = 0x21d0,
+				/* Request to record lock address from PEBS */
+				.sample_type = PERF_SAMPLE_ADDR,
+				/* Record address value requires precise event */
+				.precise_ip = 2,
+			},
+		},
+	};
+	unsigned int n;
+
+	for (n = 0; n < sizeof(cases) / sizeof(cases[0]); n++) {
+		printf("%s", cases[n].banner);
+		test_perf_event_all_cpu(&cases[n].attr);
+		test_perf_event_task(&cases[n].attr);
+	}
+
+	printf("*** PASS ***\n");
+}
+
+
+/*
+ * Load "<argv[0]>_kern.o", resolve the counts and stackmap maps, fork a
+ * child that reads the trace pipe and run the perf event tests in the
+ * parent. Every path ends in err_exit(), which sets the exit status.
+ */
+int main(int argc, char **argv)
+{
+	struct bpf_object *obj = NULL;
+	char filename[256];
+	int error = 1;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, err_exit);
+	signal(SIGTERM, err_exit);
+
+	if (load_kallsyms() != 0) {
+		printf("failed to process /proc/kallsyms\n");
+		goto cleanup;
+	}
+
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		printf("opening BPF object file failed\n");
+		obj = NULL;
+		goto cleanup;
+	}
+
+	prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+	if (prog == NULL) {
+		printf("finding a prog in obj file failed\n");
+		goto cleanup;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj) != 0) {
+		printf("loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts");
+	map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap");
+	if (map_fd[0] < 0 || map_fd[1] < 0) {
+		printf("finding a counts/stackmap map in obj file failed\n");
+		goto cleanup;
+	}
+
+	pid = fork();
+	switch (pid) {
+	case 0:
+		/* child: dump the trace pipe */
+		read_trace_pipe();
+		return 0;
+	case -1:
+		printf("couldn't spawn process\n");
+		goto cleanup;
+	default:
+		break;
+	}
+
+	test_bpf_perf_event();
+	error = 0;
+
+cleanup:
+	bpf_object__close(obj);
+	err_exit(error);
+}
diff --git a/samples/bpf/trace_output.bpf.c b/samples/bpf/trace_output.bpf.c
new file mode 100644
index 000000000000..565a73b51b04
--- /dev/null
+++ b/samples/bpf/trace_output.bpf.c
@@ -0,0 +1,29 @@
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+
+/* Perf event array that bpf_prog1 writes its samples into. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 2);
+} my_map SEC(".maps");
+
+/*
+ * Probe on the write() syscall: emit a record holding the current
+ * pid/tgid value and a fixed cookie through my_map. Always returns 0.
+ */
+SEC("ksyscall/write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ struct S {
+ u64 pid;
+ u64 cookie;
+ } data;
+
+ data.pid = bpf_get_current_pid_tgid();
+ /* constant the user-space reader checks each record against */
+ data.cookie = 0x12345678;
+
+ /* the third argument (flags) is passed as 0 */
+ bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
new file mode 100644
index 000000000000..d316fd2c8e24
--- /dev/null
+++ b/samples/bpf/trace_output_user.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <stdio.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <time.h>
+#include <signal.h>
+#include <bpf/libbpf.h>
+
+/* Return the current CLOCK_MONOTONIC time in nanoseconds. */
+static __u64 time_get_ns(void)
+{
+	struct timespec now;
+	__u64 ns;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	ns = now.tv_sec * 1000000000ull;
+	ns += now.tv_nsec;
+	return ns;
+}
+
+/* Timestamp taken in main() right before polling starts. */
+static __u64 start_time;
+/* Number of valid records received so far. */
+static __u64 cnt;
+
+/* Number of records after which the rate is printed and polling stops. */
+#define MAX_CNT 100000ll
+
+/*
+ * Perf buffer sample callback: validate the record's cookie, count it,
+ * and print the receive rate once exactly MAX_CNT records have arrived.
+ * Records with a wrong cookie are reported and not counted.
+ */
+static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size)
+{
+	struct {
+		__u64 pid;
+		__u64 cookie;
+	} *rec = data;
+
+	if (rec->cookie != 0x12345678) {
+		printf("BUG pid %llx cookie %llx sized %d\n",
+		       rec->pid, rec->cookie, size);
+		return;
+	}
+
+	if (++cnt != MAX_CNT)
+		return;
+
+	printf("recv %lld events per sec\n",
+	       MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+}
+
+/*
+ * Load "<argv[0]>.bpf.o", attach bpf_prog1, start a dd pinned with
+ * "taskset 1" to generate write() calls, and poll the perf buffer until
+ * MAX_CNT records were received or polling fails. The process group is
+ * then sent SIGINT.
+ *
+ * Returns 0 when the object cannot be opened, 1 when the perf buffer
+ * cannot be set up, otherwise the value left in ret.
+ */
+int main(int argc, char **argv)
+{
+	struct bpf_link *link = NULL;
+	struct perf_buffer *pb = NULL;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int map_fd, ret = 0;
+	char filename[256];
+	FILE *f;
+
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+	if (map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+	if (libbpf_get_error(prog)) {
+		fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+		goto cleanup;
+	}
+
+	link = bpf_program__attach(prog);
+	if (libbpf_get_error(link)) {
+		fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+		link = NULL;
+		goto cleanup;
+	}
+
+	pb = perf_buffer__new(map_fd, 8, print_bpf_output, NULL, NULL, NULL);
+	ret = libbpf_get_error(pb);
+	if (ret) {
+		printf("failed to setup perf_buffer: %d\n", ret);
+		/* go through cleanup so the link and object are released */
+		pb = NULL;
+		ret = 1;
+		goto cleanup;
+	}
+
+	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+	(void) f;
+
+	start_time = time_get_ns();
+	while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) {
+	}
+	/* pid 0 addresses the caller's whole process group */
+	kill(0, SIGINT);
+
+cleanup:
+	perf_buffer__free(pb);
+	bpf_link__destroy(link);
+	bpf_object__close(obj);
+	return ret;
+}
diff --git a/samples/bpf/tracex1.bpf.c b/samples/bpf/tracex1.bpf.c
new file mode 100644
index 000000000000..ceedf0b1d479
--- /dev/null
+++ b/samples/bpf/tracex1.bpf.c
@@ -0,0 +1,47 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include "net_shared.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of arguments and their positions can change, etc.
+ * In such case this bpf+kprobe example will no longer be meaningful
+ */
+/*
+ * Reads the sk_buff pointer stored at the address given by the probed
+ * function's first argument, then the skb's device name and length, and
+ * prints the skb when the name starts with "lo". Always returns 0.
+ */
+SEC("kprobe.multi/__netif_receive_skb_core*")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ /* attaches to kprobe __netif_receive_skb_core,
+ * looks for packets on loopback device and prints them
+ * (wildcard is used for avoiding symbol mismatch due to optimization)
+ */
+ char devname[IFNAMSIZ];
+ struct net_device *dev;
+ struct sk_buff *skb;
+ int len;
+
+ /* the first argument is used as the address of the skb pointer */
+ bpf_core_read(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx));
+ dev = BPF_CORE_READ(skb, dev);
+ len = BPF_CORE_READ(skb, len);
+
+ BPF_CORE_READ_STR_INTO(&devname, dev, name);
+
+ /* only the first two characters are compared */
+ if (devname[0] == 'l' && devname[1] == 'o') {
+ char fmt[] = "skb %p len %d\n";
+ /* using bpf_trace_printk() for DEBUG ONLY */
+ bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+ }
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..8c3d9043a2b6
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <unistd.h>
+#include <bpf/libbpf.h>
+#include "trace_helpers.h"
+
+/*
+ * Load "<argv[0]>.bpf.o", attach bpf_prog1, start a ping to localhost
+ * and dump the trace pipe. Returns 0 on every path.
+ */
+int main(int ac, char **argv)
+{
+	struct bpf_program *prog;
+	struct bpf_link *link = NULL;
+	struct bpf_object *obj;
+	char filename[256];
+	FILE *ping;
+
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+	if (prog == NULL) {
+		fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
+		goto cleanup;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj) != 0) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	link = bpf_program__attach(prog);
+	if (libbpf_get_error(link)) {
+		fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+		link = NULL;
+		goto cleanup;
+	}
+
+	/* traffic generator; its stream is intentionally left unused */
+	ping = popen("taskset 1 ping -c5 localhost", "r");
+	(void) ping;
+
+	read_trace_pipe();
+
+cleanup:
+	bpf_link__destroy(link);
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/tracex3.bpf.c b/samples/bpf/tracex3.bpf.c
new file mode 100644
index 000000000000..41f37966f5f5
--- /dev/null
+++ b/samples/bpf/tracex3.bpf.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/* Identifies a block request by device and sector; built by both probes. */
+struct start_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+};
+
+/*
+ * Start timestamp of each in-flight request, keyed by device and sector.
+ * The key type has to be struct start_key: both probes pass a pointer to
+ * that struct, and with a narrower key type the sector field would not
+ * take part in the lookup.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct start_key);
+	__type(value, u64);
+	__uint(max_entries, 4096);
+} my_map SEC(".maps");
+
+/* from /sys/kernel/tracing/events/block/block_io_start/format */
+/* Record the current time for the request's device/sector in my_map. */
+SEC("tracepoint/block/block_io_start")
+int bpf_prog1(struct trace_event_raw_block_rq *ctx)
+{
+ u64 val = bpf_ktime_get_ns();
+ struct start_key key = {
+ .dev = ctx->dev,
+ .sector = ctx->sector
+ };
+
+ bpf_map_update_elem(&my_map, &key, &val, BPF_ANY);
+ return 0;
+}
+
+/*
+ * Integer base-2 logarithm: index of the highest set bit of @n.
+ * For n == 0 the result is -1 converted to unsigned int.
+ */
+static unsigned int log2l(unsigned long long n)
+{
+	int bits = n ? 0 : -1;
+
+	/* halve the search window at each step: 32, 16, 8, 4, 2, 1 */
+	if (n >= (1ull << 32)) {
+		bits += 32;
+		n >>= 32;
+	}
+	if (n >= (1ull << 16)) {
+		bits += 16;
+		n >>= 16;
+	}
+	if (n >= (1ull << 8)) {
+		bits += 8;
+		n >>= 8;
+	}
+	if (n >= (1ull << 4)) {
+		bits += 4;
+		n >>= 4;
+	}
+	if (n >= (1ull << 2)) {
+		bits += 2;
+		n >>= 2;
+	}
+	if (n >= (1ull << 1)) {
+		bits += 1;
+		n >>= 1;
+	}
+	return bits;
+}
+
+/* Number of histogram buckets. */
+#define SLOTS 100
+
+/* Per-CPU latency histogram; bpf_prog2 increments one bucket per request. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u64));
+ __uint(max_entries, SLOTS);
+} lat_map SEC(".maps");
+
+/* from /sys/kernel/tracing/events/block/block_io_done/format */
+/*
+ * Look up the start time recorded for the request's device/sector,
+ * compute the elapsed time, drop the entry and increment the matching
+ * bucket of lat_map. Requests without a recorded start are ignored.
+ */
+SEC("tracepoint/block/block_io_done")
+int bpf_prog2(struct trace_event_raw_block_rq *ctx)
+{
+ struct start_key key = {
+ .dev = ctx->dev,
+ .sector = ctx->sector
+ };
+
+ u64 *value, l, base;
+ u32 index;
+
+ value = bpf_map_lookup_elem(&my_map, &key);
+ if (!value)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = cur_time - *value;
+
+ bpf_map_delete_elem(&my_map, &key);
+
+ /* the lines below are computing index = log10(delta)*10
+ * using integer arithmetic
+ * index = 29 ~ 1 usec
+ * index = 59 ~ 1 msec
+ * index = 89 ~ 1 sec
+ * index = 99 ~ 10sec or more
+ * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+ */
+ l = log2l(delta);
+ base = 1ll << l;
+ /* the (delta - base) term interpolates within the power-of-two bucket */
+ index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
+
+ /* everything beyond the last bucket is counted in the last bucket */
+ if (index >= SLOTS)
+ index = SLOTS - 1;
+
+ value = bpf_map_lookup_elem(&lat_map, &index);
+ if (value)
+ *value += 1;
+
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..1002eb0323b4
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "bpf_util.h"
+
+#define SLOTS 100
+
+/* Reset all SLOTS entries of the per-CPU map @fd to zero on every CPU. */
+static void clear_stats(int fd)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	__u64 zeros[nr_cpus];
+	__u32 slot = 0;
+
+	memset(zeros, 0, sizeof(zeros));
+	while (slot < SLOTS) {
+		bpf_map_update_elem(fd, &slot, zeros, BPF_ANY);
+		slot++;
+	}
+}
+
+/* Terminal escape sequences, one per intensity level; indexed in print_hist(). */
+const char *color[] = {
+ "\033[48;5;255m",
+ "\033[48;5;252m",
+ "\033[48;5;250m",
+ "\033[48;5;248m",
+ "\033[48;5;246m",
+ "\033[48;5;244m",
+ "\033[48;5;242m",
+ "\033[48;5;240m",
+ "\033[48;5;238m",
+ "\033[48;5;236m",
+ "\033[48;5;234m",
+ "\033[48;5;232m",
+};
+/* Number of intensity levels; sym[] is indexed with the same range. */
+const int num_colors = ARRAY_SIZE(color);
+
+/* Escape sequence printed after each coloured cell. */
+const char nocolor[] = "\033[00m";
+
+/* Text-only replacements for the colour levels (used with -t). */
+const char *sym[] = {
+ " ",
+ " ",
+ ".",
+ ".",
+ "*",
+ "*",
+ "o",
+ "o",
+ "O",
+ "O",
+ "#",
+ "#",
+};
+
+/* Set by -a: start the histogram at slot 0 instead of slot 29. */
+bool full_range = false;
+/* Set by -t: print symbols instead of colours. */
+bool text_only = false;
+
+/* Print the latency scale line matching the selected range. */
+static void print_banner(void)
+{
+	const char *banner;
+
+	banner = full_range ?
+		"|1ns |10ns |100ns |1us |10us |100us"
+		" |1ms |10ms |100ms |1s |10s\n" :
+		"|1us |10us |100us |1ms |10ms "
+		"|100ms |1s |10s\n";
+
+	printf("%s", banner);
+}
+
+/*
+ * Sum the per-CPU counters of every slot of map @fd, reset the map, and
+ * print one histogram row scaled to the busiest slot, followed by the
+ * total number of events.
+ */
+static void print_hist(int fd)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	__u64 total_events = 0;
+	/* one 64-bit value per CPU, same element type as in clear_stats() */
+	__u64 values[nr_cpus];
+	__u64 max_cnt = 0;
+	__u64 cnt[SLOTS];
+	__u64 value;
+	__u32 key;
+	unsigned int i;
+
+	for (key = 0; key < SLOTS; key++) {
+		/* count a slot that cannot be read as empty */
+		if (bpf_map_lookup_elem(fd, &key, values))
+			memset(values, 0, sizeof(values));
+		value = 0;
+		for (i = 0; i < nr_cpus; i++)
+			value += values[i];
+		cnt[key] = value;
+		total_events += value;
+		if (value > max_cnt)
+			max_cnt = value;
+	}
+	clear_stats(fd);
+	for (key = full_range ? 0 : 29; key < SLOTS; key++) {
+		/* max_cnt + 1 keeps c below num_colors */
+		int c = num_colors * cnt[key] / (max_cnt + 1);
+
+		if (text_only)
+			printf("%s", sym[c]);
+		else
+			printf("%s %s", color[c], nocolor);
+	}
+	printf(" # %llu\n", (unsigned long long)total_events);
+}
+
+/*
+ * Parse -a / -t / -h, load "<argv[0]>.bpf.o", attach every program in
+ * it, print the legend and then print one histogram row every two
+ * seconds with the banner repeated every 20 rows. Returns 1 for -h and
+ * 0 on every other path.
+ */
+int main(int ac, char **argv)
+{
+	struct bpf_link *links[2];
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	char filename[256];
+	int map_fd, i, j = 0;
+
+	for (i = 1; i < ac; i++) {
+		const char *arg = argv[i];
+
+		if (!strcmp(arg, "-a")) {
+			full_range = true;
+			continue;
+		}
+		if (!strcmp(arg, "-t")) {
+			text_only = true;
+			continue;
+		}
+		if (!strcmp(arg, "-h")) {
+			printf("Usage:\n"
+			       " -a display wider latency range\n"
+			       " -t text only\n");
+			return 1;
+		}
+	}
+
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj) != 0) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map");
+	if (map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		links[j] = bpf_program__attach(prog);
+		if (libbpf_get_error(links[j])) {
+			fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+			links[j] = NULL;
+			goto cleanup;
+		}
+		j++;
+	}
+
+	/* legend: the highest and the lowest intensity level */
+	printf(" heatmap of IO latency\n");
+	if (!text_only)
+		printf(" %s %s", color[num_colors - 1], nocolor);
+	else
+		printf(" %s", sym[num_colors - 1]);
+	printf(" - many events with this latency\n");
+
+	if (!text_only)
+		printf(" %s %s", color[0], nocolor);
+	else
+		printf(" %s", sym[0]);
+	printf(" - few events\n");
+
+	i = 0;
+	while (1) {
+		if (i % 20 == 0)
+			print_banner();
+		print_hist(map_fd);
+		sleep(2);
+		i++;
+	}
+
+cleanup:
+	/* j is one past the last successfully attached program */
+	while (--j >= 0)
+		bpf_link__destroy(links[j]);
+
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/tracex4.bpf.c b/samples/bpf/tracex4.bpf.c
new file mode 100644
index 000000000000..d786492fd926
--- /dev/null
+++ b/samples/bpf/tracex4.bpf.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/* Value stored per object: a timestamp (val) and an instruction pointer (ip). */
+struct pair {
+ u64 val;
+ u64 ip;
+};
+
+/* Objects seen by bpf_prog2 and not yet removed by bpf_prog1, keyed by pointer. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, long);
+ __type(value, struct pair);
+ __uint(max_entries, 1000000);
+} my_map SEC(".maps");
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+/* On kmem_cache_free(), remove the entry keyed by the second argument. */
+SEC("kprobe/kmem_cache_free")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ long ptr = PT_REGS_PARM2(ctx);
+
+ bpf_map_delete_elem(&my_map, &ptr);
+ return 0;
+}
+
+/*
+ * On return from kmem_cache_alloc_node_noprof(), store the current time
+ * and the caller's ip in my_map, keyed by the returned pointer.
+ */
+SEC("kretprobe/kmem_cache_alloc_node_noprof")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long ptr = PT_REGS_RC(ctx);
+ long ip = 0;
+
+ /* get ip address of kmem_cache_alloc_node_noprof() caller */
+ BPF_KRETPROBE_READ_RET_IP(ip, ctx);
+
+ struct pair v = {
+ .val = bpf_ktime_get_ns(),
+ .ip = ip,
+ };
+
+ bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..a5145ad72cbf
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+/* Map value as read in print_old_objects(): a timestamp (val) and an ip. */
+struct pair {
+ long long val;
+ __u64 ip;
+};
+
+/* Return the current CLOCK_MONOTONIC time in nanoseconds. */
+static __u64 time_get_ns(void)
+{
+	struct timespec now;
+	__u64 ns;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	ns = now.tv_sec * 1000000000ull;
+	ns += now.tv_nsec;
+	return ns;
+}
+
+/*
+ * Clear the screen and list every entry of map @fd whose timestamp is at
+ * least one second older than the current time.
+ */
+static void print_old_objects(int fd)
+{
+	static const char clear_screen[] = "\e[1;1H\e[2J";
+	long long val = time_get_ns();
+	__u64 key, next_key;
+	struct pair v;
+
+	/* sizeof - 1 so the terminating NUL is not written to the terminal;
+	 * the result is stored in key, presumably only to avoid an
+	 * unused-result warning.
+	 */
+	key = write(1, clear_screen, sizeof(clear_screen) - 1); /* clear screen */
+
+	key = -1;
+	while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
+		key = next_key;
+		/* the BPF side deletes entries concurrently; skip a key that
+		 * is gone instead of reading an unset value
+		 */
+		if (bpf_map_lookup_elem(fd, &next_key, &v))
+			continue;
+		if (val - v.val < 1000000000ll)
+			/* object was allocated less than 1 sec ago */
+			continue;
+		printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n",
+		       next_key, (val - v.val) / 1000000000ll, v.ip);
+	}
+}
+
+/*
+ * Load "<argv[0]>.bpf.o", attach every program in it and print the old
+ * objects once per second, forever. Returns 0 on every error path.
+ */
+int main(int ac, char **argv)
+{
+	struct bpf_link *links[2];
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	char filename[256];
+	int map_fd, j = 0;
+
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj) != 0) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
+	if (map_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		links[j] = bpf_program__attach(prog);
+		if (libbpf_get_error(links[j])) {
+			fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+			links[j] = NULL;
+			goto cleanup;
+		}
+		j++;
+	}
+
+	for (;;) {
+		print_old_objects(map_fd);
+		sleep(1);
+	}
+
+cleanup:
+	/* j is one past the last successfully attached program */
+	while (--j >= 0)
+		bpf_link__destroy(links[j]);
+
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/tracex5.bpf.c b/samples/bpf/tracex5.bpf.c
new file mode 100644
index 000000000000..4d3d6c9b25fa
--- /dev/null
+++ b/samples/bpf/tracex5.bpf.c
@@ -0,0 +1,93 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "vmlinux.h"
+#include "syscall_nrs.h"
+#include <linux/version.h>
+#include <uapi/linux/unistd.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#define __stringify(x) #x
+#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F /* starts a program named bpf_func_<F> in section "kprobe/<F after macro expansion>" */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY); /* target table of bpf_tail_call() in bpf_prog1 */
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u32));
+#ifdef __mips__
+ __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */
+#else
+ __uint(max_entries, 1024);
+#endif
+} progs SEC(".maps"); /* bpf_prog1 indexes this map with the syscall number */
+
+/*
+ * Entry kprobe on __seccomp_filter. The first argument of the probed
+ * function is used as the syscall number and as the index for a tail call
+ * into "progs". When the tail call does not transfer control, execution
+ * continues here and only numbers in [__NR_getuid, __NR_getsid] are logged.
+ */
+SEC("kprobe/__seccomp_filter")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n";
+	int sc_nr = (int)PT_REGS_PARM1(ctx);
+
+	/* dispatch into next BPF program depending on syscall number */
+	bpf_tail_call(ctx, &progs, sc_nr);
+
+	/* fall through -> no program registered for this syscall */
+	if (sc_nr < __NR_getuid || sc_nr > __NR_getsid)
+		return 0;
+
+	bpf_trace_printk(fmt, sizeof(fmt), sc_nr);
+	return 0;
+}
+
+/*
+ * we jump here when syscall number == __NR_write
+ *
+ * The second argument of the probed function is copied as a
+ * struct seccomp_data; the call is logged only when args[2] equals 512.
+ */
+PROG(SYS__NR_write)(struct pt_regs *ctx)
+{
+	char fmt[] = "write(fd=%d, buf=%p, size=%d)\n";
+	struct seccomp_data sd;
+
+	bpf_core_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
+	if (sd.args[2] != 512)
+		return 0;
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			 sd.args[0], sd.args[1], sd.args[2]);
+	return 0;
+}
+
+/*
+ * Tail-call target for the read syscall slot: copies the struct seccomp_data
+ * passed as the second argument of the probed function and logs the call
+ * when args[2] is in the range (128, 1024].
+ */
+PROG(SYS__NR_read)(struct pt_regs *ctx)
+{
+	char fmt[] = "read(fd=%d, buf=%p, size=%d)\n";
+	struct seccomp_data sd;
+
+	bpf_core_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
+	if (sd.args[2] <= 128 || sd.args[2] > 1024)
+		return 0;
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			 sd.args[0], sd.args[1], sd.args[2]);
+	return 0;
+}
+
+#ifdef __NR_mmap2
+/* Tail-call target built only when __NR_mmap2 is defined: logs "mmap2". */
+PROG(SYS__NR_mmap2)(struct pt_regs *ctx)
+{
+	char msg[] = "mmap2\n";
+
+	bpf_trace_printk(msg, sizeof(msg));
+
+	return 0;
+}
+#endif
+
+#ifdef __NR_mmap
+/* Tail-call target built only when __NR_mmap is defined: logs "mmap". */
+PROG(SYS__NR_mmap)(struct pt_regs *ctx)
+{
+	char msg[] = "mmap\n";
+
+	bpf_trace_printk(msg, sizeof(msg));
+
+	return 0;
+}
+#endif
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
new file mode 100644
index 000000000000..7e2d8397fb98
--- /dev/null
+++ b/samples/bpf/tracex5_user.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "trace_helpers.h"
+#include "bpf_util.h"
+
+#ifdef __mips__
+#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */
+#else
+#define MAX_ENTRIES 1024
+#endif
+
+/* install fake seccomp program to enable seccomp code path inside the kernel,
+ * so that our kprobe attached to __seccomp_filter() can be triggered
+ */
+static void install_accept_all_seccomp(void)
+{
+	/* single-instruction filter: allow every syscall */
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	/* a failure is reported but does not stop the sample */
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
+		perror("prctl");
+}
+
+/*
+ * Open and load "<argv[0]>.bpf.o", attach "bpf_prog1", register every program
+ * whose section name parses as "kprobe/<number>" in the "progs" map under
+ * that number, install the accept-all seccomp filter, start a dd child to
+ * generate syscalls and finally dump the trace pipe.
+ */
+int main(int ac, char **argv)
+{
+	struct bpf_link *link = NULL;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int key, fd, progs_fd;
+	const char *section;
+	char filename[256];
+	FILE *f;
+
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
+	if (!prog) {
+		printf("finding a prog in obj file failed\n");
+		goto cleanup;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	link = bpf_program__attach(prog);
+	if (libbpf_get_error(link)) {
+		fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+		link = NULL;
+		goto cleanup;
+	}
+
+	progs_fd = bpf_object__find_map_fd_by_name(obj, "progs");
+	if (progs_fd < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		section = bpf_program__section_name(prog);
+		/* register only syscalls to PROG_ARRAY */
+		if (sscanf(section, "kprobe/%d", &key) != 1)
+			continue;
+
+		fd = bpf_program__fd(prog);
+		/* keep going on failure, but do not hide it */
+		if (bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY))
+			fprintf(stderr,
+				"WARNING: registering prog for syscall %d failed\n",
+				key);
+	}
+
+	install_accept_all_seccomp();
+
+	/* generate some read/write syscalls for the handlers to report */
+	f = popen("dd if=/dev/zero of=/dev/null count=5", "r");
+	if (!f)
+		perror("popen");
+
+	read_trace_pipe();
+
+cleanup:
+	bpf_link__destroy(link);
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/tracex6.bpf.c b/samples/bpf/tracex6.bpf.c
new file mode 100644
index 000000000000..9b23b4737cfb
--- /dev/null
+++ b/samples/bpf/tracex6.bpf.c
@@ -0,0 +1,81 @@
+#include "vmlinux.h"
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(u32));
+ __uint(max_entries, 64);
+} counters SEC(".maps"); /* perf events read by both programs, indexed by bpf_get_smp_processor_id() */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, u64);
+ __uint(max_entries, 64);
+} values SEC(".maps"); /* written by bpf_prog1: CPU id -> bpf_perf_event_read() result */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, struct bpf_perf_event_value);
+ __uint(max_entries, 64);
+} values2 SEC(".maps"); /* written by bpf_prog2: CPU id -> bpf_perf_event_read_value() result */
+
+/*
+ * kprobe on htab_map_get_next_key: read the perf event stored in "counters"
+ * at the current CPU id and record the result in "values" under the same key.
+ */
+SEC("kprobe/htab_map_get_next_key")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	u32 cpu = bpf_get_smp_processor_id();
+	u64 count = bpf_perf_event_read(&counters, cpu);
+	s64 error = (s64)count;
+	u64 *slot;
+
+	/* a result in [-22, -2] is treated as a failed read: store nothing */
+	if (error >= -22 && error <= -2)
+		return 0;
+
+	slot = bpf_map_lookup_elem(&values, &cpu);
+	if (!slot) {
+		bpf_map_update_elem(&values, &cpu, &count, BPF_NOEXIST);
+		return 0;
+	}
+
+	*slot = count;
+	return 0;
+}
+
+/*
+ * Since *_map_lookup_elem can't be expected to trigger bpf programs
+ * due to potential deadlocks (bpf_disable_instrumentation), this bpf
+ * program will be attached to bpf_map_copy_value (which is called
+ * from map_lookup_elem) and will only filter the hashtable type.
+ *
+ * For hash maps it reads the perf event of the current CPU from "counters"
+ * and stores the counter/enabled/running triple in "values2".
+ */
+SEC("kprobe/bpf_map_copy_value")
+int BPF_KPROBE(bpf_prog2, struct bpf_map *map)
+{
+	struct bpf_perf_event_value reading, *slot;
+	u32 cpu = bpf_get_smp_processor_id();
+	enum bpf_map_type kind;
+	int error;
+
+	kind = BPF_CORE_READ(map, map_type);
+	if (kind != BPF_MAP_TYPE_HASH)
+		return 0;
+
+	error = bpf_perf_event_read_value(&counters, cpu, &reading,
+					  sizeof(reading));
+	if (error)
+		return 0;
+
+	slot = bpf_map_lookup_elem(&values2, &cpu);
+	if (!slot) {
+		bpf_map_update_elem(&values2, &cpu, &reading, BPF_NOEXIST);
+		return 0;
+	}
+
+	*slot = reading;
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 000000000000..ae811ac83bc2
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <fcntl.h>
+#include <linux/perf_event.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "perf-sys.h"
+
+#define SAMPLE_PERIOD 0x7fffffffffffffffULL
+
+/* counters, values, values2 */
+static int map_fd[3];
+
+/*
+ * Pin the calling process to @cpu, open the perf event described by @attr
+ * on that CPU, publish it in the "counters" map (map_fd[0]) and verify that
+ * both kprobe programs stored a reading for this CPU in map_fd[1] and
+ * map_fd[2]. Never returns: the process exits with 0 on success, 1 on error.
+ */
+static void check_on_cpu(int cpu, struct perf_event_attr *attr)
+{
+	struct bpf_perf_event_value value2;
+	int pmu_fd, error = 0;
+	cpu_set_t set;
+	__u64 value;
+
+	/* Move to target CPU */
+	CPU_ZERO(&set);
+	CPU_SET(cpu, &set);
+	assert(sched_setaffinity(0, sizeof(set), &set) == 0);
+
+	/* Open perf event and attach to the perf_event_array */
+	pmu_fd = sys_perf_event_open(attr, -1/*pid*/, cpu/*cpu*/, -1/*group_fd*/, 0);
+	if (pmu_fd < 0) {
+		fprintf(stderr, "sys_perf_event_open failed on CPU %d\n", cpu);
+		error = 1;
+		goto on_exit;
+	}
+	assert(bpf_map_update_elem(map_fd[0], &cpu, &pmu_fd, BPF_ANY) == 0);
+	assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0);
+
+	/* Trigger the kprobe */
+	bpf_map_get_next_key(map_fd[1], &cpu, NULL);
+
+	/* Check the value */
+	if (bpf_map_lookup_elem(map_fd[1], &cpu, &value)) {
+		fprintf(stderr, "Value missing for CPU %d\n", cpu);
+		error = 1;
+		goto on_exit;
+	}
+	fprintf(stderr, "CPU %d: %llu\n", cpu, value);
+
+	/* The above bpf_map_lookup_elem should trigger the second kprobe */
+	if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) {
+		fprintf(stderr, "Value2 missing for CPU %d\n", cpu);
+		error = 1;
+		goto on_exit;
+	}
+	fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu,
+		value2.counter, value2.enabled, value2.running);
+
+on_exit:
+	/* teardown failures are tolerated only when an error was already seen */
+	assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error);
+	assert(ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE, 0) == 0 || error);
+	assert(close(pmu_fd) == 0 || error);
+	assert(bpf_map_delete_elem(map_fd[1], &cpu) == 0 || error);
+	exit(error);
+}
+
+/*
+ * Fork one child per configured CPU, let each run check_on_cpu() with @attr,
+ * then wait for all of them and report a failure for @name if any child
+ * ended with a non-zero wait status.
+ */
+static void test_perf_event_array(struct perf_event_attr *attr,
+				  const char *name)
+{
+	int i, status, nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	int err = 0;
+
+	printf("Test reading %s counters\n", name);
+
+	/* sysconf() may fail; a non-positive length would make the VLA invalid */
+	if (nr_cpus <= 0) {
+		fprintf(stderr, "ERROR: cannot determine number of CPUs\n");
+		printf("Test: %s FAILED\n", name);
+		return;
+	}
+
+	pid_t pid[nr_cpus];
+
+	for (i = 0; i < nr_cpus; i++) {
+		pid[i] = fork();
+		assert(pid[i] >= 0);
+		if (pid[i] == 0) {
+			/* child: check_on_cpu() exits on its own */
+			check_on_cpu(i, attr);
+			exit(1);
+		}
+	}
+
+	for (i = 0; i < nr_cpus; i++) {
+		assert(waitpid(pid[i], &status, 0) == pid[i]);
+		err |= status;
+	}
+
+	if (err)
+		printf("Test: %s FAILED\n", name);
+}
+
+/*
+ * Run test_perf_event_array() once for each perf event flavour below.
+ * Fields not listed in an initializer are zero, exactly as when they were
+ * spelled out explicitly (freq, inherit, read_format, sample_type).
+ */
+static void test_bpf_perf_event(void)
+{
+	struct perf_event_attr attr_cycles = {
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_period = SAMPLE_PERIOD,
+	};
+	struct perf_event_attr attr_clock = {
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_CPU_CLOCK,
+		.sample_period = SAMPLE_PERIOD,
+	};
+	struct perf_event_attr attr_raw = {
+		.type = PERF_TYPE_RAW,
+		/* Intel Instruction Retired */
+		.config = 0xc0,
+		.sample_period = SAMPLE_PERIOD,
+	};
+	struct perf_event_attr attr_l1d_load = {
+		.type = PERF_TYPE_HW_CACHE,
+		.config =
+			PERF_COUNT_HW_CACHE_L1D |
+			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
+			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
+		.sample_period = SAMPLE_PERIOD,
+	};
+	struct perf_event_attr attr_llc_miss = {
+		.type = PERF_TYPE_HW_CACHE,
+		.config =
+			PERF_COUNT_HW_CACHE_LL |
+			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
+			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
+		.sample_period = SAMPLE_PERIOD,
+	};
+	struct perf_event_attr attr_msr_tsc = {
+		/* From /sys/bus/event_source/devices/msr/ */
+		.type = 7,
+		.config = 0,
+		.sample_period = 0,
+	};
+
+	test_perf_event_array(&attr_cycles, "HARDWARE-cycles");
+	test_perf_event_array(&attr_clock, "SOFTWARE-clock");
+	test_perf_event_array(&attr_raw, "RAW-instruction-retired");
+	test_perf_event_array(&attr_l1d_load, "HW_CACHE-L1D-load");
+
+	/* below tests may fail in qemu */
+	test_perf_event_array(&attr_llc_miss, "HW_CACHE-LLC-miss");
+	test_perf_event_array(&attr_msr_tsc, "Dynamic-msr-tsc");
+}
+
+/*
+ * Open and load "<argv[0]>.bpf.o", resolve the "counters", "values" and
+ * "values2" maps into map_fd[], attach every program of the object and run
+ * the perf event tests. Links and the object are released before returning.
+ */
+int main(int argc, char **argv)
+{
+	struct bpf_link *links[2];
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	char filename[256];
+	int i = 0;
+
+	snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]);
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return 0;
+	}
+
+	/* load BPF program */
+	if (bpf_object__load(obj)) {
+		fprintf(stderr, "ERROR: loading BPF object file failed\n");
+		goto cleanup;
+	}
+
+	map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters");
+	map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values");
+	map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2");
+	if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) {
+		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		/* never write past links[] if the object holds more programs */
+		if (i >= (int)(sizeof(links) / sizeof(links[0]))) {
+			fprintf(stderr, "ERROR: too many programs in BPF object file\n");
+			goto cleanup;
+		}
+		links[i] = bpf_program__attach(prog);
+		if (libbpf_get_error(links[i])) {
+			fprintf(stderr, "ERROR: bpf_program__attach failed\n");
+			links[i] = NULL;
+			goto cleanup;
+		}
+		i++;
+	}
+
+	test_bpf_perf_event();
+
+cleanup:
+	/* destroy only the links that were successfully created */
+	for (i--; i >= 0; i--)
+		bpf_link__destroy(links[i]);
+
+	bpf_object__close(obj);
+	return 0;
+}
diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh
new file mode 100755
index 000000000000..4bde9d066c46
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+#
+# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load
+# eBPF programs, both for XDP and clsbpf. Shell script function
+# wrappers and even long options parsing is illustrated, for ease of
+# use.
+#
+# Related to samples/bpf/xdp2skb_meta_kern.c, which contains BPF-progs
+# that need to collaborate between XDP and TC hooks. Thus, it is
+# convenient that the same tool load both programs that need to work
+# together.
+#
+BPF_FILE=xdp2skb_meta_kern.o
+# Quote $0 so a script path containing spaces still resolves correctly
+DIR=$(dirname "$0")
+
+# TC and IP may be preset in the environment to override the tools used
+[ -z "$TC" ] && TC=tc
+[ -z "$IP" ] && IP=ip
+
+# Print the command line help; lists every option accepted by getopt below
+function usage() {
+    echo ""
+    echo "Usage: $0 [-vfh] --dev ethX"
+    echo " -d | --dev : Network device (required)"
+    echo " -f | --flush : Cleanup flush TC and XDP progs"
+    echo " --list : (\$LIST) List TC and XDP progs"
+    echo " -v | --verbose : (\$VERBOSE) Verbose"
+    echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)"
+    echo " -h | --help : Show this help"
+    echo ""
+}
+
+## -- General shell logging cmds --
+function err() {
+ local exitcode=$1
+ shift
+ echo "ERROR: $@" >&2
+ exit $exitcode
+}
+
+# info <message...>: print the message prefixed with "# " when $VERBOSE is set
+function info() {
+    [[ -z "$VERBOSE" ]] && return 0
+    echo "# $*"
+}
+
+## -- Helper function calls --
+
+# Wrapper call for TC and IP
+# - Will display the offending command on failure
+function _call_cmd() {
+ local cmd="$1"
+ local allow_fail="$2"
+ shift 2
+ if [[ -n "$VERBOSE" ]]; then
+ echo "$cmd $@"
+ fi
+ if [[ -n "$DRYRUN" ]]; then
+ return
+ fi
+ $cmd "$@"
+ local status=$?
+ if (( $status != 0 )); then
+ if [[ "$allow_fail" == "" ]]; then
+ err 2 "Exec error($status) occurred cmd: \"$cmd $@\""
+ fi
+ fi
+}
+function call_tc() {
+ _call_cmd "$TC" "" "$@" # run $TC; _call_cmd aborts the script on failure
+}
+function call_tc_allow_fail() {
+ _call_cmd "$TC" "allow_fail" "$@" # run $TC; a failing command is tolerated
+}
+function call_ip() {
+ _call_cmd "$IP" "" "$@" # run $IP; _call_cmd aborts the script on failure
+}
+
+## --- Parse command line arguments / parameters ---
+# Using external program "getopt" to get --long-options
+OPTIONS=$(getopt -o vfhd: \
+ --long verbose,flush,help,list,dev:,dry-run -- "$@")
+if (( $? != 0 )); then
+ err 4 "Error calling getopt"
+fi
+eval set -- "$OPTIONS" # replace the positional parameters with getopt's normalized list
+
+unset DEV # LIST, VERBOSE and DRYRUN are not unset, so they can also come from the environment
+unset FLUSH
+while true; do
+ case "$1" in
+ -d | --dev ) # device
+ DEV=$2
+ info "Device set to: DEV=$DEV" >&2 # only printed when VERBOSE is already set at this point
+ shift 2
+ ;;
+ -v | --verbose)
+ VERBOSE=yes
+ # info "Verbose mode: VERBOSE=$VERBOSE" >&2
+ shift
+ ;;
+ --dry-run )
+ DRYRUN=yes
+ VERBOSE=yes
+ info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2
+ shift
+ ;;
+ -f | --flush )
+ FLUSH=yes
+ shift
+ ;;
+ --list )
+ LIST=yes
+ shift
+ ;;
+ -- ) # end-of-options marker
+ shift
+ break
+ ;;
+ -h | --help )
+ usage;
+ exit 0
+ ;;
+ * ) # anything else also ends option parsing
+ shift
+ break
+ ;;
+ esac
+done
+
+FILE="$DIR/$BPF_FILE"
+if [[ ! -e $FILE ]]; then # checked before the --flush/--list handling too, although those paths only use $DEV
+ err 3 "Missing BPF object file ($FILE)"
+fi
+
+if [[ -z $DEV ]]; then # every mode of operation needs a device
+ usage
+ err 2 "Please specify network device -- required option --dev"
+fi
+
+## -- Function calls --
+
+# list_tc <device>: show the TC ingress filters of the device
+function list_tc()
+{
+    local device="$1"
+    shift
+    info "Listing current TC ingress rules"
+    # quoted to avoid word splitting/globbing of the device name
+    call_tc filter show dev "$device" ingress
+}
+
+# list_xdp <device>: show the lines of "ip link show" that mention xdp
+function list_xdp()
+{
+    local device="$1"
+    shift
+    info "Listing current XDP device($device) setting"
+    # quoted to avoid word splitting/globbing of the device name
+    call_ip link show dev "$device" | grep --color=auto xdp
+}
+
+# flush_tc <device>: remove the ingress filters and the clsact qdisc;
+# both removals are allowed to fail (e.g. when nothing is installed)
+function flush_tc()
+{
+    local device="$1"
+    shift
+    info "Flush TC on device: $device"
+    call_tc_allow_fail filter del dev "$device" ingress
+    call_tc_allow_fail qdisc del dev "$device" clsact
+}
+
+# flush_xdp <device>: detach any XDP program from the device
+function flush_xdp()
+{
+    local device="$1"
+    shift
+    info "Flush XDP on device: $device"
+    call_ip link set dev "$device" xdp off
+}
+
+function attach_tc_mark()
+{
+ local device="$1"
+ local file="$2"
+ local prog="tc_mark"
+ shift 2
+
+ # Re-attach clsact to clear/flush existing role
+ call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null
+ call_tc qdisc add dev $device clsact
+
+ # Attach BPF prog
+ call_tc filter add dev $device ingress \
+ prio 1 handle 1 bpf da obj $file sec $prog
+}
+
+function attach_xdp_mark()
+{
+ local device="$1"
+ local file="$2"
+ local prog="xdp_mark"
+ shift 2
+
+ # Remove XDP prog in-case it's already loaded
+ # TODO: Need ip-link option to override/replace existing XDP prog
+ flush_xdp $device
+
+ # Attach XDP/BPF prog
+ call_ip link set dev $device xdp obj $file sec $prog
+}
+
+# --flush: remove both programs and stop
+if [[ -n $FLUSH ]]; then
+    flush_tc "$DEV"
+    flush_xdp "$DEV"
+    exit 0
+fi
+
+# --list: show what is currently attached and stop
+if [[ -n $LIST ]]; then
+    list_tc "$DEV"
+    list_xdp "$DEV"
+    exit 0
+fi
+
+# Default action: attach both programs; $FILE is quoted because it is built
+# from the script's directory, which may contain spaces
+attach_tc_mark "$DEV" "$FILE"
+attach_xdp_mark "$DEV" "$FILE"
diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c
new file mode 100644
index 000000000000..3c36c25d9902
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta_kern.c
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ * Example howto transfer info from XDP to SKB, e.g. skb->mark
+ * -----------------------------------------------------------
+ * This uses the XDP data_meta infrastructure, and is a cooperation
+ * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook.
+ *
+ * Notice: This example does not use the BPF C-loader,
+ * but instead rely on the iproute2 TC tool for loading BPF-objects.
+ */
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/pkt_cls.h>
+
+#include <bpf/bpf_helpers.h>
+
+/*
+ * This struct is stored in the XDP 'data_meta' area, which is located
+ * just in front of the raw packet payload data. Its meaning is
+ * specific to these two BPF programs, which use it as a communication
+ * channel. XDP adjusts/increases the area via a bpf-helper, and TC uses
+ * boundary checks to see if data has been provided.
+ *
+ * The struct must be 4 byte aligned, which here is enforced by the
+ * struct __attribute__((aligned(4))).
+ */
+struct meta_info {
+ __u32 mark; /* written by _xdp_mark, copied into the skb mark by _tc_mark */
+} __attribute__((aligned(4)));
+
+/*
+ * XDP side of the example: grow the data_meta area by one struct meta_info
+ * and store mark 42 in it. Returns XDP_ABORTED when the area cannot be
+ * reserved or does not fit in front of the packet data, XDP_PASS otherwise.
+ */
+SEC("xdp_mark")
+int _xdp_mark(struct xdp_md *ctx)
+{
+	struct meta_info *meta;
+	void *data;
+
+	/* Reserve space in-front of data pointer for our meta info.
+	 * (Notice drivers not supporting data_meta will fail here!)
+	 */
+	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)) < 0)
+		return XDP_ABORTED;
+
+	/* Notice: Kernel-side verifier requires that loading of
+	 * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(),
+	 * as pkt-data pointers are invalidated. Helpers that require
+	 * this are determined/marked by bpf_helper_changes_pkt_data()
+	 */
+	data = (void *)(unsigned long)ctx->data;
+	meta = (void *)(unsigned long)ctx->data_meta;
+
+	/* Check data_meta have room for meta_info struct */
+	if (meta + 1 > data)
+		return XDP_ABORTED;
+
+	meta->mark = 42;
+	return XDP_PASS;
+}
+
+/*
+ * TC ingress side of the example: copy the mark stored in data_meta into the
+ * skb mark, or set mark 41 when no meta_info fits in front of the packet
+ * data. Always returns TC_ACT_OK.
+ */
+SEC("tc_mark")
+int _tc_mark(struct __sk_buff *ctx)
+{
+	void *pkt = (void *)(unsigned long)ctx->data;
+	struct meta_info *meta = (void *)(unsigned long)ctx->data_meta;
+
+	/* Check XDP gave us some data_meta */
+	if (meta + 1 > pkt) {
+		/* no data_meta available: flag the packet with mark 41 */
+		ctx->mark = 41;
+		return TC_ACT_OK;
+	}
+
+	/* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */
+	ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */
+
+	return TC_ACT_OK;
+}
+
+/* Manually attaching these programs:
+export DEV=ixgbe2
+export FILE=xdp2skb_meta_kern.o
+
+# via TC command
+tc qdisc del dev $DEV clsact 2> /dev/null
+tc qdisc add dev $DEV clsact
+tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark
+tc filter show dev $DEV ingress
+
+# XDP via IP command:
+ip link set dev $DEV xdp off
+ip link set dev $DEV xdp obj $FILE sec xdp_mark
+
+# Use iptables to "see" if SKBs are marked
+iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29
+iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a
+
+# Hint: catch XDP_ABORTED errors via
+perf record -e xdp:*
+perf script
+
+*/
diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c
new file mode 100644
index 000000000000..da67bcad1c63
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_kern.c
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program shows how to use bpf_xdp_adjust_tail() by
+ * generating ICMPv4 "packet too big" messages (to be more precise for v4:
+ * destination unreachable / fragmentation needed with the DF bit set)
+ * when receiving packets bigger than 600 bytes.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <bpf/bpf_helpers.h>
+
+#define DEFAULT_TTL 64
+#define MAX_PCKT_SIZE 600
+#define ICMP_TOOBIG_SIZE 98
+#define ICMP_TOOBIG_PAYLOAD_SIZE 92
+
+/* volatile to prevent compiler optimizations */
+static volatile __u32 max_pcktsz = MAX_PCKT_SIZE;
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, __u32);
+ __type(value, __u64);
+ __uint(max_entries, 1);
+} icmpcnt SEC(".maps"); /* single counter at key 0, incremented by count_icmp() */
+
+/* Increment the single counter stored at key 0 of the icmpcnt map. */
+static __always_inline void count_icmp(void)
+{
+	/* key and value types match the icmpcnt map definition */
+	__u32 key = 0;
+	__u64 *icmp_count;
+
+	icmp_count = bpf_map_lookup_elem(&icmpcnt, &key);
+	if (icmp_count)
+		*icmp_count += 1;
+}
+
+/*
+ * Write an Ethernet header at @data whose source/destination addresses are
+ * those of @orig_eth swapped; the protocol field is copied unchanged.
+ */
+static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth)
+{
+	struct ethhdr *new_eth = data;
+
+	memcpy(new_eth->h_source, orig_eth->h_dest, ETH_ALEN);
+	memcpy(new_eth->h_dest, orig_eth->h_source, ETH_ALEN);
+	new_eth->h_proto = orig_eth->h_proto;
+}
+
+/*
+ * Fold a 32-bit checksum accumulator into 16 bits (two rounds of adding the
+ * upper half to the lower half) and return its one's complement.
+ */
+static __always_inline __u16 csum_fold_helper(__u32 csum)
+{
+	__u32 folded = (csum & 0xffff) + (csum >> 16);
+
+	folded = (folded & 0xffff) + (folded >> 16);
+	return ~folded;
+}
+
+/*
+ * Checksum @data_size bytes at @data_start with bpf_csum_diff(), seeded with
+ * the value in *@csum, and store the folded 16-bit result back into *@csum.
+ */
+static __always_inline void ipv4_csum(void *data_start, int data_size,
+				      __u32 *csum)
+{
+	__u32 sum = bpf_csum_diff(0, 0, data_start, data_size, *csum);
+
+	*csum = csum_fold_helper(sum);
+}
+
+/*
+ * Turn the (already truncated) packet into an ICMPv4 destination-unreachable
+ * / fragmentation-needed reply: make room for a new IP + ICMP header in front
+ * of the packet, rebuild the Ethernet header with swapped addresses, fill in
+ * both headers and their checksums, bump the counter and return XDP_TX.
+ * Returns XDP_DROP when the headroom or the bounds check fails.
+ */
+static __always_inline int send_icmp4_too_big(struct xdp_md *xdp)
+{
+	int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr);
+	struct iphdr *iph, *orig_iph;
+	struct icmphdr *icmp_hdr;
+	struct ethhdr *orig_eth;
+	void *data, *data_end;
+	__u32 csum = 0;
+	__u64 off = 0;
+
+	if (bpf_xdp_adjust_head(xdp, 0 - headroom))
+		return XDP_DROP;
+
+	/* packet bounds are read only after the head has been moved */
+	data = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+	if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end)
+		return XDP_DROP;
+
+	/* the original Ethernet header now sits headroom bytes into the packet */
+	orig_eth = data + headroom;
+	swap_mac(data, orig_eth);
+
+	/* layout: eth | new iphdr | icmphdr | original iphdr ... */
+	off += sizeof(struct ethhdr);
+	iph = data + off;
+	off += sizeof(struct iphdr);
+	icmp_hdr = data + off;
+	off += sizeof(struct icmphdr);
+	orig_iph = data + off;
+
+	icmp_hdr->type = ICMP_DEST_UNREACH;
+	icmp_hdr->code = ICMP_FRAG_NEEDED;
+	icmp_hdr->un.frag.mtu = htons(max_pcktsz - sizeof(struct ethhdr));
+	icmp_hdr->checksum = 0;
+	ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum);
+	icmp_hdr->checksum = csum;
+
+	/* reply goes back to the sender: swap the original addresses */
+	iph->ttl = DEFAULT_TTL;
+	iph->daddr = orig_iph->saddr;
+	iph->saddr = orig_iph->daddr;
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->protocol = IPPROTO_ICMP;
+	iph->tos = 0;
+	iph->tot_len = htons(
+		ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr));
+	iph->check = 0;
+	csum = 0;
+	ipv4_csum(iph, sizeof(struct iphdr), &csum);
+	iph->check = csum;
+
+	count_icmp();
+	return XDP_TX;
+}
+
+
+/*
+ * Pass packets that are not larger than max(max_pcktsz, ICMP_TOOBIG_SIZE).
+ * Larger packets are cut down to ICMP_TOOBIG_SIZE bytes and answered with an
+ * ICMP "too big" reply; if the tail cannot be adjusted the packet is passed.
+ */
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	int pckt_size = data_end - data;
+	int offset;
+
+	if (pckt_size <= max(max_pcktsz, ICMP_TOOBIG_SIZE))
+		return XDP_PASS;
+
+	/* number of bytes to cut from the tail */
+	offset = pckt_size - ICMP_TOOBIG_SIZE;
+	if (bpf_xdp_adjust_tail(xdp, 0 - offset))
+		return XDP_PASS;
+
+	return send_icmp4_too_big(xdp);
+}
+
+/*
+ * XDP entry point: drop frames too short for an Ethernet header, hand IPv4
+ * frames to handle_ipv4() and pass everything else.
+ */
+SEC("xdp_icmp")
+int _xdp_icmp(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eth = data;
+	__u16 h_proto;
+
+	if (eth + 1 > data_end)
+		return XDP_DROP;
+
+	h_proto = eth->h_proto;
+	if (h_proto != htons(ETH_P_IP))
+		return XDP_PASS;
+
+	return handle_ipv4(xdp);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
new file mode 100644
index 000000000000..e9426bd65420
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -0,0 +1,198 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/if.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#define STATS_INTERVAL_S 2U
+#define MAX_PCKT_SIZE 600
+
+static int ifindex = -1;
+static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+static __u32 prog_id;
+
+/*
+ * Signal handler / exit path: when an interface was selected, detach the XDP
+ * program only if the one currently attached is still ours (same id as
+ * prog_id). Exits with 1 when the query fails and with 0 otherwise.
+ */
+static void int_exit(int sig)
+{
+	__u32 curr_prog_id = 0;
+
+	if (ifindex <= -1)
+		exit(0);
+
+	if (bpf_xdp_query_id(ifindex, xdp_flags, &curr_prog_id)) {
+		printf("bpf_xdp_query_id failed\n");
+		exit(1);
+	}
+
+	if (prog_id == curr_prog_id)
+		bpf_xdp_detach(ifindex, xdp_flags, NULL);
+	else if (!curr_prog_id)
+		printf("couldn't find a prog id on a given iface\n");
+	else
+		printf("program on interface changed, not removing\n");
+
+	exit(0);
+}
+
+/* simple "icmp packet too big sent" counter
+ *
+ * Every STATS_INTERVAL_S seconds read key 0 of the map behind @map_fd and
+ * print it. Runs forever when @kill_after_s is 0, otherwise until that many
+ * seconds have elapsed.
+ */
+static void poll_stats(unsigned int map_fd, unsigned int kill_after_s)
+{
+	time_t started_at = time(NULL);
+	__u64 value = 0;
+	int key = 0;
+	int err;
+
+	while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
+		sleep(STATS_INTERVAL_S);
+
+		/* keep the lookup outside assert() so it still runs with NDEBUG */
+		err = bpf_map_lookup_elem(map_fd, &key, &value);
+		assert(err == 0);
+		(void)err;
+
+		printf("icmp \"packet too big\" sent: %10llu pkts\n", value);
+	}
+}
+
+/* Print the command line help for the program invoked as @cmd. */
+static void usage(const char *cmd)
+{
+	printf("Start a XDP prog which send ICMP \"packet too big\" \n"
+		"messages if ingress packet is bigger than MAX_PCKT_SIZE bytes\n");
+	printf("Usage: %s [...]\n", cmd);
+	printf(" -i <ifname|ifindex> Interface\n");
+	printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
+	printf(" -P <MAX_PCKT_SIZE> Default: %u\n", MAX_PCKT_SIZE);
+	printf(" -S use skb-mode\n");
+	printf(" -N enforce native mode\n");
+	printf(" -F force loading prog\n");
+	printf(" -h Display this help\n");
+}
+
+/*
+ * Parse the options, load "<argv[0]>_kern.o", optionally override the
+ * max_pcktsz global through the object's .data map, attach the first program
+ * of the object to the chosen interface and poll the icmpcnt counter until
+ * the timeout expires or a signal arrives.
+ */
+int main(int argc, char **argv)
+{
+	unsigned char opt_flags[256] = {};
+	const char *optstr = "i:T:P:SNFh";
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	unsigned int kill_after_s = 0;
+	int i, prog_fd, map_fd, opt;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	__u32 max_pckt_size = 0;
+	__u32 key = 0;
+	char filename[256];
+	int err;
+
+	/* lower-case options other than 'h' are mandatory */
+	for (i = 0; i < strlen(optstr); i++)
+		if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
+			opt_flags[(unsigned char)optstr[i]] = 1;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+
+		switch (opt) {
+		case 'i':
+			ifindex = if_nametoindex(optarg);
+			if (!ifindex)
+				ifindex = atoi(optarg);
+			break;
+		case 'T':
+			kill_after_s = atoi(optarg);
+			break;
+		case 'P':
+			max_pckt_size = atoi(optarg);
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'N':
+			/* default, set below */
+			break;
+		case 'F':
+			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+			break;
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+		/* option seen: no longer missing */
+		opt_flags[opt] = 0;
+	}
+
+	if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
+		xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+	for (i = 0; i < strlen(optstr); i++) {
+		if (opt_flags[(unsigned int)optstr[i]]) {
+			fprintf(stderr, "Missing argument -%c\n", optstr[i]);
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (!ifindex) {
+		fprintf(stderr, "Invalid ifname\n");
+		return 1;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj))
+		return 1;
+
+	/* an object without any program would hand NULL to the calls below */
+	prog = bpf_object__next_program(obj, NULL);
+	if (!prog) {
+		fprintf(stderr, "no program found in %s\n", filename);
+		return 1;
+	}
+	bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
+
+	err = bpf_object__load(obj);
+	if (err)
+		return 1;
+
+	prog_fd = bpf_program__fd(prog);
+
+	/* static global var 'max_pcktsz' is accessible from .data section */
+	if (max_pckt_size) {
+		map_fd = bpf_object__find_map_fd_by_name(obj, "xdp_adju.data");
+		if (map_fd < 0) {
+			printf("finding a max_pcktsz map in obj file failed\n");
+			return 1;
+		}
+		/* report (but tolerate) a failed override of the default */
+		if (bpf_map_update_elem(map_fd, &key, &max_pckt_size, BPF_ANY))
+			fprintf(stderr,
+				"WARNING: setting max_pcktsz failed, default stays in effect\n");
+	}
+
+	/* fetch icmpcnt map */
+	map_fd = bpf_object__find_map_fd_by_name(obj, "icmpcnt");
+	if (map_fd < 0) {
+		printf("finding a icmpcnt map in obj file failed\n");
+		return 1;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) {
+		printf("link set xdp fd failed\n");
+		return 1;
+	}
+
+	/* remember our program id so int_exit() only detaches our program */
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
+	if (err) {
+		printf("can't get prog info - %s\n", strerror(errno));
+		return 1;
+	}
+	prog_id = info.id;
+
+	poll_stats(map_fd, kill_after_s);
+	int_exit(0);
+
+	return 0;
+}
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000000..54c099cbd639
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <bpf/bpf_helpers.h>
+
+#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF)
+
+struct { /* devmap of permitted egress devices; xdp_fwd_user.c stores ifindex -> ifindex */
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __uint(max_entries, 64);
+} xdp_tx_ports SEC(".maps"); /* looked up and redirected through with fib_params.ifindex as key */
+
+/* Mirrors ip_decrease_ttl() from include/net/ip.h: add htons(0x0100) to the
+ * stored header checksum (with an end-around carry once the sum reaches
+ * 0xFFFF) and decrement the TTL. Returns the TTL after the decrement.
+ */
+static __always_inline int ip_decrease_ttl(struct iphdr *iph)
+{
+	u32 sum = (__force u32)iph->check + (__force u32)htons(0x0100);
+
+	if (sum >= 0xFFFF)
+		sum += 1;
+	iph->check = (__force __sum16)sum;
+
+	iph->ttl -= 1;
+	return iph->ttl;
+}
+
+static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
+{ /* Shared body of both XDP programs in this file: run bpf_fib_lookup() on an
+ * IPv4/IPv6 packet and, on success, rewrite the Ethernet addresses and
+ * redirect the packet through xdp_tx_ports. @flags is handed to
+ * bpf_fib_lookup() unchanged. Returns XDP_DROP, XDP_PASS or the value
+ * returned by bpf_redirect_map().
+ */
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct bpf_fib_lookup fib_params;
+ struct ethhdr *eth = data;
+ struct ipv6hdr *ip6h;
+ struct iphdr *iph;
+ u16 h_proto;
+ u64 nh_off;
+ int rc;
+
+ nh_off = sizeof(*eth); /* L3 header is assumed to follow the Ethernet header directly */
+ if (data + nh_off > data_end) /* packet too short for an Ethernet header */
+ return XDP_DROP;
+
+ __builtin_memset(&fib_params, 0, sizeof(fib_params));
+
+ h_proto = eth->h_proto;
+ if (h_proto == htons(ETH_P_IP)) {
+ iph = data + nh_off;
+
+ if (iph + 1 > data_end) /* fixed IPv4 header must lie inside the packet */
+ return XDP_DROP;
+
+ if (iph->ttl <= 1) /* TTL would reach zero: do not forward from XDP */
+ return XDP_PASS;
+
+ fib_params.family = AF_INET;
+ fib_params.tos = iph->tos;
+ fib_params.l4_protocol = iph->protocol;
+ fib_params.sport = 0;
+ fib_params.dport = 0;
+ fib_params.tot_len = ntohs(iph->tot_len);
+ fib_params.ipv4_src = iph->saddr;
+ fib_params.ipv4_dst = iph->daddr;
+ } else if (h_proto == htons(ETH_P_IPV6)) {
+ struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src; /* views onto the lookup key, */
+ struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst; /* written by struct assignment below */
+
+ ip6h = data + nh_off;
+ if (ip6h + 1 > data_end) /* fixed IPv6 header must lie inside the packet */
+ return XDP_DROP;
+
+ if (ip6h->hop_limit <= 1) /* hop limit would reach zero: do not forward from XDP */
+ return XDP_PASS;
+
+ fib_params.family = AF_INET6;
+ fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; /* first header word masked with IPV6_FLOWINFO_MASK */
+ fib_params.l4_protocol = ip6h->nexthdr;
+ fib_params.sport = 0;
+ fib_params.dport = 0;
+ fib_params.tot_len = ntohs(ip6h->payload_len);
+ *src = ip6h->saddr;
+ *dst = ip6h->daddr;
+ } else {
+ return XDP_PASS; /* neither IPv4 nor IPv6: not handled here */
+ }
+
+ fib_params.ifindex = ctx->ingress_ifindex;
+
+ rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+ /*
+ * Some return codes (rc) of bpf_fib_lookup() are important for
+ * understanding how this XDP program interacts with the network stack.
+ *
+ * BPF_FIB_LKUP_RET_NO_NEIGH:
+ * Even when the route lookup succeeds, the MAC addresses are still
+ * needed. They come from the ARP/neighbour table; if that table is
+ * (still) empty, BPF_FIB_LKUP_RET_NO_NEIGH is returned. To avoid
+ * doing ARP resolution directly from XDP, the packet is sent to the
+ * normal network stack via XDP_PASS, which is expected to resolve it.
+ *
+ * BPF_FIB_LKUP_RET_FWD_DISABLED:
+ * bpf_fib_lookup() respects the sysctl net.ipv{4,6}.conf.all.forwarding
+ * setting and returns BPF_FIB_LKUP_RET_FWD_DISABLED when forwarding is
+ * not enabled on the ingress device.
+ */
+ if (rc == BPF_FIB_LKUP_RET_SUCCESS) {
+ /* Verify that the egress index has been configured as a TX port.
+ * (Note: the user can still have inserted an egress ifindex that
+ * doesn't support XDP xmit, which will result in packet drops.)
+ *
+ * Note: lookup in devmap is supported since 0cdbb4b09a0.
+ * If it is not supported, loading fails with:
+ * cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+ */
+ if (!bpf_map_lookup_elem(&xdp_tx_ports, &fib_params.ifindex))
+ return XDP_PASS;
+
+ if (h_proto == htons(ETH_P_IP))
+ ip_decrease_ttl(iph);
+ else if (h_proto == htons(ETH_P_IPV6))
+ ip6h->hop_limit--;
+
+ memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); /* MACs as filled in by the lookup */
+ memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+ return bpf_redirect_map(&xdp_tx_ports, fib_params.ifindex, 0);
+ }
+
+ return XDP_PASS; /* every other lookup result is left to the stack */
+}
+
+SEC("xdp_fwd") /* section name; xdp_fwd_user.c picks the program by comparing against it */
+int xdp_fwd_prog(struct xdp_md *ctx)
+{
+ return xdp_fwd_flags(ctx, 0); /* FIB lookup without extra flags */
+}
+
+SEC("xdp_fwd_direct") /* section selected by the -D option of xdp_fwd_user.c */
+int xdp_fwd_direct_prog(struct xdp_md *ctx)
+{
+ return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT); /* same forwarding path, lookup done with BPF_FIB_LOOKUP_DIRECT */
+}
+
+char _license[] SEC("license") = "GPL"; /* license string emitted into the "license" ELF section */
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000000..193b3b79b31f
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/limits.h>
+#include <net/if.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; /* attach/detach flags; main() adjusts them for -S and -F */
+
+/* Attach the XDP program behind @prog_fd to interface @idx and record the
+ * interface in the devmap @map_fd (key and value are both @idx) so that it
+ * may be used as an egress port. @name is only used in error messages.
+ * Returns 0 on success, otherwise the error of the step that failed.
+ */
+static int do_attach(int idx, int prog_fd, int map_fd, const char *name)
+{
+	int rc = bpf_xdp_attach(idx, prog_fd, xdp_flags, NULL);
+
+	if (rc >= 0) {
+		/* Register the ifindex as a possible egress TX port. */
+		rc = bpf_map_update_elem(map_fd, &idx, &idx, 0);
+		if (rc != 0)
+			printf("ERROR: failed using device %s as TX-port\n", name);
+	} else {
+		printf("ERROR: failed to attach program to %s\n", name);
+	}
+
+	return rc;
+}
+
+/* Detach the XDP program from @ifindex, but only if the program currently
+ * attached there is named "<app_name>_prog" (cut to BPF_OBJ_NAME_LEN - 1
+ * characters). @ifname is only used in messages.
+ * Returns 0 on success and a non-zero value on any failure.
+ */
+static int do_detach(int ifindex, const char *ifname, const char *app_name)
+{
+	LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
+	char expected[BPF_OBJ_NAME_LEN];
+	struct bpf_prog_info info = {};
+	__u32 len = sizeof(info);
+	__u32 attached_id;
+	int fd, rc;
+
+	if (bpf_xdp_query_id(ifindex, xdp_flags, &attached_id)) {
+		printf("ERROR: bpf_xdp_query_id failed (%s)\n",
+		       strerror(errno));
+		return 1;
+	}
+
+	if (attached_id == 0) {
+		printf("ERROR: flags(0x%x) xdp prog is not attached to %s\n",
+		       xdp_flags, ifname);
+		return 1;
+	}
+
+	fd = bpf_prog_get_fd_by_id(attached_id);
+	if (fd < 0) {
+		printf("ERROR: bpf_prog_get_fd_by_id failed (%s)\n",
+		       strerror(errno));
+		return fd;
+	}
+
+	rc = bpf_prog_get_info_by_fd(fd, &info, &len);
+	if (rc) {
+		printf("ERROR: bpf_prog_get_info_by_fd failed (%s)\n",
+		       strerror(errno));
+	} else {
+		snprintf(expected, sizeof(expected), "%s_prog", app_name);
+		expected[BPF_OBJ_NAME_LEN - 1] = '\0';
+
+		if (strcmp(info.name, expected) != 0) {
+			printf("ERROR: %s isn't attached to %s\n", app_name, ifname);
+			rc = 1;
+		} else {
+			/* Only detach if the program we inspected is still the one attached. */
+			opts.old_prog_fd = fd;
+			rc = bpf_xdp_detach(ifindex, xdp_flags, &opts);
+			if (rc < 0)
+				printf("ERROR: failed to detach program from %s (%s)\n",
+				       ifname, strerror(errno));
+			/* TODO: Remember to cleanup map, when adding use of shared map
+			 * bpf_map_delete_elem((map_fd, &idx);
+			 */
+		}
+	}
+
+	close(fd);
+	return rc;
+}
+
+static void usage(const char *prog)
+{ /* Print the command-line synopsis to stderr; @prog is the program name shown in it. */
+ fprintf(stderr,
+ "usage: %s [OPTS] interface-list\n"
+ "\nOPTS:\n"
+ " -d detach program\n"
+ " -S use skb-mode\n"
+ " -F force loading prog\n"
+ " -D direct table lookups (skip fib rules)\n",
+ prog);
+}
+
+/* Attach (default) or detach (-d) the forwarding program on every interface
+ * given on the command line. Interfaces may be given by name or by index.
+ *
+ * Options: -d detach, -S skb mode, -F force (replace an attached program),
+ * -D use the "xdp_fwd_direct" program instead of "xdp_fwd".
+ *
+ * Returns 0 when every interface was processed successfully, otherwise the
+ * last error reported by do_attach()/do_detach(), or 1 for setup failures.
+ */
+int main(int argc, char **argv)
+{
+	const char *prog_name = "xdp_fwd";
+	struct bpf_program *prog = NULL;
+	struct bpf_program *pos;
+	const char *sec_name;
+	int prog_fd = -1, map_fd = -1;
+	char filename[PATH_MAX];
+	struct bpf_object *obj;
+	int opt, i, idx, err;
+	int attach = 1;
+	int ret = 0;
+
+	while ((opt = getopt(argc, argv, ":dDSF")) != -1) {
+		switch (opt) {
+		case 'd':
+			attach = 0;
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'F':
+			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+			break;
+		case 'D':
+			prog_name = "xdp_fwd_direct";
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	/* Native (driver) mode is the default unless skb mode was requested. */
+	if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
+		xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	if (attach) {
+		snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+		/* R_OK, not O_RDONLY: O_RDONLY is an open() flag whose value (0)
+		 * makes access() test for mere existence.
+		 */
+		if (access(filename, R_OK) < 0) {
+			printf("error accessing file %s: %s\n",
+			       filename, strerror(errno));
+			return 1;
+		}
+
+		obj = bpf_object__open_file(filename, NULL);
+		if (libbpf_get_error(obj))
+			return 1;
+
+		/* The object carries one program per section ("xdp_fwd" and
+		 * "xdp_fwd_direct"); mark each of them as XDP, not just the
+		 * first one, so whichever is selected below has its type set.
+		 */
+		bpf_object__for_each_program(pos, obj)
+			bpf_program__set_type(pos, BPF_PROG_TYPE_XDP);
+
+		err = bpf_object__load(obj);
+		if (err) {
+			printf("Does kernel support devmap lookup?\n");
+			/* If not, the error message will be:
+			 *  "cannot pass map_type 14 into func bpf_map_lookup_elem#1"
+			 */
+			return 1;
+		}
+
+		/* Select the program whose ELF section matches the request. */
+		bpf_object__for_each_program(pos, obj) {
+			sec_name = bpf_program__section_name(pos);
+			if (sec_name && !strcmp(sec_name, prog_name)) {
+				prog = pos;
+				break;
+			}
+		}
+
+		/* Fail explicitly instead of silently attaching another program. */
+		prog_fd = prog ? bpf_program__fd(prog) : -ENOENT;
+		if (prog_fd < 0) {
+			/* prog_fd holds a negative errno value here */
+			printf("program not found: %s\n", strerror(-prog_fd));
+			return 1;
+		}
+		map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj,
+								  "xdp_tx_ports"));
+		if (map_fd < 0) {
+			printf("map not found: %s\n", strerror(-map_fd));
+			return 1;
+		}
+	}
+
+	for (i = optind; i < argc; ++i) {
+		/* Accept an interface name first, then fall back to a number. */
+		idx = if_nametoindex(argv[i]);
+		if (!idx)
+			idx = strtoul(argv[i], NULL, 0);
+
+		if (!idx) {
+			fprintf(stderr, "Invalid arg\n");
+			return 1;
+		}
+
+		if (attach)
+			err = do_attach(idx, prog_fd, map_fd, argv[i]);
+		else
+			err = do_detach(idx, argv[i], prog_name);
+
+		/* Keep going, but remember the most recent failure. */
+		if (err)
+			ret = err;
+	}
+
+	return ret;
+}
diff --git a/samples/bpf/xdp_router_ipv4.bpf.c b/samples/bpf/xdp_router_ipv4.bpf.c
new file mode 100644
index 000000000000..0643330d1d2e
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4.bpf.c
@@ -0,0 +1,189 @@
+/* Copyright (C) 2017 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include "vmlinux.h"
+#include "xdp_sample.bpf.h"
+#include "xdp_sample_shared.h"
+
+#define ETH_ALEN 6
+#define ETH_P_8021Q 0x8100
+#define ETH_P_8021AD 0x88A8
+
+struct trie_value { /* value type of lpm_map; xdp_router_ipv4_user.c declares a matching struct */
+ __u8 prefix[4]; /* destination prefix bytes, copied from the trie key by userspace */
+ __be64 value; /* MAC used as the source address when forwarding via this route */
+ int ifindex; /* egress device; used as the tx_port key */
+ int metric; /* route metric recorded by userspace */
+ __be32 gw; /* gateway address; 0 means no gateway */
+};
+
+/* Key for lpm_trie: xdp_router_ipv4_prog stores the prefix length in b32[0]
+ * and the four destination address bytes in b8[4..7].
+ */
+union key_4 {
+ u32 b32[2];
+ u8 b8[8];
+};
+
+struct arp_entry { /* neighbour information embedded in struct direct_map */
+ __be64 mac; /* neighbour MAC; 0 is treated as "unknown" by the XDP program */
+ __be32 dst; /* IPv4 address the MAC belongs to */
+};
+
+struct direct_map { /* value type of the exact_match map */
+ struct arp_entry arp; /* arp.mac becomes the destination MAC */
+ int ifindex; /* egress device; used as the tx_port key */
+ __be64 mac; /* becomes the source MAC */
+};
+
+/* Map for trie implementation: longest-prefix-match route table. Keys follow
+ * the layout of union key_4 and values are struct trie_value.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_LPM_TRIE);
+ __uint(key_size, 8); /* sizeof(union key_4) */
+ __uint(value_size, sizeof(struct trie_value));
+ __uint(max_entries, 50);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+} lpm_map SEC(".maps");
+
+/* Map for ARP table: IPv4 address -> MAC address (held in a __be64) */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, __be32);
+ __type(value, __be64);
+ __uint(max_entries, 50);
+} arp_table SEC(".maps");
+
+/* Map to keep the exact match entries in the route table: destination
+ * IPv4 address -> struct direct_map. It is consulted before lpm_map.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, __be32);
+ __type(value, struct direct_map);
+ __uint(max_entries, 50);
+} exact_match SEC(".maps");
+
+struct { /* devmap used as the redirect target; userspace stores ifindex -> ifindex */
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __uint(max_entries, 100);
+} tx_port SEC(".maps");
+
+SEC("xdp")
+int xdp_router_ipv4_prog(struct xdp_md *ctx)
+{ /* Route IPv4 packets using the maps filled in by xdp_router_ipv4_user.c:
+ * ARP frames are passed to the stack; IPv4 frames are matched against
+ * exact_match first and lpm_map second, get their MAC addresses rewritten
+ * and are redirected through tx_port. Everything else is dropped.
+ * Each outcome is counted in the rx_cnt record for key 0, when present.
+ */
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ u64 nh_off = sizeof(*eth);
+ struct datarec *rec;
+ __be16 h_proto;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (rec)
+ NO_TEAR_INC(rec->processed);
+
+ if (data + nh_off > data_end) /* packet too short for an Ethernet header */
+ goto drop;
+
+ h_proto = eth->h_proto;
+ if (h_proto == bpf_htons(ETH_P_8021Q) ||
+ h_proto == bpf_htons(ETH_P_8021AD)) { /* skip a single VLAN tag */
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ goto drop;
+
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+
+ switch (bpf_ntohs(h_proto)) {
+ case ETH_P_ARP:
+ if (rec)
+ NO_TEAR_INC(rec->xdp_pass);
+ return XDP_PASS;
+ case ETH_P_IP: {
+ struct iphdr *iph = data + nh_off;
+ struct direct_map *direct_entry;
+ __be64 *dest_mac, *src_mac;
+ int forward_to;
+
+ if (iph + 1 > data_end) /* fixed IPv4 header must lie inside the packet */
+ goto drop;
+
+ direct_entry = bpf_map_lookup_elem(&exact_match, &iph->daddr);
+
+ /* Check for exact match, this would give a faster lookup */
+ if (direct_entry && direct_entry->mac &&
+ direct_entry->arp.mac) { /* usable only when both MACs are known (non-zero) */
+ src_mac = &direct_entry->mac;
+ dest_mac = &direct_entry->arp.mac;
+ forward_to = direct_entry->ifindex;
+ } else {
+ struct trie_value *prefix_value;
+ union key_4 key4;
+
+ /* Look up in the trie for lpm */
+ key4.b32[0] = 32; /* prefix length: match on the full address */
+ key4.b8[4] = iph->daddr & 0xff;
+ key4.b8[5] = (iph->daddr >> 8) & 0xff;
+ key4.b8[6] = (iph->daddr >> 16) & 0xff;
+ key4.b8[7] = (iph->daddr >> 24) & 0xff;
+
+ prefix_value = bpf_map_lookup_elem(&lpm_map, &key4);
+ if (!prefix_value) /* no route */
+ goto drop;
+
+ forward_to = prefix_value->ifindex;
+ src_mac = &prefix_value->value;
+ if (!src_mac) /* address of a member of a non-NULL struct, so never taken */
+ goto drop;
+
+ dest_mac = bpf_map_lookup_elem(&arp_table, &iph->daddr);
+ if (!dest_mac) { /* destination unknown in arp_table: try the route's gateway */
+ if (!prefix_value->gw)
+ goto drop;
+
+ dest_mac = bpf_map_lookup_elem(&arp_table,
+ &prefix_value->gw);
+ if (!dest_mac) {
+ /* Forward the packet to the kernel in
+ * order to trigger ARP discovery for
+ * the default gw.
+ */
+ if (rec)
+ NO_TEAR_INC(rec->xdp_pass);
+ return XDP_PASS;
+ }
+ }
+ }
+
+ if (src_mac && dest_mac) {
+ int ret;
+
+ __builtin_memcpy(eth->h_dest, dest_mac, ETH_ALEN); /* first ETH_ALEN bytes of the __be64 */
+ __builtin_memcpy(eth->h_source, src_mac, ETH_ALEN);
+
+ ret = bpf_redirect_map(&tx_port, forward_to, 0);
+ if (ret == XDP_REDIRECT) {
+ if (rec)
+ NO_TEAR_INC(rec->xdp_redirect);
+ return ret;
+ }
+ }
+ } /* no return on this path: control falls through default to the drop label */
+ default:
+ break;
+ }
+drop:
+ if (rec)
+ NO_TEAR_INC(rec->xdp_drop);
+
+ return XDP_DROP;
+}
+
+char _license[] SEC("license") = "GPL"; /* license string emitted into the "license" ELF section */
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
new file mode 100644
index 000000000000..266fdd0b025d
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -0,0 +1,699 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2017 Cavium, Inc.
+ */
+#include <linux/bpf.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <net/if.h>
+#include <netdb.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include "bpf_util.h"
+#include <bpf/libbpf.h>
+#include <libgen.h>
+#include <getopt.h>
+#include <pthread.h>
+#include "xdp_sample_user.h"
+#include "xdp_router_ipv4.skel.h"
+
+static const char *__doc__ = /* help text passed to usage() from main() */
+"XDP IPv4 router implementation\n"
+"Usage: xdp_router_ipv4 <IFNAME-0> ... <IFNAME-N>\n";
+
+static char buf[8192]; /* shared netlink receive buffer: filled by recv_msg(), parsed by read_route()/read_arp() */
+static int lpm_map_fd; /* map fds below are resolved from the skeleton in main() */
+static int arp_table_map_fd;
+static int exact_match_map_fd;
+static int tx_port_map_fd;
+
+static bool routes_thread_exit; /* set by main() after sample_run() to stop monitor_routes_thread() */
+static int interval = 5; /* seconds; overridden by -i, used for sample_run() and the monitor sleep */
+
+static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT |
+ SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_EXCEPTION_CNT; /* stats selection handed to sample_init(); -s adds SAMPLE_REDIRECT_MAP_CNT */
+
+DEFINE_SAMPLE_INIT(xdp_router_ipv4); /* NOTE(review): presumably generates the sample_init*() helpers for this skeleton; confirm in xdp_sample_user.h */
+
+static const struct option long_options[] = { /* long spellings of the "si:SFvh" short options parsed in main() */
+ { "help", no_argument, NULL, 'h' },
+ { "skb-mode", no_argument, NULL, 'S' },
+ { "force", no_argument, NULL, 'F' },
+ { "interval", required_argument, NULL, 'i' },
+ { "verbose", no_argument, NULL, 'v' },
+ { "stats", no_argument, NULL, 's' },
+ {} /* terminating all-zero entry */
+};
+
+static int get_route_table(int rtm_family); /* forward declaration: read_route() calls it before its definition */
+
+/* Receive netlink data from @sock into the global buf.
+ *
+ * When @sock_addr subscribes to RTMGRP_NEIGH or RTMGRP_IPV4_ROUTE a single
+ * datagram is read. Otherwise (dump request) datagrams are appended until
+ * one starts with NLMSG_DONE, the socket returns no data, or buf cannot
+ * hold another message header, in which case the dump is truncated.
+ *
+ * Returns the number of bytes stored in buf, or a negative errno value if
+ * recv() fails (errno is left untouched for the caller).
+ */
+static int recv_msg(struct sockaddr_nl sock_addr, int sock)
+{
+	struct nlmsghdr *nh;
+	int len, nll = 0;
+	char *buf_ptr;
+
+	buf_ptr = buf;
+	while (1) {
+		size_t room = sizeof(buf) - nll;
+
+		/* Never call recv() with no space left: the header check
+		 * below would otherwise read past the end of buf.
+		 */
+		if (room < sizeof(*nh))
+			break;
+
+		len = recv(sock, buf_ptr, room, 0);
+		if (len < 0)
+			return -errno;
+
+		/* Nothing (or less than a header) arrived: stop instead of
+		 * spinning on a header that was never written.
+		 */
+		if (len < (int)sizeof(*nh))
+			break;
+
+		nh = (struct nlmsghdr *)buf_ptr;
+
+		if (nh->nlmsg_type == NLMSG_DONE)
+			break;
+		buf_ptr += len;
+		nll += len;
+		if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH)
+			break;
+
+		if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE)
+			break;
+	}
+	return nll;
+}
+
+/* Function to parse the route entries returned by netlink.
+ * Updates the route entry related map entries (tx_port, exact_match and
+ * lpm_map) for every message in the @nll bytes starting at @nh.
+ *
+ * Only AF_INET routes of the main table update exact_match/lpm_map. A
+ * deleted route triggers one re-read of the whole route table, after the
+ * current batch has been processed, so that a route with the same prefix
+ * but a different metric can take over.
+ *
+ * Map update failures abort the program via assert(), as before; the calls
+ * themselves are kept outside assert() so they survive -DNDEBUG builds.
+ */
+static void read_route(struct nlmsghdr *nh, int nll)
+{
+	struct route_table {
+		int dst_len, iface, metric;
+		__be32 dst, gw;
+		__be64 mac;
+	} route;
+	struct arp_table {
+		__be64 mac;
+		__be32 dst;
+	};
+	struct direct_map {
+		struct arp_table arp;
+		int ifindex;
+		__be64 mac;
+	} direct_entry;
+	struct trie_value {
+		__u8 prefix[4];
+		__be64 value;
+		int ifindex;
+		int metric;
+		__be32 gw;
+	} prefix_value;
+	/* Same 8-byte layout as the key_size of lpm_map. */
+	struct {
+		__u32 prefixlen;
+		__u8 data[4];
+	} prefix_key;
+	struct rtattr *rt_attr;
+	struct rtmsg *rt_msg;
+	bool reread = false;
+	bool found;
+	int rtm_family;
+	int rtl, ret, i;
+
+	for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
+		rt_msg = (struct rtmsg *)NLMSG_DATA(nh);
+		rtm_family = rt_msg->rtm_family;
+		if (rtm_family == AF_INET && rt_msg->rtm_table != RT_TABLE_MAIN)
+			continue;
+
+		/* Start every message from a clean slate so that attributes
+		 * missing from this message read as 0 instead of leaking in
+		 * from the previous one.
+		 */
+		memset(&route, 0, sizeof(route));
+		route.dst_len = rt_msg->rtm_dst_len;
+
+		rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
+		rtl = RTM_PAYLOAD(nh);
+		for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
+			switch (rt_attr->rta_type) {
+			case RTA_DST:
+				route.dst = *((__be32 *)RTA_DATA(rt_attr));
+				break;
+			case RTA_GATEWAY:
+				route.gw = *((__be32 *)RTA_DATA(rt_attr));
+				break;
+			case RTA_OIF:
+				route.iface = *((int *)RTA_DATA(rt_attr));
+				break;
+			case RTA_METRICS:
+				/* NOTE(review): kept as in the original; confirm
+				 * whether RTA_PRIORITY was intended here.
+				 */
+				route.metric = *((int *)RTA_DATA(rt_attr));
+				break;
+			default:
+				break;
+			}
+		}
+
+		ret = get_mac_addr(route.iface, &route.mac);
+		assert(ret == 0);
+		ret = bpf_map_update_elem(tx_port_map_fd,
+					  &route.iface, &route.iface, 0);
+		assert(ret == 0);
+
+		if (rtm_family != AF_INET)
+			continue;
+
+		memset(&prefix_key, 0, sizeof(prefix_key));
+		prefix_key.prefixlen = route.dst_len;
+		for (i = 0; i < 4; i++)
+			prefix_key.data[i] = (route.dst >> i * 8) & 0xff;
+
+		/* Host routes are mirrored into the exact-match table. */
+		memset(&direct_entry, 0, sizeof(direct_entry));
+		direct_entry.mac = route.mac & 0xffffffffffff;
+		direct_entry.ifindex = route.iface;
+		if (route.dst_len == 32) {
+			if (nh->nlmsg_type == RTM_DELROUTE) {
+				ret = bpf_map_delete_elem(exact_match_map_fd,
+							  &route.dst);
+				assert(ret == 0);
+			} else {
+				if (bpf_map_lookup_elem(arp_table_map_fd,
+							&route.dst,
+							&direct_entry.arp.mac) == 0)
+					direct_entry.arp.dst = route.dst;
+				ret = bpf_map_update_elem(exact_match_map_fd,
+							  &route.dst,
+							  &direct_entry, 0);
+				assert(ret == 0);
+			}
+		}
+
+		memset(&prefix_value, 0, sizeof(prefix_value));
+		found = bpf_map_lookup_elem(lpm_map_fd, &prefix_key,
+					    &prefix_value) == 0;
+
+		if (found && nh->nlmsg_type == RTM_DELROUTE) {
+			ret = bpf_map_delete_elem(lpm_map_fd, &prefix_key);
+			assert(ret == 0);
+			/* Re-read the route table to check whether there is
+			 * an entry with the same prefix but a different
+			 * metric as the deleted entry. Deferred until this
+			 * batch is done because the re-read overwrites the
+			 * shared receive buffer that is being walked here.
+			 */
+			reread = true;
+			continue;
+		}
+
+		/* An entry for the very same prefix with an equal or better
+		 * metric is already installed: keep it.
+		 */
+		if (found &&
+		    memcmp(prefix_key.data, prefix_value.prefix,
+			   sizeof(prefix_value.prefix)) == 0 &&
+		    route.metric >= prefix_value.metric)
+			continue;
+
+		for (i = 0; i < 4; i++)
+			prefix_value.prefix[i] = prefix_key.data[i];
+		prefix_value.value = route.mac & 0xffffffffffff;
+		prefix_value.ifindex = route.iface;
+		prefix_value.gw = route.gw;
+		prefix_value.metric = route.metric;
+
+		ret = bpf_map_update_elem(lpm_map_fd, &prefix_key,
+					  &prefix_value, 0);
+		assert(ret == 0);
+	}
+
+	if (reread)
+		get_route_table(AF_INET);
+}
+
+/* Function to read the existing route table when the process is launched.
+ * Sends an RTM_GETROUTE dump request for @rtm_family (main table) on a
+ * fresh netlink socket and feeds the reply to read_route().
+ *
+ * Returns 0 on success or a negative value on failure.
+ */
+static int get_route_table(int rtm_family)
+{
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	int sock, seq = 0;
+	struct msghdr msg;
+	struct iovec iov;
+	int ret = 0;
+	int nll;
+
+	struct {
+		struct nlmsghdr nl;
+		struct rtmsg rt;
+	} req;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		/* Capture errno before stdio gets a chance to change it. */
+		ret = -errno;
+		fprintf(stderr, "open netlink socket: %s\n", strerror(-ret));
+		return ret;
+	}
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		ret = -errno;
+		fprintf(stderr, "bind netlink socket: %s\n", strerror(-ret));
+		goto cleanup;
+	}
+	memset(&req, 0, sizeof(req));
+	req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+	req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	req.nl.nlmsg_type = RTM_GETROUTE;
+
+	req.rt.rtm_family = rtm_family;
+	req.rt.rtm_table = RT_TABLE_MAIN;
+	req.nl.nlmsg_pid = 0;
+	req.nl.nlmsg_seq = ++seq;
+	memset(&msg, 0, sizeof(msg));
+	iov.iov_base = (void *)&req.nl;
+	iov.iov_len = req.nl.nlmsg_len;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	if (sendmsg(sock, &msg, 0) < 0) {
+		ret = -errno;
+		fprintf(stderr, "send to netlink: %s\n", strerror(-ret));
+		goto cleanup;
+	}
+	memset(buf, 0, sizeof(buf));
+	nll = recv_msg(sa, sock);
+	if (nll < 0) {
+		/* recv_msg() returns right after the failing recv(), so errno
+		 * still describes the failure.
+		 */
+		fprintf(stderr, "recv from netlink: %s\n", strerror(errno));
+		ret = nll;
+		goto cleanup;
+	}
+	nh = (struct nlmsghdr *)buf;
+	read_route(nh, nll);
+cleanup:
+	close(sock);
+	return ret;
+}
+
+/* Function to parse the arp entries returned by netlink.
+ * Updates the arp entry related map entries (arp_table and, where a
+ * matching entry exists, exact_match) for every message in the @nll bytes
+ * starting at @nh. Only AF_INET neighbours are applied.
+ *
+ * Map update failures abort the program via assert(), as before; the calls
+ * themselves are kept outside assert() so they survive -DNDEBUG builds.
+ */
+static void read_arp(struct nlmsghdr *nh, int nll)
+{
+	struct arp_table {
+		__be64 mac;
+		__be32 dst;
+	} arp_entry;
+	struct direct_map {
+		struct arp_table arp;
+		int ifindex;
+		__be64 mac;
+	} direct_entry;
+	struct rtattr *rt_attr;
+	struct ndmsg *rt_msg;
+	int rtl, ndm_family;
+	int mac_len, ret;
+
+	for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
+		rt_msg = (struct ndmsg *)NLMSG_DATA(nh);
+		rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
+		ndm_family = rt_msg->ndm_family;
+		rtl = RTM_PAYLOAD(nh);
+
+		/* Attributes missing from this message must read as 0. */
+		memset(&arp_entry, 0, sizeof(arp_entry));
+		for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
+			switch (rt_attr->rta_type) {
+			case NDA_DST:
+				arp_entry.dst = *((__be32 *)RTA_DATA(rt_attr));
+				break;
+			case NDA_LLADDR:
+				/* Copy only what the attribute carries (6 bytes
+				 * for Ethernet) instead of always reading 8.
+				 */
+				mac_len = RTA_PAYLOAD(rt_attr);
+				if (mac_len > (int)sizeof(arp_entry.mac))
+					mac_len = sizeof(arp_entry.mac);
+				arp_entry.mac = 0;
+				if (mac_len > 0)
+					memcpy(&arp_entry.mac,
+					       RTA_DATA(rt_attr), mac_len);
+				break;
+			default:
+				break;
+			}
+		}
+
+		if (ndm_family != AF_INET)
+			continue;
+
+		/* Keep the neighbour part of an exact-match route in sync. */
+		memset(&direct_entry, 0, sizeof(direct_entry));
+		if (bpf_map_lookup_elem(exact_match_map_fd, &arp_entry.dst,
+					&direct_entry) == 0) {
+			if (nh->nlmsg_type == RTM_DELNEIGH) {
+				direct_entry.arp.dst = 0;
+				direct_entry.arp.mac = 0;
+			} else if (nh->nlmsg_type == RTM_NEWNEIGH) {
+				direct_entry.arp.dst = arp_entry.dst;
+				direct_entry.arp.mac = arp_entry.mac;
+			}
+			ret = bpf_map_update_elem(exact_match_map_fd,
+						  &arp_entry.dst,
+						  &direct_entry, 0);
+			assert(ret == 0);
+		}
+
+		if (nh->nlmsg_type == RTM_DELNEIGH) {
+			ret = bpf_map_delete_elem(arp_table_map_fd,
+						  &arp_entry.dst);
+			assert(ret == 0);
+		} else if (nh->nlmsg_type == RTM_NEWNEIGH) {
+			ret = bpf_map_update_elem(arp_table_map_fd,
+						  &arp_entry.dst,
+						  &arp_entry.mac, 0);
+			assert(ret == 0);
+		}
+	}
+}
+
+/* Function to read the existing arp table when the process is launched.
+ * Sends an RTM_GETNEIGH dump request for @rtm_family on a fresh netlink
+ * socket and feeds the reply to read_arp().
+ *
+ * Returns 0 on success or a negative value on failure.
+ */
+static int get_arp_table(int rtm_family)
+{
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	int sock, seq = 0;
+	struct msghdr msg;
+	struct iovec iov;
+	int ret = 0;
+	int nll;
+	struct {
+		struct nlmsghdr nl;
+		struct ndmsg rt;
+	} req;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		/* Capture errno before stdio gets a chance to change it. */
+		ret = -errno;
+		fprintf(stderr, "open netlink socket: %s\n", strerror(-ret));
+		return ret;
+	}
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		ret = -errno;
+		fprintf(stderr, "bind netlink socket: %s\n", strerror(-ret));
+		goto cleanup;
+	}
+	memset(&req, 0, sizeof(req));
+	/* The request payload is a struct ndmsg, so size it as one. */
+	req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
+	req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	req.nl.nlmsg_type = RTM_GETNEIGH;
+	req.rt.ndm_state = NUD_REACHABLE;
+	req.rt.ndm_family = rtm_family;
+	req.nl.nlmsg_pid = 0;
+	req.nl.nlmsg_seq = ++seq;
+	memset(&msg, 0, sizeof(msg));
+	iov.iov_base = (void *)&req.nl;
+	iov.iov_len = req.nl.nlmsg_len;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	if (sendmsg(sock, &msg, 0) < 0) {
+		ret = -errno;
+		fprintf(stderr, "send to netlink: %s\n", strerror(-ret));
+		goto cleanup;
+	}
+	memset(buf, 0, sizeof(buf));
+	nll = recv_msg(sa, sock);
+	if (nll < 0) {
+		/* recv_msg() returns right after the failing recv(), so errno
+		 * still describes the failure.
+		 */
+		fprintf(stderr, "recv from netlink: %s\n", strerror(errno));
+		ret = nll;
+		goto cleanup;
+	}
+	nh = (struct nlmsghdr *)buf;
+	read_arp(nh, nll);
+cleanup:
+	close(sock);
+	return ret;
+}
+
+/* Function to keep track of and apply changes in the route and arp tables.
+ *
+ * Thread entry point (@arg is unused). Opens one non-blocking netlink
+ * socket for route notifications and one for neighbour notifications,
+ * loads the current arp and route tables, then polls both sockets once
+ * per "interval" seconds until routes_thread_exit is set or an error
+ * occurs. Always returns NULL.
+ */
+static void *monitor_routes_thread(void *arg)
+{
+	struct pollfd fds_route, fds_arp;
+	struct sockaddr_nl la, lr;
+	int sock, sock_arp, nll;
+	struct nlmsghdr *nh;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		fprintf(stderr, "open netlink socket: %s\n", strerror(errno));
+		return NULL;
+	}
+
+	fcntl(sock, F_SETFL, O_NONBLOCK);
+	memset(&lr, 0, sizeof(lr));
+	lr.nl_family = AF_NETLINK;
+	lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY;
+	if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) {
+		fprintf(stderr, "bind netlink socket: %s\n", strerror(errno));
+		close(sock);
+		return NULL;
+	}
+
+	/* POLLIN is the poll() event flag; POLL_IN is a siginfo code that
+	 * merely happens to share its value on Linux.
+	 */
+	fds_route.fd = sock;
+	fds_route.events = POLLIN;
+
+	sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock_arp < 0) {
+		fprintf(stderr, "open netlink socket: %s\n", strerror(errno));
+		close(sock);
+		return NULL;
+	}
+
+	fcntl(sock_arp, F_SETFL, O_NONBLOCK);
+	memset(&la, 0, sizeof(la));
+	la.nl_family = AF_NETLINK;
+	la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY;
+	if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) {
+		fprintf(stderr, "bind netlink socket: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	fds_arp.fd = sock_arp;
+	fds_arp.events = POLLIN;
+
+	/* dump route and arp tables */
+	if (get_arp_table(AF_INET) < 0) {
+		fprintf(stderr, "Failed reading arp table\n");
+		goto cleanup;
+	}
+
+	if (get_route_table(AF_INET) < 0) {
+		fprintf(stderr, "Failed reading route table\n");
+		goto cleanup;
+	}
+
+	while (!routes_thread_exit) {
+		memset(buf, 0, sizeof(buf));
+		/* poll() returns the number of ready descriptors (0 or 1). */
+		if (poll(&fds_route, 1, 3) > 0) {
+			nll = recv_msg(lr, sock);
+			if (nll < 0) {
+				fprintf(stderr, "recv from netlink: %s\n",
+					strerror(errno));
+				goto cleanup;
+			}
+
+			nh = (struct nlmsghdr *)buf;
+			read_route(nh, nll);
+		}
+
+		memset(buf, 0, sizeof(buf));
+		if (poll(&fds_arp, 1, 3) > 0) {
+			nll = recv_msg(la, sock_arp);
+			if (nll < 0) {
+				fprintf(stderr, "recv from netlink: %s\n",
+					strerror(errno));
+				goto cleanup;
+			}
+
+			nh = (struct nlmsghdr *)buf;
+			read_arp(nh, nll);
+		}
+
+		sleep(interval);
+	}
+
+cleanup:
+	close(sock_arp);
+	close(sock);
+	return NULL;
+}
+
+static void usage(char *argv[], const struct option *long_options,
+ const char *doc, int mask, bool error,
+ struct bpf_object *obj) /* @obj is accepted but not used */
+{
+ sample_usage(argv, long_options, doc, mask, error); /* thin wrapper: forwards everything except @obj */
+}
+
+/* Load the XDP router, attach it to every interface named on the command
+ * line, start the netlink monitor thread and run the statistics loop.
+ *
+ * Options may appear anywhere on the command line; whatever getopt_long()
+ * leaves at argv[optind..] is treated as the interface list.
+ * The process exit status is handed to sample_exit().
+ */
+int main(int argc, char **argv)
+{
+	bool error = true, generic = false, force = false;
+	int opt, ret = EXIT_FAIL_BPF;
+	struct xdp_router_ipv4 *skel;
+	int i, total_ifindex;
+	char **ifname_list;
+	pthread_t routes_thread;
+	int longindex = 0;
+
+	if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) {
+		fprintf(stderr, "Failed to set libbpf strict mode: %s\n",
+			strerror(errno));
+		goto end;
+	}
+
+	skel = xdp_router_ipv4__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to xdp_router_ipv4__open: %s\n",
+			strerror(errno));
+		goto end;
+	}
+
+	ret = sample_init_pre_load(skel);
+	if (ret < 0) {
+		fprintf(stderr, "Failed to sample_init_pre_load: %s\n",
+			strerror(-ret));
+		ret = EXIT_FAIL_BPF;
+		goto end_destroy;
+	}
+
+	ret = xdp_router_ipv4__load(skel);
+	if (ret < 0) {
+		fprintf(stderr, "Failed to xdp_router_ipv4__load: %s\n",
+			strerror(errno));
+		/* Report a defined exit code rather than a raw negative errno. */
+		ret = EXIT_FAIL_BPF;
+		goto end_destroy;
+	}
+
+	ret = sample_init(skel, mask);
+	if (ret < 0) {
+		fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret));
+		ret = EXIT_FAIL;
+		goto end_destroy;
+	}
+
+	while ((opt = getopt_long(argc, argv, "si:SFvh",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 's':
+			mask |= SAMPLE_REDIRECT_MAP_CNT;
+			break;
+		case 'i':
+			interval = strtoul(optarg, NULL, 0);
+			break;
+		case 'S':
+			generic = true;
+			break;
+		case 'F':
+			force = true;
+			break;
+		case 'v':
+			sample_switch_mode();
+			break;
+		case 'h':
+			error = false;
+			/* fallthrough */
+		default:
+			usage(argv, long_options, __doc__, mask, error, skel->obj);
+			/* Asking for help succeeds; a bad option does not. */
+			ret = error ? EXIT_FAIL_OPTION : EXIT_OK;
+			goto end_destroy;
+		}
+	}
+
+	ret = EXIT_FAIL_OPTION;
+	if (optind == argc) {
+		usage(argv, long_options, __doc__, mask, true, skel->obj);
+		goto end_destroy;
+	}
+
+	/* getopt_long() has moved all non-option arguments to the end of
+	 * argv, so the interface list is simply what follows optind. This
+	 * holds for "-i5", "--interval=5" and options after the names too.
+	 */
+	ifname_list = argv + optind;
+	total_ifindex = argc - optind;
+
+	lpm_map_fd = bpf_map__fd(skel->maps.lpm_map);
+	if (lpm_map_fd < 0) {
+		fprintf(stderr, "Failed loading lpm_map %s\n",
+			strerror(-lpm_map_fd));
+		goto end_destroy;
+	}
+	arp_table_map_fd = bpf_map__fd(skel->maps.arp_table);
+	if (arp_table_map_fd < 0) {
+		fprintf(stderr, "Failed loading arp_table_map_fd %s\n",
+			strerror(-arp_table_map_fd));
+		goto end_destroy;
+	}
+	exact_match_map_fd = bpf_map__fd(skel->maps.exact_match);
+	if (exact_match_map_fd < 0) {
+		fprintf(stderr, "Failed loading exact_match_map_fd %s\n",
+			strerror(-exact_match_map_fd));
+		goto end_destroy;
+	}
+	tx_port_map_fd = bpf_map__fd(skel->maps.tx_port);
+	if (tx_port_map_fd < 0) {
+		fprintf(stderr, "Failed loading tx_port_map_fd %s\n",
+			strerror(-tx_port_map_fd));
+		goto end_destroy;
+	}
+
+	ret = EXIT_FAIL_XDP;
+	for (i = 0; i < total_ifindex; i++) {
+		int index = if_nametoindex(ifname_list[i]);
+
+		if (!index) {
+			/* if_nametoindex() reports the reason through errno */
+			fprintf(stderr, "Interface %s not found %s\n",
+				ifname_list[i], strerror(errno));
+			goto end_destroy;
+		}
+		if (sample_install_xdp(skel->progs.xdp_router_ipv4_prog,
+				       index, generic, force) < 0)
+			goto end_destroy;
+	}
+
+	ret = pthread_create(&routes_thread, NULL, monitor_routes_thread, NULL);
+	if (ret) {
+		/* pthread_create() returns a positive error number */
+		fprintf(stderr, "Failed creating routes_thread: %s\n", strerror(ret));
+		ret = EXIT_FAIL;
+		goto end_destroy;
+	}
+
+	ret = sample_run(interval, NULL, NULL);
+	routes_thread_exit = true;
+
+	if (ret < 0) {
+		fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret));
+		ret = EXIT_FAIL;
+		goto end_thread_wait;
+	}
+	ret = EXIT_OK;
+
+end_thread_wait:
+	pthread_join(routes_thread, NULL);
+end_destroy:
+	xdp_router_ipv4__destroy(skel);
+end:
+	sample_exit(ret);
+}
diff --git a/samples/bpf/xdp_sample.bpf.c b/samples/bpf/xdp_sample.bpf.c
new file mode 100644
index 000000000000..0eb7e1dcae22
--- /dev/null
+++ b/samples/bpf/xdp_sample.bpf.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0
+/* GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */
+#include "xdp_sample.bpf.h"
+
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+
+/* Counter maps of type array_map. The tracepoint programs below index them
+ * either by CPU id or by key * nr_cpus + cpu. rx_cnt is only declared here;
+ * none of the tracepoint programs in this file update it.
+ */
+array_map rx_cnt SEC(".maps");
+array_map redir_err_cnt SEC(".maps");
+array_map cpumap_enqueue_cnt SEC(".maps");
+array_map cpumap_kthread_cnt SEC(".maps");
+array_map exception_cnt SEC(".maps");
+array_map devmap_xmit_cnt SEC(".maps");
+
+/* Per-CPU hash keyed by the u64 that tp_xdp_devmap_xmit_multi builds as
+ * (from ifindex << 32) | to ifindex.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(max_entries, 32 * 32);
+ __type(key, u64);
+ __type(value, struct datarec);
+} devmap_xmit_cnt_multi SEC(".maps");
+
+/* Row stride used for the key * nr_cpus + cpu indexing below.
+ * NOTE(review): presumably set by the loader before load -- confirm.
+ */
+const volatile int nr_cpus = 0;
+
+/* These can be set before loading so that redundant comparisons can be DCE'd by
+ * the verifier, and only actual matches are tried after loading tp_btf program.
+ * This allows sample to filter tracepoint stats based on net_device.
+ */
+const volatile int from_match[32] = {};
+const volatile int to_match[32] = {};
+
+/* When non-zero, the cpumap tracepoints only count events for this map id */
+int cpumap_map_id = 0;
+
+/* Find if b is part of set a, but if a is empty set then evaluate to true.
+ * A zero entry ends the set: the scan stops at the first 0 in a (or at
+ * ARRAY_SIZE(a)). Both a and b are evaluated more than once, so pass
+ * side-effect-free expressions.
+ */
+#define IN_SET(a, b) \
+ ({ \
+ bool __res = !(a)[0]; \
+ for (int i = 0; i < ARRAY_SIZE(a) && (a)[i]; i++) { \
+ __res = (a)[i] == (b); \
+ if (__res) \
+ break; \
+ } \
+ __res; \
+ })
+
+/* Translate a redirect error code into the key used for redir_err_cnt:
+ * 0 for success, 2..6 for the recognised negative errnos, and 1 for every
+ * other value.
+ */
+static __always_inline __u32 xdp_get_err_key(int err)
+{
+	if (!err)
+		return 0;
+	if (err == -EINVAL)
+		return 2;
+	if (err == -ENETDOWN)
+		return 3;
+	if (err == -EMSGSIZE)
+		return 4;
+	if (err == -EOPNOTSUPP)
+		return 5;
+	if (err == -ENOSPC)
+		return 6;
+	/* Anything unrecognised is accounted under key 1 */
+	return 1;
+}
+
+/* Account one redirect event in redir_err_cnt.
+ *
+ * @from: ifindex of the device the event came from; events are ignored
+ *        unless it passes the from_match filter.
+ * @err:  redirect result, mapped to a key by xdp_get_err_key().
+ *
+ * Key 0 bumps ->processed, every other key bumps ->dropped. Always returns 0.
+ */
+static __always_inline int xdp_redirect_collect_stat(int from, int err)
+{
+	struct datarec *stats;
+	u32 err_key, slot;
+
+	if (!IN_SET(from_match, from))
+		return 0;
+
+	err_key = xdp_get_err_key(err);
+
+	/* One row of nr_cpus entries per error key */
+	slot = err_key * nr_cpus + bpf_get_smp_processor_id();
+	stats = bpf_map_lookup_elem(&redir_err_cnt, &slot);
+	if (!stats)
+		return 0;
+
+	if (err_key)
+		NO_TEAR_INC(stats->dropped);
+	else
+		NO_TEAR_INC(stats->processed);
+
+	/*
+	 * Returning 0 indicates the event was filtered (no further
+	 * processing). Returning 1 here would allow e.g. a perf-record
+	 * tracepoint to see and record these events, but it doesn't work
+	 * well in practice as stopping perf-record also unloads this
+	 * bpf_prog. Plus, there is additional overhead of doing so.
+	 */
+	return 0;
+}
+
+/* The four redirect tracepoint programs below take the same arguments and
+ * all forward dev->ifindex and err to xdp_redirect_collect_stat(); the other
+ * arguments are unused.
+ */
+SEC("tp_btf/xdp_redirect_err")
+int BPF_PROG(tp_xdp_redirect_err, const struct net_device *dev,
+ const struct bpf_prog *xdp, const void *tgt, int err,
+ const struct bpf_map *map, u32 index)
+{
+ return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+/* Same accounting for the xdp_redirect_map_err tracepoint */
+SEC("tp_btf/xdp_redirect_map_err")
+int BPF_PROG(tp_xdp_redirect_map_err, const struct net_device *dev,
+ const struct bpf_prog *xdp, const void *tgt, int err,
+ const struct bpf_map *map, u32 index)
+{
+ return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+/* Same accounting for the xdp_redirect tracepoint */
+SEC("tp_btf/xdp_redirect")
+int BPF_PROG(tp_xdp_redirect, const struct net_device *dev,
+ const struct bpf_prog *xdp, const void *tgt, int err,
+ const struct bpf_map *map, u32 index)
+{
+ return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+/* Same accounting for the xdp_redirect_map tracepoint */
+SEC("tp_btf/xdp_redirect_map")
+int BPF_PROG(tp_xdp_redirect_map, const struct net_device *dev,
+ const struct bpf_prog *xdp, const void *tgt, int err,
+ const struct bpf_map *map, u32 index)
+{
+ return xdp_redirect_collect_stat(dev->ifindex, err);
+}
+
+/* Count cpumap enqueue events in cpumap_enqueue_cnt, in the slot
+ * to_cpu * nr_cpus + current CPU. Events for other maps are ignored when
+ * cpumap_map_id is non-zero. Always returns 0.
+ */
+SEC("tp_btf/xdp_cpumap_enqueue")
+int BPF_PROG(tp_xdp_cpumap_enqueue, int map_id, unsigned int processed,
+	     unsigned int drops, int to_cpu)
+{
+	struct datarec *stats;
+	u32 slot;
+
+	if (cpumap_map_id && map_id != cpumap_map_id)
+		return 0;
+
+	slot = to_cpu * nr_cpus + bpf_get_smp_processor_id();
+	stats = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &slot);
+	if (!stats)
+		return 0;
+
+	NO_TEAR_ADD(stats->processed, processed);
+	NO_TEAR_ADD(stats->dropped, drops);
+
+	/* Record bulk events, then userspace can calc average bulk size */
+	if (processed > 0)
+		NO_TEAR_INC(stats->issue);
+
+	/* Inception: It's possible to detect overload situations, via
+	 * this tracepoint. This can be used for creating a feedback
+	 * loop to XDP, which can take appropriate actions to mitigate
+	 * this overload situation.
+	 */
+	return 0;
+}
+
+/* Count cpumap kthread events in cpumap_kthread_cnt, indexed by the current
+ * CPU. Adds the processed/drops arguments and the pass/drop/redirect fields
+ * of xdp_stats. Events for other maps are ignored when cpumap_map_id is
+ * non-zero. Always returns 0.
+ */
+SEC("tp_btf/xdp_cpumap_kthread")
+int BPF_PROG(tp_xdp_cpumap_kthread, int map_id, unsigned int processed,
+	     unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats)
+{
+	struct datarec *stats;
+	u32 this_cpu;
+
+	if (cpumap_map_id && map_id != cpumap_map_id)
+		return 0;
+
+	this_cpu = bpf_get_smp_processor_id();
+	stats = bpf_map_lookup_elem(&cpumap_kthread_cnt, &this_cpu);
+	if (!stats)
+		return 0;
+
+	NO_TEAR_ADD(stats->processed, processed);
+	NO_TEAR_ADD(stats->dropped, drops);
+	NO_TEAR_ADD(stats->xdp_pass, xdp_stats->pass);
+	NO_TEAR_ADD(stats->xdp_drop, xdp_stats->drop);
+	NO_TEAR_ADD(stats->xdp_redirect, xdp_stats->redirect);
+
+	/* Count times kthread yielded CPU via schedule call */
+	if (sched)
+		NO_TEAR_INC(stats->issue);
+
+	return 0;
+}
+
+/* Count xdp_exception events in exception_cnt, in the slot
+ * action * nr_cpus + current CPU. Actions above XDP_REDIRECT share the slot
+ * XDP_REDIRECT + 1. The device must pass both the from_match and the
+ * to_match filter. Always returns 0.
+ */
+SEC("tp_btf/xdp_exception")
+int BPF_PROG(tp_xdp_exception, const struct net_device *dev,
+	     const struct bpf_prog *xdp, u32 act)
+{
+	u32 this_cpu = bpf_get_smp_processor_id();
+	u32 action = act;
+	struct datarec *stats;
+	u32 slot;
+
+	if (!IN_SET(from_match, dev->ifindex))
+		return 0;
+	if (!IN_SET(to_match, dev->ifindex))
+		return 0;
+
+	/* Fold every unknown action into one extra row */
+	if (action > XDP_REDIRECT)
+		action = XDP_REDIRECT + 1;
+
+	slot = action * nr_cpus + this_cpu;
+	stats = bpf_map_lookup_elem(&exception_cnt, &slot);
+	if (stats)
+		NO_TEAR_INC(stats->dropped);
+
+	return 0;
+}
+
+/* Count devmap transmit events in devmap_xmit_cnt, indexed by the current
+ * CPU, for device pairs that pass the from_match/to_match filters.
+ * Always returns 0.
+ */
+SEC("tp_btf/xdp_devmap_xmit")
+int BPF_PROG(tp_xdp_devmap_xmit, const struct net_device *from_dev,
+	     const struct net_device *to_dev, int sent, int drops, int err)
+{
+	int src_ifindex = from_dev->ifindex;
+	int dst_ifindex = to_dev->ifindex;
+	struct datarec *stats;
+	u32 this_cpu;
+
+	if (!IN_SET(from_match, src_ifindex))
+		return 0;
+	if (!IN_SET(to_match, dst_ifindex))
+		return 0;
+
+	this_cpu = bpf_get_smp_processor_id();
+	stats = bpf_map_lookup_elem(&devmap_xmit_cnt, &this_cpu);
+	if (!stats)
+		return 0;
+
+	NO_TEAR_ADD(stats->processed, sent);
+	NO_TEAR_ADD(stats->dropped, drops);
+	/* Record bulk events, then userspace can calc average bulk size */
+	NO_TEAR_INC(stats->info);
+	/* Record error cases, where no frames were sent */
+	/* Catch API error of drv ndo_xdp_xmit sent more than count */
+	if (err || drops < 0)
+		NO_TEAR_INC(stats->issue);
+
+	return 0;
+}
+
+/* Per device-pair variant of the devmap transmit accounting. The hash key is
+ * (from ifindex << 32) | to ifindex; an all-zero record is inserted with
+ * BPF_NOEXIST before the lookup so that the first event for a pair finds an
+ * entry. The result of that insert is deliberately not checked: the lookup
+ * that follows decides whether there is a record to update.
+ * Always returns 0.
+ */
+SEC("tp_btf/xdp_devmap_xmit")
+int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device *from_dev,
+	     const struct net_device *to_dev, int sent, int drops, int err)
+{
+	int src_ifindex = from_dev->ifindex;
+	int dst_ifindex = to_dev->ifindex;
+	struct datarec zeroed = {};
+	struct datarec *stats;
+	u64 pair;
+
+	pair = src_ifindex;
+	pair = pair << 32 | dst_ifindex;
+
+	if (!IN_SET(from_match, src_ifindex))
+		return 0;
+	if (!IN_SET(to_match, dst_ifindex))
+		return 0;
+
+	bpf_map_update_elem(&devmap_xmit_cnt_multi, &pair, &zeroed, BPF_NOEXIST);
+	stats = bpf_map_lookup_elem(&devmap_xmit_cnt_multi, &pair);
+	if (!stats)
+		return 0;
+
+	NO_TEAR_ADD(stats->processed, sent);
+	NO_TEAR_ADD(stats->dropped, drops);
+	NO_TEAR_INC(stats->info);
+	if (err || drops < 0)
+		NO_TEAR_INC(stats->issue);
+
+	return 0;
+}
diff --git a/samples/bpf/xdp_sample.bpf.h b/samples/bpf/xdp_sample.bpf.h
new file mode 100644
index 000000000000..fecc41c5df04
--- /dev/null
+++ b/samples/bpf/xdp_sample.bpf.h
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _XDP_SAMPLE_BPF_H
+#define _XDP_SAMPLE_BPF_H
+
+#include "vmlinux.h"
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+
+#include "net_shared.h"
+#include "xdp_sample_shared.h"
+
+/* errno numbers needed by the BPF programs, defined locally.
+ * NOTE(review): presumably because vmlinux.h provides no macros -- confirm.
+ */
+#define EINVAL 22
+#define ENETDOWN 100
+#define EMSGSIZE 90
+#define EOPNOTSUPP 95
+#define ENOSPC 28
+
+/* Map type shared by the counter maps: an mmap-able array with an unsigned
+ * int key and struct datarec values. max_entries is not set here.
+ */
+typedef struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __type(key, unsigned int);
+ __type(value, struct datarec);
+} array_map;
+
+/* Defined in xdp_sample.bpf.c */
+extern array_map rx_cnt;
+extern const volatile int nr_cpus;
+
+/* Keys for redirect success and for the generic redirect error */
+enum {
+ XDP_REDIRECT_SUCCESS = 0,
+ XDP_REDIRECT_ERROR = 1
+};
+
+/* Exchange the first three 16-bit words at @data with the following three.
+ * The name says this swaps the source and destination MAC addresses;
+ * @data is then presumably the start of an Ethernet header -- confirm at the
+ * call sites.
+ */
+static __always_inline void swap_src_dst_mac(void *data)
+{
+	unsigned short *words = data;
+	unsigned short saved[3] = { words[0], words[1], words[2] };
+
+	words[0] = words[3];
+	words[1] = words[4];
+	words[2] = words[5];
+	words[3] = saved[0];
+	words[4] = saved[1];
+	words[5] = saved[2];
+}
+
+/*
+ * Note: including linux/compiler.h or linux/kernel.h for the macros below
+ * conflicts with vmlinux.h include in BPF files, so we define them here.
+ *
+ * Following functions are taken from kernel sources and
+ * break aliasing rules in their original form.
+ *
+ * While kernel is compiled with -fno-strict-aliasing,
+ * perf uses -Wstrict-aliasing=3 which makes build fail
+ * under gcc 4.4.
+ *
+ * Using extra __may_alias__ type to allow aliasing
+ * in this case.
+ */
+/* Integer types marked __may_alias__ for the casts in the helpers below */
+typedef __u8 __attribute__((__may_alias__)) __u8_alias_t;
+typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
+typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
+typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
+
+/* Copy @size bytes from @p to @res. Sizes 1, 2, 4 and 8 are done as a single
+ * volatile load of that width; any other size falls back to
+ * __builtin_memcpy() bracketed by compiler barriers.
+ */
+static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+{
+ switch (size) {
+ case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break;
+ case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
+ case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
+ case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
+ default:
+ asm volatile ("" : : : "memory");
+ __builtin_memcpy((void *)res, (const void *)p, size);
+ asm volatile ("" : : : "memory");
+ }
+}
+
+/* Copy @size bytes from @res to @p. Sizes 1, 2, 4 and 8 are done as a single
+ * volatile store of that width; any other size falls back to
+ * __builtin_memcpy() bracketed by compiler barriers.
+ */
+static __always_inline void __write_once_size(volatile void *p, void *res, int size)
+{
+ switch (size) {
+ case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break;
+ case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
+ case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
+ case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
+ default:
+ asm volatile ("" : : : "memory");
+ __builtin_memcpy((void *)p, (const void *)res, size);
+ asm volatile ("" : : : "memory");
+ }
+}
+
+/* Read x through __read_once_size() and yield the value read */
+#define READ_ONCE(x) \
+({ \
+ union { typeof(x) __val; char __c[1]; } __u = \
+ { .__c = { 0 } }; \
+ __read_once_size(&(x), __u.__c, sizeof(x)); \
+ __u.__val; \
+})
+
+/* Store val into x through __write_once_size() and yield the stored value */
+#define WRITE_ONCE(x, val) \
+({ \
+ union { typeof(x) __val; char __c[1]; } __u = \
+ { .__val = (val) }; \
+ __write_once_size(&(x), __u.__c, sizeof(x)); \
+ __u.__val; \
+})
+
+/* Add a value using relaxed read and relaxed write. Less expensive than
+ * fetch_add when there is no write concurrency. The read and the write are
+ * two separate accesses, so this is not an atomic read-modify-write.
+ */
+#define NO_TEAR_ADD(x, val) WRITE_ONCE((x), READ_ONCE(x) + (val))
+#define NO_TEAR_INC(x) NO_TEAR_ADD((x), 1)
+
+/* Element count of a real array (not valid for a pointer) */
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#endif
diff --git a/samples/bpf/xdp_sample_shared.h b/samples/bpf/xdp_sample_shared.h
new file mode 100644
index 000000000000..8a7669a5d563
--- /dev/null
+++ b/samples/bpf/xdp_sample_shared.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef _XDP_SAMPLE_SHARED_H
+#define _XDP_SAMPLE_SHARED_H
+
+/* One counter record, used as the value type of the sample's stats maps and
+ * for the per-CPU copies kept in userspace. Which fields are meaningful
+ * depends on the tracepoint that fills the record; xdp_pass and info share
+ * storage.
+ * NOTE(review): the 64-byte alignment is presumably to keep each record on
+ * its own cache line -- confirm.
+ */
+struct datarec {
+ size_t processed;
+ size_t dropped;
+ size_t issue;
+ union {
+ size_t xdp_pass;
+ size_t info;
+ };
+ size_t xdp_drop;
+ size_t xdp_redirect;
+} __attribute__((aligned(64)));
+
+#endif
diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c
new file mode 100644
index 000000000000..158682852162
--- /dev/null
+++ b/samples/bpf/xdp_sample_user.c
@@ -0,0 +1,1673 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <linux/ethtool.h>
+#include <linux/hashtable.h>
+#include <linux/if_link.h>
+#include <linux/jhash.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/sockios.h>
+#include <locale.h>
+#include <math.h>
+#include <net/if.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/signalfd.h>
+#include <sys/sysinfo.h>
+#include <sys/timerfd.h>
+#include <sys/utsname.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "bpf_util.h"
+#include "xdp_sample_user.h"
+
+/* printf() the message only when cond is true */
+#define __sample_print(fmt, cond, ...) \
+ ({ \
+ if (cond) \
+ printf(fmt, ##__VA_ARGS__); \
+ })
+
+/* print_always: unconditional. print_default: only when LL_DEFAULT is set in
+ * sample_log_level.
+ */
+#define print_always(fmt, ...) __sample_print(fmt, 1, ##__VA_ARGS__)
+#define print_default(fmt, ...) \
+ __sample_print(fmt, sample_log_level & LL_DEFAULT, ##__VA_ARGS__)
+/* Print when err > 0 or LL_DEFAULT is set, and latch sample_err_exp to true
+ * once any call has seen err > 0. err is expanded more than once and without
+ * parentheses, so pass a plain variable.
+ */
+#define __print_err(err, fmt, ...) \
+ ({ \
+ __sample_print(fmt, err > 0 || sample_log_level & LL_DEFAULT, \
+ ##__VA_ARGS__); \
+ sample_err_exp = sample_err_exp ? true : err > 0; \
+ })
+#define print_err(err, fmt, ...) __print_err(err, fmt, ##__VA_ARGS__)
+
+/* Format for one output column: a width-10 number using the ' grouping flag,
+ * followed by a 13-wide left-justified label.
+ */
+#define __COLUMN(x) "%'10" x " %-13s"
+#define FMT_COLUMNf __COLUMN(".0f")
+#define FMT_COLUMNd __COLUMN("d")
+#define FMT_COLUMNl __COLUMN("llu")
+/* Each helper expands to the "value, label" argument pair for one column */
+#define RX(rx) rx, "rx/s"
+#define PPS(pps) pps, "pkt/s"
+#define DROP(drop) drop, "drop/s"
+#define ERR(err) err, "error/s"
+#define HITS(hits) hits, "hit/s"
+#define XMIT(xmit) xmit, "xmit/s"
+#define PASS(pass) pass, "pass/s"
+#define REDIR(redir) redir, "redir/s"
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+
+/* Extra action slot after XDP_REDIRECT, and the resulting table sizes */
+#define XDP_UNKNOWN (XDP_REDIRECT + 1)
+#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
+#define XDP_REDIRECT_ERR_MAX 7
+
+/* One id per stats map; NUM_MAP sizes the sample_mmap, sample_map and
+ * sample_map_count arrays.
+ */
+enum map_type {
+ MAP_RX,
+ MAP_REDIRECT_ERR,
+ MAP_CPUMAP_ENQUEUE,
+ MAP_CPUMAP_KTHREAD,
+ MAP_EXCEPTION,
+ MAP_DEVMAP_XMIT,
+ MAP_DEVMAP_XMIT_MULTI,
+ NUM_MAP,
+};
+
+/* Bit flags stored in sample_log_level */
+enum log_level {
+ LL_DEFAULT = 1U << 0,
+ LL_SIMPLE = 1U << 1,
+ LL_DEBUG = 1U << 2,
+};
+
+/* Snapshot of one counter: time taken, sum over CPUs, and per-CPU values */
+struct record {
+ __u64 timestamp;
+ struct datarec total;
+ struct datarec *cpu;
+};
+
+/* Hash table entry for one devmap device pair, keyed by pair */
+struct map_entry {
+ struct hlist_node node;
+ __u64 pair;
+ struct record val;
+};
+
+/* All counters of one sampling round. enq[] is a flexible array member;
+ * alloc_stats_record() sizes it with sample_n_cpus entries.
+ */
+struct stats_record {
+ struct record rx_cnt;
+ struct record redir_err[XDP_REDIRECT_ERR_MAX];
+ struct record kthread;
+ struct record exception[XDP_ACTION_MAX];
+ struct record devmap_xmit;
+ DECLARE_HASHTABLE(xmit_map, 5);
+ struct record enq[];
+};
+
+/* Aggregated figures produced by the stats_get_*() helpers: running totals
+ * plus one sub-struct per counter group.
+ */
+struct sample_output {
+ struct {
+ __u64 rx;
+ __u64 redir;
+ __u64 drop;
+ __u64 drop_xmit;
+ __u64 err;
+ __u64 xmit;
+ } totals;
+ struct {
+ /* pps and num share storage */
+ union {
+ __u64 pps;
+ __u64 num;
+ };
+ __u64 drop;
+ __u64 err;
+ } rx_cnt;
+ struct {
+ __u64 suc;
+ __u64 err;
+ } redir_cnt;
+ struct {
+ __u64 hits;
+ } except_cnt;
+ struct {
+ __u64 pps;
+ __u64 drop;
+ __u64 err;
+ double bavg;
+ } xmit_cnt;
+};
+
+/* Table with room for 32 XDP program descriptions.
+ * NOTE(review): presumably filled when programs are attached, with
+ * sample_xdp_cnt counting the used entries -- confirm in the attach code.
+ */
+struct xdp_desc {
+ int ifindex;
+ __u32 prog_id;
+ int flags;
+} sample_xdp_progs[32];
+
+/* File-scope state of the sample helpers; the first three arrays hold one
+ * slot per enum map_type value.
+ */
+struct datarec *sample_mmap[NUM_MAP];
+struct bpf_map *sample_map[NUM_MAP];
+size_t sample_map_count[NUM_MAP];
+enum log_level sample_log_level;
+struct sample_output sample_out;
+unsigned long sample_interval;
+bool sample_err_exp;
+int sample_xdp_cnt;
+int sample_n_cpus;
+int sample_sig_fd;
+int sample_mask;
+
+/* Display names for the redirect result keys 0..XDP_REDIRECT_ERR_MAX - 1 */
+static const char *xdp_redirect_err_names[XDP_REDIRECT_ERR_MAX] = {
+ /* Key=1 keeps unknown errors */
+ "Success",
+ "Unknown",
+ "EINVAL",
+ "ENETDOWN",
+ "EMSGSIZE",
+ "EOPNOTSUPP",
+ "ENOSPC",
+};
+
+/* Keyed from Unknown: entry i describes xdp_redirect_err_names[i + 1] */
+static const char *xdp_redirect_err_help[XDP_REDIRECT_ERR_MAX - 1] = {
+ "Unknown error",
+ "Invalid redirection",
+ "Device being redirected to is down",
+ "Packet length too large for device",
+ "Operation not supported",
+ "No space in ptr_ring of cpumap kthread",
+};
+
+/* Display names indexed by XDP action, with XDP_UNKNOWN as the last slot */
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+ [XDP_ABORTED] = "XDP_ABORTED",
+ [XDP_DROP] = "XDP_DROP",
+ [XDP_PASS] = "XDP_PASS",
+ [XDP_TX] = "XDP_TX",
+ [XDP_REDIRECT] = "XDP_REDIRECT",
+ [XDP_UNKNOWN] = "XDP_UNKNOWN",
+};
+
+/* Return the CLOCK_MONOTONIC time in nanoseconds.
+ *
+ * On failure a message naming the failing call and its errno text is written
+ * to stderr and UINT64_MAX is returned.
+ */
+static __u64 gettime(void)
+{
+	struct timespec t;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &t) < 0) {
+		/* clock_gettime() returns -1; the reason is in errno */
+		fprintf(stderr, "Error with clock_gettime! (%s)\n",
+			strerror(errno));
+		return UINT64_MAX;
+	}
+	return (__u64)t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Return the display name of an XDP action, or NULL when @action is outside
+ * 0..XDP_ACTION_MAX - 1. Negative values are rejected as well, so they can
+ * never index before the start of xdp_action_names.
+ */
+static const char *action2str(int action)
+{
+	if (action >= 0 && action < XDP_ACTION_MAX)
+		return xdp_action_names[action];
+	return NULL;
+}
+
+/* Print the description of the output format to stdout.
+ *
+ * The general part is always printed; the description of each counter group
+ * is printed only when its SAMPLE_* bit is set in @mask.
+ */
+static void sample_print_help(int mask)
+{
+ printf("Output format description\n\n"
+ "By default, redirect success statistics are disabled, use -s to enable.\n"
+ "The terse output mode is default, verbose mode can be activated using -v\n"
+ "Use SIGQUIT (Ctrl + \\) to switch the mode dynamically at runtime\n\n"
+ "Terse mode displays at most the following fields:\n"
+ " rx/s Number of packets received per second\n"
+ " redir/s Number of packets successfully redirected per second\n"
+ " err,drop/s Aggregated count of errors per second (including dropped packets)\n"
+ " xmit/s Number of packets transmitted on the output device per second\n\n"
+ "Output description for verbose mode:\n"
+ " FIELD DESCRIPTION\n");
+
+ if (mask & SAMPLE_RX_CNT) {
+ printf(" receive\t\tDisplays the number of packets received & errors encountered\n"
+ " \t\t\tWhenever an error or packet drop occurs, details of per CPU error\n"
+ " \t\t\tand drop statistics will be expanded inline in terse mode.\n"
+ " \t\t\t\tpkt/s - Packets received per second\n"
+ " \t\t\t\tdrop/s - Packets dropped per second\n"
+ " \t\t\t\terror/s - Errors encountered per second\n\n");
+ }
+ if (mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) {
+ printf(" redirect\t\tDisplays the number of packets successfully redirected\n"
+ " \t\t\tErrors encountered are expanded under redirect_err field\n"
+ " \t\t\tNote that passing -s to enable it has a per packet overhead\n"
+ " \t\t\t\tredir/s - Packets redirected successfully per second\n\n"
+ " redirect_err\t\tDisplays the number of packets that failed redirection\n"
+ " \t\t\tThe errno is expanded under this field with per CPU count\n"
+ " \t\t\tThe recognized errors are:\n");
+
+ /* Start at 2: keys 0 ("Success") and 1 ("Unknown") are not listed */
+ for (int i = 2; i < XDP_REDIRECT_ERR_MAX; i++)
+ printf("\t\t\t %s: %s\n", xdp_redirect_err_names[i],
+ xdp_redirect_err_help[i - 1]);
+
+ printf(" \n\t\t\t\terror/s - Packets that failed redirection per second\n\n");
+ }
+
+ if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) {
+ printf(" enqueue to cpu N\tDisplays the number of packets enqueued to bulk queue of CPU N\n"
+ " \t\t\tExpands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N\n"
+ " \t\t\tReceived packets can be associated with the CPU redirect program is enqueuing \n"
+ " \t\t\tpackets to.\n"
+ " \t\t\t\tpkt/s - Packets enqueued per second from other CPU to CPU N\n"
+ " \t\t\t\tdrop/s - Packets dropped when trying to enqueue to CPU N\n"
+ " \t\t\t\tbulk-avg - Average number of packets processed for each event\n\n");
+ }
+
+ if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) {
+ printf(" kthread\t\tDisplays the number of packets processed in CPUMAP kthread for each CPU\n"
+ " \t\t\tPackets consumed from ptr_ring in kthread, and its xdp_stats (after calling \n"
+ " \t\t\tCPUMAP bpf prog) are expanded below this. xdp_stats are expanded as a total and\n"
+ " \t\t\tthen per-CPU to associate it to each CPU's pinned CPUMAP kthread.\n"
+ " \t\t\t\tpkt/s - Packets consumed per second from ptr_ring\n"
+ " \t\t\t\tdrop/s - Packets dropped per second in kthread\n"
+ " \t\t\t\tsched - Number of times kthread called schedule()\n\n"
+ " \t\t\txdp_stats (also expands to per-CPU counts)\n"
+ " \t\t\t\tpass/s - XDP_PASS count for CPUMAP program execution\n"
+ " \t\t\t\tdrop/s - XDP_DROP count for CPUMAP program execution\n"
+ " \t\t\t\tredir/s - XDP_REDIRECT count for CPUMAP program execution\n\n");
+ }
+
+ if (mask & SAMPLE_EXCEPTION_CNT) {
+ printf(" xdp_exception\t\tDisplays xdp_exception tracepoint events\n"
+ " \t\t\tThis can occur due to internal driver errors, unrecognized\n"
+ " \t\t\tXDP actions and due to explicit user trigger by use of XDP_ABORTED\n"
+ " \t\t\tEach action is expanded below this field with its count\n"
+ " \t\t\t\thit/s - Number of times the tracepoint was hit per second\n\n");
+ }
+
+ if (mask & SAMPLE_DEVMAP_XMIT_CNT) {
+ printf(" devmap_xmit\t\tDisplays devmap_xmit tracepoint events\n"
+ " \t\t\tThis tracepoint is invoked for successful transmissions on output\n"
+ " \t\t\tdevice but these statistics are not available for generic XDP mode,\n"
+ " \t\t\thence they will be omitted from the output when using SKB mode\n"
+ " \t\t\t\txmit/s - Number of packets that were transmitted per second\n"
+ " \t\t\t\tdrop/s - Number of packets that failed transmissions per second\n"
+ " \t\t\t\tdrv_err/s - Number of internal driver errors per second\n"
+ " \t\t\t\tbulk-avg - Average number of packets processed for each event\n\n");
+ }
+}
+
+/* Print the usage text for a sample program.
+ *
+ * @argv:         program arguments; argv[0] is shown as the program name
+ * @long_options: getopt_long() table, terminated by an entry with a NULL name
+ * @doc:          description printed before the option list
+ * @mask:         SAMPLE_* bits passed on to sample_print_help()
+ * @error:        when true the output format help is left out
+ */
+void sample_usage(char *argv[], const struct option *long_options,
+		  const char *doc, int mask, bool error)
+{
+	const struct option *opt;
+
+	if (!error)
+		sample_print_help(mask);
+
+	printf("\n%s\nOption for %s:\n", doc, argv[0]);
+	for (opt = long_options; opt->name != 0; opt++) {
+		printf(" --%-15s", opt->name);
+		if (opt->flag)
+			printf(" flag (internal value: %d)", *opt->flag);
+		else
+			printf("\t short-option: -%c", opt->val);
+		printf("\n");
+	}
+	printf("\n");
+}
+
+/* Allocate a zeroed array with one struct datarec per possible CPU.
+ *
+ * Returns the array (to be released with free()), or NULL after printing a
+ * message to stderr when the CPU count cannot be determined or the
+ * allocation fails.
+ */
+static struct datarec *alloc_record_per_cpu(void)
+{
+	int nr_cpus = libbpf_num_possible_cpus();
+	struct datarec *array;
+
+	/* libbpf reports failure as a negative error code; do not let it
+	 * turn into a huge unsigned element count.
+	 */
+	if (nr_cpus < 0) {
+		fprintf(stderr, "Failed to get number of possible CPUs: %s\n",
+			strerror(-nr_cpus));
+		return NULL;
+	}
+
+	array = calloc(nr_cpus, sizeof(*array));
+	if (!array) {
+		fprintf(stderr, "Failed to allocate memory (nr_cpus: %d)\n",
+			nr_cpus);
+		return NULL;
+	}
+	return array;
+}
+
+/* Prepare a freshly allocated map_entry for @pair: set the key, initialise
+ * the list node, stamp the current time and allocate the per-CPU array.
+ * Returns 0, or -ENOMEM when the per-CPU array cannot be allocated.
+ */
+static int map_entry_init(struct map_entry *e, __u64 pair)
+{
+	struct datarec *percpu;
+
+	e->pair = pair;
+	INIT_HLIST_NODE(&e->node);
+	e->val.timestamp = gettime();
+
+	percpu = alloc_record_per_cpu();
+	e->val.cpu = percpu;
+
+	return percpu ? 0 : -ENOMEM;
+}
+
+/* Copy one datarec per possible CPU from @values into @rec->cpu[], store the
+ * sum of each field in @rec->total and stamp @rec with the current time.
+ */
+static void map_collect_percpu(struct datarec *values, struct record *rec)
+{
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = libbpf_num_possible_cpus();
+	__u64 tot_processed = 0, tot_dropped = 0, tot_issue = 0;
+	__u64 tot_pass = 0, tot_drop = 0, tot_redirect = 0;
+	unsigned int cpu;
+
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
+
+	/* Record and sum values from each CPU */
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		struct datarec *src = &values[cpu];
+		struct datarec *dst = &rec->cpu[cpu];
+
+		dst->processed = READ_ONCE(src->processed);
+		dst->dropped = READ_ONCE(src->dropped);
+		dst->issue = READ_ONCE(src->issue);
+		dst->xdp_pass = READ_ONCE(src->xdp_pass);
+		dst->xdp_drop = READ_ONCE(src->xdp_drop);
+		dst->xdp_redirect = READ_ONCE(src->xdp_redirect);
+
+		tot_processed += dst->processed;
+		tot_dropped += dst->dropped;
+		tot_issue += dst->issue;
+		tot_pass += dst->xdp_pass;
+		tot_drop += dst->xdp_drop;
+		tot_redirect += dst->xdp_redirect;
+	}
+
+	rec->total.processed = tot_processed;
+	rec->total.dropped = tot_dropped;
+	rec->total.issue = tot_issue;
+	rec->total.xdp_pass = tot_pass;
+	rec->total.xdp_drop = tot_drop;
+	rec->total.xdp_redirect = tot_redirect;
+}
+
+/* Read the whole per-CPU hash behind @map_fd in batches of 32 keys and fold
+ * every entry into @rec->xmit_map, creating the entry for a device pair the
+ * first time it is seen.
+ *
+ * Returns 0 on success and -ENOMEM when a buffer or a new entry cannot be
+ * allocated. A batch lookup failure other than ENOENT stops the walk but is
+ * still reported as 0, as in the original code.
+ */
+static int map_collect_percpu_devmap(int map_fd, struct stats_record *rec)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	__u32 batch, count = 32;
+	struct datarec *values;
+	bool init = false;
+	__u64 *keys;
+	int i, ret;
+
+	keys = calloc(count, sizeof(__u64));
+	if (!keys)
+		return -ENOMEM;
+	values = calloc(count * nr_cpus, sizeof(struct datarec));
+	if (!values) {
+		free(keys);
+		return -ENOMEM;
+	}
+
+	for (;;) {
+		bool exit = false;
+
+		ret = bpf_map_lookup_batch(map_fd, init ? &batch : NULL, &batch,
+					   keys, values, &count, NULL);
+		/* errno is only meaningful when the call failed. Looking at
+		 * it after a successful call could pick up a stale ENOENT
+		 * (every earlier walk ends with one) and end the walk after
+		 * the first batch.
+		 */
+		if (ret < 0) {
+			if (errno != ENOENT)
+				break;
+			/* ENOENT: last batch, count entries are still valid */
+			exit = true;
+		}
+
+		init = true;
+		for (i = 0; i < count; i++) {
+			struct map_entry *e, *x = NULL;
+			__u64 pair = keys[i];
+			struct datarec *arr;
+
+			/* Each key has nr_cpus consecutive values */
+			arr = &values[i * nr_cpus];
+			hash_for_each_possible(rec->xmit_map, e, node, pair) {
+				if (e->pair == pair) {
+					x = e;
+					break;
+				}
+			}
+			if (!x) {
+				x = calloc(1, sizeof(*x));
+				if (!x)
+					goto cleanup;
+				if (map_entry_init(x, pair) < 0) {
+					free(x);
+					goto cleanup;
+				}
+				hash_add(rec->xmit_map, &x->node, pair);
+			}
+			map_collect_percpu(arr, &x->val);
+		}
+
+		if (exit)
+			break;
+		/* The call overwrote count; ask for a full batch again */
+		count = 32;
+	}
+
+	free(values);
+	free(keys);
+	return 0;
+cleanup:
+	free(values);
+	free(keys);
+	return -ENOMEM;
+}
+
+/* Allocate a zeroed stats_record with sample_n_cpus trailing enq[] records,
+ * plus a per-CPU array for every counter group enabled in sample_mask.
+ *
+ * Returns the record (release with free_stats_record()), or NULL after
+ * printing a message to stderr and freeing what had been allocated.
+ */
+static struct stats_record *alloc_stats_record(void)
+{
+ struct stats_record *rec;
+ int i;
+
+ rec = calloc(1, sizeof(*rec) + sample_n_cpus * sizeof(struct record));
+ if (!rec) {
+ fprintf(stderr, "Failed to allocate memory\n");
+ return NULL;
+ }
+
+ if (sample_mask & SAMPLE_RX_CNT) {
+ rec->rx_cnt.cpu = alloc_record_per_cpu();
+ if (!rec->rx_cnt.cpu) {
+ fprintf(stderr,
+ "Failed to allocate rx_cnt per-CPU array\n");
+ goto end_rec;
+ }
+ }
+ if (sample_mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) {
+ for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) {
+ rec->redir_err[i].cpu = alloc_record_per_cpu();
+ if (!rec->redir_err[i].cpu) {
+ fprintf(stderr,
+ "Failed to allocate redir_err per-CPU array for "
+ "\"%s\" case\n",
+ xdp_redirect_err_names[i]);
+ /* Free the arrays of this group allocated so far, then
+ * skip the label that would free the group again.
+ */
+ while (i--)
+ free(rec->redir_err[i].cpu);
+ goto end_rx_cnt;
+ }
+ }
+ }
+ if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) {
+ rec->kthread.cpu = alloc_record_per_cpu();
+ if (!rec->kthread.cpu) {
+ fprintf(stderr,
+ "Failed to allocate kthread per-CPU array\n");
+ goto end_redir;
+ }
+ }
+ if (sample_mask & SAMPLE_EXCEPTION_CNT) {
+ for (i = 0; i < XDP_ACTION_MAX; i++) {
+ rec->exception[i].cpu = alloc_record_per_cpu();
+ if (!rec->exception[i].cpu) {
+ fprintf(stderr,
+ "Failed to allocate exception per-CPU array for "
+ "\"%s\" case\n",
+ action2str(i));
+ while (i--)
+ free(rec->exception[i].cpu);
+ goto end_kthread;
+ }
+ }
+ }
+ if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) {
+ rec->devmap_xmit.cpu = alloc_record_per_cpu();
+ if (!rec->devmap_xmit.cpu) {
+ fprintf(stderr,
+ "Failed to allocate devmap_xmit per-CPU array\n");
+ goto end_exception;
+ }
+ }
+ if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)
+ hash_init(rec->xmit_map);
+ if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) {
+ for (i = 0; i < sample_n_cpus; i++) {
+ rec->enq[i].cpu = alloc_record_per_cpu();
+ if (!rec->enq[i].cpu) {
+ fprintf(stderr,
+ "Failed to allocate enqueue per-CPU array for "
+ "CPU %d\n",
+ i);
+ while (i--)
+ free(rec->enq[i].cpu);
+ goto end_devmap_xmit;
+ }
+ }
+ }
+
+ return rec;
+
+ /* Error unwinding: each label frees one group and falls through to the
+ * labels of the groups allocated before it. Pointers of groups that are
+ * disabled in sample_mask are still NULL from calloc(), and free(NULL)
+ * is a no-op.
+ */
+end_devmap_xmit:
+ free(rec->devmap_xmit.cpu);
+end_exception:
+ for (i = 0; i < XDP_ACTION_MAX; i++)
+ free(rec->exception[i].cpu);
+end_kthread:
+ free(rec->kthread.cpu);
+end_redir:
+ for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++)
+ free(rec->redir_err[i].cpu);
+end_rx_cnt:
+ free(rec->rx_cnt.cpu);
+end_rec:
+ free(rec);
+ return NULL;
+}
+
+/* Release a stats_record: every per-CPU array, every entry still linked in
+ * xmit_map, and the record itself. Unallocated per-CPU pointers are NULL and
+ * free(NULL) is a no-op.
+ */
+static void free_stats_record(struct stats_record *r)
+{
+	struct map_entry *entry;
+	struct hlist_node *next;
+	int i;
+
+	free(r->rx_cnt.cpu);
+
+	for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++)
+		free(r->redir_err[i].cpu);
+
+	free(r->kthread.cpu);
+
+	for (i = 0; i < XDP_ACTION_MAX; i++)
+		free(r->exception[i].cpu);
+
+	free(r->devmap_xmit.cpu);
+
+	/* The _safe iterator allows unlinking the current entry */
+	hash_for_each_safe(r->xmit_map, i, next, entry, node) {
+		hash_del(&entry->node);
+		free(entry->val.cpu);
+		free(entry);
+	}
+
+	for (i = 0; i < sample_n_cpus; i++)
+		free(r->enq[i].cpu);
+
+	free(r);
+}
+
+/* Time between the snapshots @p (earlier) and @r (later) in seconds, or 0
+ * when both carry the same timestamp.
+ */
+static double calc_period(struct record *r, struct record *p)
+{
+	__u64 elapsed_ns = r->timestamp - p->timestamp;
+
+	if (!elapsed_ns)
+		return 0;
+
+	return (double)elapsed_ns / NANOSEC_PER_SEC;
+}
+
+/* Round @val to a whole number: down when its distance from floor(val) is
+ * below 0.5, up otherwise.
+ */
+static double sample_round(double val)
+{
+	double below = floor(val);
+
+	return (val - below < 0.5) ? below : ceil(val);
+}
+
+/* Rate of ->processed between the snapshots @p and @r per second, rounded
+ * with sample_round(); 0 unless @period_ is positive.
+ */
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 delta;
+
+	if (!(period_ > 0))
+		return 0;
+
+	delta = r->processed - p->processed;
+	return sample_round(delta / period_);
+}
+
+/* Rate of ->dropped between the snapshots @p and @r per second, rounded
+ * with sample_round(); 0 unless @period_ is positive.
+ */
+static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 delta;
+
+	if (!(period_ > 0))
+		return 0;
+
+	delta = r->dropped - p->dropped;
+	return sample_round(delta / period_);
+}
+
+/* Rate of ->issue between the snapshots @p and @r per second, rounded with
+ * sample_round(); 0 unless @period_ is positive.
+ */
+static __u64 calc_errs_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 delta;
+
+	if (!(period_ > 0))
+		return 0;
+
+	delta = r->issue - p->issue;
+	return sample_round(delta / period_);
+}
+
+/* Rate of ->info between the snapshots @p and @r per second, rounded with
+ * sample_round(); 0 unless @period_ is positive.
+ */
+static __u64 calc_info_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 delta;
+
+	if (!(period_ > 0))
+		return 0;
+
+	delta = r->info - p->info;
+	return sample_round(delta / period_);
+}
+
+/* Compute the unrounded per-second rates of the xdp_pass, xdp_drop and
+ * xdp_redirect counters between the snapshots @p and @r. All three outputs
+ * are 0 unless @period_ is positive.
+ */
+static void calc_xdp_pps(struct datarec *r, struct datarec *p, double *xdp_pass,
+			 double *xdp_drop, double *xdp_redirect, double period_)
+{
+	if (!(period_ > 0)) {
+		*xdp_pass = 0;
+		*xdp_drop = 0;
+		*xdp_redirect = 0;
+		return;
+	}
+
+	*xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_;
+	*xdp_pass = (r->xdp_pass - p->xdp_pass) / period_;
+	*xdp_drop = (r->xdp_drop - p->xdp_drop) / period_;
+}
+
+/* Report the receive counters between two sampling rounds.
+ *
+ * Prints one line per CPU with non-zero activity (default log level only).
+ * When @out is non-NULL the totals are stored in out->rx_cnt and added to
+ * out->totals.
+ */
+static void stats_get_rx_cnt(struct stats_record *stats_rec,
+			     struct stats_record *stats_prev,
+			     unsigned int nr_cpus, struct sample_output *out)
+{
+	struct record *cur = &stats_rec->rx_cnt;
+	struct record *old = &stats_prev->rx_cnt;
+	double period = calc_period(cur, old);
+	double pps, drop, err;
+	int cpu;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		struct datarec *now = &cur->cpu[cpu];
+		struct datarec *before = &old->cpu[cpu];
+		char label[64];
+
+		pps = calc_pps(now, before, period);
+		drop = calc_drop_pps(now, before, period);
+		err = calc_errs_pps(now, before, period);
+		if (pps || drop || err) {
+			snprintf(label, sizeof(label), "cpu:%d", cpu);
+			print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n",
+				      label, PPS(pps), DROP(drop), ERR(err));
+		}
+	}
+
+	if (!out)
+		return;
+
+	pps = calc_pps(&cur->total, &old->total, period);
+	drop = calc_drop_pps(&cur->total, &old->total, period);
+	err = calc_errs_pps(&cur->total, &old->total, period);
+
+	out->rx_cnt.pps = pps;
+	out->rx_cnt.drop = drop;
+	out->rx_cnt.err = err;
+	out->totals.rx += pps;
+	out->totals.drop += drop;
+	out->totals.err += err;
+}
+
+/* Report the cpumap enqueue counters between two sampling rounds.
+ *
+ * For every destination CPU with traffic or drops a summary line is printed
+ * via print_err(), followed by one line per source CPU with activity
+ * (default log level only). The third column is the average bulk size:
+ * packets per second divided by the ->issue rate.
+ */
+static void stats_get_cpumap_enqueue(struct stats_record *stats_rec,
+				     struct stats_record *stats_prev,
+				     unsigned int nr_cpus)
+{
+	int to_cpu;
+
+	/* cpumap enqueue stats */
+	for (to_cpu = 0; to_cpu < sample_n_cpus; to_cpu++) {
+		struct record *cur = &stats_rec->enq[to_cpu];
+		struct record *old = &stats_prev->enq[to_cpu];
+		double period = calc_period(cur, old);
+		double pps, drop, bulk;
+		int from_cpu;
+
+		pps = calc_pps(&cur->total, &old->total, period);
+		drop = calc_drop_pps(&cur->total, &old->total, period);
+		bulk = calc_errs_pps(&cur->total, &old->total, period);
+
+		if (pps > 0 || drop > 0) {
+			char label[64];
+
+			snprintf(label, sizeof(label), "enqueue to cpu %d", to_cpu);
+
+			if (bulk > 0)
+				bulk = pps / bulk; /* calc average bulk size */
+
+			print_err(drop,
+				  " %-20s " FMT_COLUMNf FMT_COLUMNf __COLUMN(".2f") "\n",
+				  label, PPS(pps), DROP(drop), bulk, "bulk-avg");
+		}
+
+		for (from_cpu = 0; from_cpu < nr_cpus; from_cpu++) {
+			struct datarec *now = &cur->cpu[from_cpu];
+			struct datarec *before = &old->cpu[from_cpu];
+			char label[64];
+
+			pps = calc_pps(now, before, period);
+			drop = calc_drop_pps(now, before, period);
+			bulk = calc_errs_pps(now, before, period);
+			if (!pps && !drop && !bulk)
+				continue;
+
+			snprintf(label, sizeof(label), "cpu:%d->%d", from_cpu, to_cpu);
+			if (bulk > 0)
+				bulk = pps / bulk; /* calc average bulk size */
+			print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf __COLUMN(".2f") "\n",
+				      label, PPS(pps), DROP(drop), bulk, "bulk-avg");
+		}
+	}
+}
+
+/* Report the xdp_pass/xdp_drop/xdp_redirect rates of the cpumap kthread
+ * counters between two sampling rounds: a total line via print_err() when
+ * any rate is non-zero, then one line per CPU with activity (default log
+ * level only).
+ */
+static void stats_get_cpumap_remote(struct stats_record *stats_rec,
+				    struct stats_record *stats_prev,
+				    unsigned int nr_cpus)
+{
+	struct record *cur = &stats_rec->kthread;
+	struct record *old = &stats_prev->kthread;
+	double period = calc_period(cur, old);
+	double pass, drop, redir;
+	int cpu;
+
+	calc_xdp_pps(&cur->total, &old->total, &pass, &drop, &redir, period);
+	if (pass || drop || redir) {
+		print_err(drop,
+			  " %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n",
+			  "xdp_stats", PASS(pass), DROP(drop), REDIR(redir));
+	}
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		char label[64];
+
+		calc_xdp_pps(&cur->cpu[cpu], &old->cpu[cpu], &pass, &drop,
+			     &redir, period);
+		if (!pass && !drop && !redir)
+			continue;
+
+		snprintf(label, sizeof(label), "cpu:%d", cpu);
+		print_default(" %-16s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n",
+			      label, PASS(pass), DROP(drop), REDIR(redir));
+	}
+}
+
+ /*
+  * Print the packet, drop and "sched" rates computed from the kthread record:
+  * a summary line for the total, then one line for every CPU on which at
+  * least one of the three rates is non-zero.
+  */
+ static void stats_get_cpumap_kthread(struct stats_record *stats_rec,
+ 				     struct stats_record *stats_prev,
+ 				     unsigned int nr_cpus)
+ {
+ 	struct record *cur = &stats_rec->kthread;
+ 	struct record *old = &stats_prev->kthread;
+ 	double period = calc_period(cur, old);
+ 	double pps, drop, err;
+ 	const char *heading;
+ 	int n;
+
+ 	pps = calc_pps(&cur->total, &old->total, period);
+ 	drop = calc_drop_pps(&cur->total, &old->total, period);
+ 	err = calc_errs_pps(&cur->total, &old->total, period);
+
+ 	/* The heading only says "total" when there is traffic to break down */
+ 	heading = pps ? "kthread total" : "kthread";
+ 	print_err(drop, " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n",
+ 		  heading, PPS(pps), DROP(drop), err,
+ 		  "sched");
+
+ 	for (n = 0; n < nr_cpus; n++) {
+ 		struct datarec *r = &cur->cpu[n];
+ 		struct datarec *p = &old->cpu[n];
+ 		char label[64];
+
+ 		pps = calc_pps(r, p, period);
+ 		drop = calc_drop_pps(r, p, period);
+ 		err = calc_errs_pps(r, p, period);
+ 		if (pps || drop || err) {
+ 			snprintf(label, sizeof(label), "cpu:%d", n);
+ 			print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+ 				      "\n",
+ 				      label, PPS(pps), DROP(drop), err, "sched");
+ 		}
+ 	}
+ }
+
+ /*
+  * Redirect rate taken from redir_err[0].
+  *
+  * Prints one line per CPU (for the first @nr_cpus CPUs) whose rate is
+  * non-zero.  When @out is non-NULL the total rate is stored in
+  * out->redir_cnt.suc and added to out->totals.redir.
+  */
+ static void stats_get_redirect_cnt(struct stats_record *stats_rec,
+ 				   struct stats_record *stats_prev,
+ 				   unsigned int nr_cpus,
+ 				   struct sample_output *out)
+ {
+ 	struct record *cur = &stats_rec->redir_err[0];
+ 	struct record *old = &stats_prev->redir_err[0];
+ 	double period = calc_period(cur, old);
+ 	double rate;
+ 	int n;
+
+ 	for (n = 0; n < nr_cpus; n++) {
+ 		char label[64];
+
+ 		rate = calc_pps(&cur->cpu[n], &old->cpu[n], period);
+ 		if (rate) {
+ 			snprintf(label, sizeof(label), "cpu:%d", n);
+ 			print_default(" %-18s " FMT_COLUMNf "\n", label,
+ 				      REDIR(rate));
+ 		}
+ 	}
+
+ 	if (!out)
+ 		return;
+
+ 	rate = calc_pps(&cur->total, &old->total, period);
+ 	out->redir_cnt.suc = rate;
+ 	out->totals.redir += rate;
+ }
+
+ /*
+  * Redirect error rates for redir_err[1 .. XDP_REDIRECT_ERR_MAX - 1].
+  *
+  * Without @out, each error type with a positive total rate gets a heading
+  * line, and every CPU (of the first @nr_cpus) with a non-zero rate gets a
+  * line of its own.  With @out, nothing is printed for the totals; instead the
+  * sum of the per-type total rates is stored in out->redir_cnt.err and added
+  * to out->totals.err.
+  */
+ static void stats_get_redirect_err_cnt(struct stats_record *stats_rec,
+ 				       struct stats_record *stats_prev,
+ 				       unsigned int nr_cpus,
+ 				       struct sample_output *out)
+ {
+ 	double period, type_drop, sum = 0;
+ 	int err_idx, n;
+
+ 	for (err_idx = 1; err_idx < XDP_REDIRECT_ERR_MAX; err_idx++) {
+ 		struct record *cur = &stats_rec->redir_err[err_idx];
+ 		struct record *old = &stats_prev->redir_err[err_idx];
+ 		char label[64];
+
+ 		period = calc_period(cur, old);
+ 		type_drop = calc_drop_pps(&cur->total, &old->total, period);
+
+ 		if (type_drop > 0 && !out) {
+ 			snprintf(label, sizeof(label),
+ 				 sample_log_level & LL_DEFAULT ? "%s total" :
+ 				 "%s",
+ 				 xdp_redirect_err_names[err_idx]);
+ 			print_err(type_drop, " %-18s " FMT_COLUMNf "\n", label,
+ 				  ERR(type_drop));
+ 		}
+
+ 		for (n = 0; n < nr_cpus; n++) {
+ 			double cpu_drop;
+
+ 			cpu_drop = calc_drop_pps(&cur->cpu[n], &old->cpu[n],
+ 						 period);
+ 			if (!cpu_drop)
+ 				continue;
+
+ 			snprintf(label, sizeof(label), "cpu:%d", n);
+ 			print_default(" %-16s" FMT_COLUMNf "\n", label,
+ 				      ERR(cpu_drop));
+ 		}
+
+ 		/* Only the per-type total feeds the summary, not the CPUs */
+ 		sum += type_drop;
+ 	}
+
+ 	if (!out)
+ 		return;
+
+ 	out->redir_cnt.err = sum;
+ 	out->totals.err += sum;
+ }
+
+ /*
+  * Exception rates for each of the XDP_ACTION_MAX entries of ->exception[].
+  *
+  * Without @out, every action with a positive total rate is printed, followed
+  * by a line for each CPU (of the first @nr_cpus) with a non-zero rate.  With
+  * @out, nothing is printed; the summed total rate is stored in
+  * out->except_cnt.hits and added to out->totals.err.
+  */
+ static void stats_get_exception_cnt(struct stats_record *stats_rec,
+ 				    struct stats_record *stats_prev,
+ 				    unsigned int nr_cpus,
+ 				    struct sample_output *out)
+ {
+ 	double period, action_drop, sum = 0;
+ 	int action, n;
+
+ 	for (action = 0; action < XDP_ACTION_MAX; action++) {
+ 		struct record *cur = &stats_rec->exception[action];
+ 		struct record *old = &stats_prev->exception[action];
+
+ 		period = calc_period(cur, old);
+ 		action_drop = calc_drop_pps(&cur->total, &old->total, period);
+ 		/* Fold out errors after heading */
+ 		sum += action_drop;
+
+ 		if (!(action_drop > 0) || out)
+ 			continue;
+
+ 		print_always(" %-18s " FMT_COLUMNf "\n",
+ 			     action2str(action), ERR(action_drop));
+
+ 		for (n = 0; n < nr_cpus; n++) {
+ 			char label[64];
+ 			double cpu_drop;
+
+ 			cpu_drop = calc_drop_pps(&cur->cpu[n], &old->cpu[n],
+ 						 period);
+ 			if (!cpu_drop)
+ 				continue;
+
+ 			snprintf(label, sizeof(label), "cpu:%d", n);
+ 			print_default(" %-16s" FMT_COLUMNf "\n",
+ 				      label, ERR(cpu_drop));
+ 		}
+ 	}
+
+ 	if (!out)
+ 		return;
+
+ 	out->except_cnt.hits = sum;
+ 	out->totals.err += sum;
+ }
+
+ /*
+  * Transmit statistics taken from the devmap_xmit record.
+  *
+  * Prints one line per CPU (of the first @nr_cpus) with any non-zero
+  * xmit/drop/error rate, including the average bulk size computed as
+  * (pps + drop) / info.  When @out is non-NULL the same figures for the total
+  * are stored in out->xmit_cnt and accumulated into out->totals.
+  */
+ static void stats_get_devmap_xmit(struct stats_record *stats_rec,
+ 				  struct stats_record *stats_prev,
+ 				  unsigned int nr_cpus,
+ 				  struct sample_output *out)
+ {
+ 	struct record *cur = &stats_rec->devmap_xmit;
+ 	struct record *old = &stats_prev->devmap_xmit;
+ 	double period = calc_period(cur, old);
+ 	double pps, drop, bulk, err;
+ 	int n;
+
+ 	for (n = 0; n < nr_cpus; n++) {
+ 		struct datarec *r = &cur->cpu[n];
+ 		struct datarec *p = &old->cpu[n];
+ 		char label[64];
+
+ 		pps = calc_pps(r, p, period);
+ 		drop = calc_drop_pps(r, p, period);
+ 		err = calc_errs_pps(r, p, period);
+ 		if (!pps && !drop && !err)
+ 			continue;
+
+ 		snprintf(label, sizeof(label), "cpu:%d", n);
+ 		bulk = calc_info_pps(r, p, period);
+ 		if (bulk > 0)
+ 			bulk = (pps + drop) / bulk; /* calc avg bulk */
+ 		print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+ 			      __COLUMN(".2f") "\n",
+ 			      label, XMIT(pps), DROP(drop), err, "drv_err/s",
+ 			      bulk, "bulk-avg");
+ 	}
+
+ 	if (!out)
+ 		return;
+
+ 	pps = calc_pps(&cur->total, &old->total, period);
+ 	drop = calc_drop_pps(&cur->total, &old->total, period);
+ 	bulk = calc_info_pps(&cur->total, &old->total, period);
+ 	if (bulk > 0)
+ 		bulk = (pps + drop) / bulk; /* calc avg bulk */
+ 	err = calc_errs_pps(&cur->total, &old->total, period);
+
+ 	out->xmit_cnt.pps = pps;
+ 	out->xmit_cnt.drop = drop;
+ 	out->xmit_cnt.bavg = bulk;
+ 	out->xmit_cnt.err = err;
+ 	out->totals.xmit += pps;
+ 	out->totals.drop_xmit += drop;
+ 	out->totals.err += err;
+ }
+
+ /*
+  * Per (from, to) interface pair transmit statistics.
+  *
+  * Walks every entry of stats_rec->xmit_map and compares it with the entry
+  * carrying the same ->pair value in stats_prev->xmit_map.
+  *
+  * With @out non-NULL only out->totals (xmit, drop_xmit, err) is updated and
+  * nothing is printed.  With @out NULL a line is printed for every pair that
+  * has a non-zero rate, followed by a line per CPU (of the first @nr_cpus)
+  * with a non-zero rate.
+  *
+  * NOTE(review): @xmit_total is not referenced anywhere in the body.
+  */
+ static void stats_get_devmap_xmit_multi(struct stats_record *stats_rec,
+ struct stats_record *stats_prev,
+ unsigned int nr_cpus,
+ struct sample_output *out,
+ bool xmit_total)
+ {
+ double pps, drop, info, err;
+ struct map_entry *entry;
+ struct record *r, *p;
+ double t;
+ int bkt;
+
+ hash_for_each(stats_rec->xmit_map, bkt, entry, node) {
+ struct map_entry *e, *x = NULL;
+ char ifname_from[IFNAMSIZ];
+ char ifname_to[IFNAMSIZ];
+ const char *fstr, *tstr;
+ unsigned long prev_time;
+ struct record beg = {};
+ __u32 from_idx, to_idx;
+ char str[128];
+ __u64 pair;
+ int i;
+
+ /* Length of one sampling interval, used to back-date 'beg' below */
+ prev_time = sample_interval * NANOSEC_PER_SEC;
+
+ /* Upper 32 bits: source ifindex, lower 32 bits: destination ifindex */
+ pair = entry->pair;
+ from_idx = pair >> 32;
+ to_idx = pair & 0xFFFFFFFF;
+
+ r = &entry->val;
+ /* Zeroed baseline, one interval old, for pairs with no previous entry */
+ beg.timestamp = r->timestamp - prev_time;
+
+ /* Find matching entry from stats_prev map */
+ hash_for_each_possible(stats_prev->xmit_map, e, node, pair) {
+ if (e->pair == pair) {
+ x = e;
+ break;
+ }
+ }
+ if (x)
+ p = &x->val;
+ else
+ p = &beg;
+ t = calc_period(r, p);
+ pps = calc_pps(&r->total, &p->total, t);
+ drop = calc_drop_pps(&r->total, &p->total, t);
+ info = calc_info_pps(&r->total, &p->total, t);
+ if (info > 0)
+ info = (pps + drop) / info; /* calc avg bulk */
+ err = calc_errs_pps(&r->total, &p->total, t);
+
+ if (out) {
+ /* We are responsible for filling out totals */
+ out->totals.xmit += pps;
+ out->totals.drop_xmit += drop;
+ out->totals.err += err;
+ continue;
+ }
+
+ /* Interfaces whose name cannot be resolved are shown as "?" */
+ fstr = tstr = NULL;
+ if (if_indextoname(from_idx, ifname_from))
+ fstr = ifname_from;
+ if (if_indextoname(to_idx, ifname_to))
+ tstr = ifname_to;
+
+ snprintf(str, sizeof(str), "xmit %s->%s", fstr ?: "?",
+ tstr ?: "?");
+ /* Skip idle streams of redirection */
+ if (pps || drop || err) {
+ print_err(drop,
+ " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+ __COLUMN(".2f") "\n", str, XMIT(pps), DROP(drop),
+ err, "drv_err/s", info, "bulk-avg");
+ }
+
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *rc = &r->cpu[i];
+ struct datarec *pc, p_beg = {};
+ /* This 'str' shadows the 128-byte pair label declared above */
+ char str[64];
+
+ /* 'beg' is compared against an all-zero per-CPU record */
+ pc = p == &beg ? &p_beg : &p->cpu[i];
+
+ pps = calc_pps(rc, pc, t);
+ drop = calc_drop_pps(rc, pc, t);
+ err = calc_errs_pps(rc, pc, t);
+
+ if (!pps && !drop && !err)
+ continue;
+
+ snprintf(str, sizeof(str), "cpu:%d", i);
+ info = calc_info_pps(rc, pc, t);
+ if (info > 0)
+ info = (pps + drop) / info; /* calc avg bulk */
+
+ print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf
+ __COLUMN(".2f") "\n", str, XMIT(pps),
+ DROP(drop), err, "drv_err/s", info, "bulk-avg");
+ }
+ }
+ }
+
+ /*
+  * Print one full report for the sampling period.
+  *
+  * @prefix: heading text for the summary line; "Summary" is used when NULL
+  * @mask:   SAMPLE_* bits selecting which sections are printed
+  * @r, @p:  current and previous stats records
+  * @out:    totals already computed for this period; dereferenced
+  *          unconditionally, so it must not be NULL
+  *
+  * The section headings are printed from @out, then the matching
+  * stats_get_*() helper is called with a NULL output pointer so that it
+  * prints its per-CPU breakdown instead of accumulating totals.
+  */
+ static void stats_print(const char *prefix, int mask, struct stats_record *r,
+ struct stats_record *p, struct sample_output *out)
+ {
+ int nr_cpus = libbpf_num_possible_cpus();
+ const char *str;
+
+ /* Summary line: rx, redirect, combined error/drop rate and xmit */
+ print_always("%-23s", prefix ?: "Summary");
+ if (mask & SAMPLE_RX_CNT)
+ print_always(FMT_COLUMNl, RX(out->totals.rx));
+ if (mask & SAMPLE_REDIRECT_CNT)
+ print_always(FMT_COLUMNl, REDIR(out->totals.redir));
+ printf(FMT_COLUMNl,
+ out->totals.err + out->totals.drop + out->totals.drop_xmit,
+ "err,drop/s");
+ if (mask & SAMPLE_DEVMAP_XMIT_CNT ||
+ mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)
+ printf(FMT_COLUMNl, XMIT(out->totals.xmit));
+ printf("\n");
+
+ if (mask & SAMPLE_RX_CNT) {
+ str = (sample_log_level & LL_DEFAULT) && out->rx_cnt.pps ?
+ "receive total" :
+ "receive";
+ print_err((out->rx_cnt.err || out->rx_cnt.drop),
+ " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl "\n",
+ str, PPS(out->rx_cnt.pps), DROP(out->rx_cnt.drop),
+ ERR(out->rx_cnt.err));
+
+ stats_get_rx_cnt(r, p, nr_cpus, NULL);
+ }
+
+ if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT)
+ stats_get_cpumap_enqueue(r, p, nr_cpus);
+
+ if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) {
+ stats_get_cpumap_kthread(r, p, nr_cpus);
+ stats_get_cpumap_remote(r, p, nr_cpus);
+ }
+
+ if (mask & SAMPLE_REDIRECT_CNT) {
+ str = out->redir_cnt.suc ? "redirect total" : "redirect";
+ print_default(" %-20s " FMT_COLUMNl "\n", str,
+ REDIR(out->redir_cnt.suc));
+
+ stats_get_redirect_cnt(r, p, nr_cpus, NULL);
+ }
+
+ if (mask & SAMPLE_REDIRECT_ERR_CNT) {
+ str = (sample_log_level & LL_DEFAULT) && out->redir_cnt.err ?
+ "redirect_err total" :
+ "redirect_err";
+ print_err(out->redir_cnt.err, " %-20s " FMT_COLUMNl "\n", str,
+ ERR(out->redir_cnt.err));
+
+ stats_get_redirect_err_cnt(r, p, nr_cpus, NULL);
+ }
+
+ if (mask & SAMPLE_EXCEPTION_CNT) {
+ str = out->except_cnt.hits ? "xdp_exception total" :
+ "xdp_exception";
+
+ print_err(out->except_cnt.hits, " %-20s " FMT_COLUMNl "\n", str,
+ HITS(out->except_cnt.hits));
+
+ stats_get_exception_cnt(r, p, nr_cpus, NULL);
+ }
+
+ if (mask & SAMPLE_DEVMAP_XMIT_CNT) {
+ str = (sample_log_level & LL_DEFAULT) && out->xmit_cnt.pps ?
+ "devmap_xmit total" :
+ "devmap_xmit";
+
+ print_err(out->xmit_cnt.err || out->xmit_cnt.drop,
+ " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl
+ __COLUMN(".2f") "\n",
+ str, XMIT(out->xmit_cnt.pps),
+ DROP(out->xmit_cnt.drop), out->xmit_cnt.err,
+ "drv_err/s", out->xmit_cnt.bavg, "bulk-avg");
+
+ stats_get_devmap_xmit(r, p, nr_cpus, NULL);
+ }
+
+ if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)
+ stats_get_devmap_xmit_multi(r, p, nr_cpus, NULL,
+ mask & SAMPLE_DEVMAP_XMIT_CNT);
+
+ /*
+  * Separate reports with a blank line: always at LL_DEFAULT, and at
+  * LL_SIMPLE only when sample_err_exp is set.  The flag is cleared here.
+  */
+ if (sample_log_level & LL_DEFAULT ||
+ ((sample_log_level & LL_SIMPLE) && sample_err_exp)) {
+ sample_err_exp = false;
+ printf("\n");
+ }
+ }
+
+ /*
+  * Remember the sample's BPF maps and size the statistics maps.
+  *
+  * @maps: array indexed by map type; entries 0 .. MAP_DEVMAP_XMIT_MULTI are
+  *        read, so it must hold at least MAP_DEVMAP_XMIT_MULTI + 1 pointers.
+  *
+  * Every map below MAP_DEVMAP_XMIT_MULTI gets its max_entries set to the
+  * number of struct datarec slots that sample_stats_collect() later reads
+  * from it: one slot per possible CPU, multiplied by the number of records
+  * kept in that map.  The MAP_DEVMAP_XMIT_MULTI map is only stored, not
+  * resized.
+  *
+  * Returns 0 on success, -EINVAL for an unknown map index, or -errno when
+  * bpf_map__set_max_entries() fails.
+  */
+ int sample_setup_maps(struct bpf_map **maps)
+ {
+ 	sample_n_cpus = libbpf_num_possible_cpus();
+
+ 	for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) {
+ 		sample_map[i] = maps[i];
+
+ 		switch (i) {
+ 		case MAP_RX:
+ 		case MAP_CPUMAP_KTHREAD:
+ 		case MAP_DEVMAP_XMIT:
+ 			sample_map_count[i] = sample_n_cpus;
+ 			break;
+ 		case MAP_REDIRECT_ERR:
+ 			sample_map_count[i] =
+ 				XDP_REDIRECT_ERR_MAX * sample_n_cpus;
+ 			break;
+ 		case MAP_EXCEPTION:
+ 			/*
+ 			 * One row of per-CPU slots per XDP action.  This case
+ 			 * must not fall through: doing so would size the map
+ 			 * as n_cpus * n_cpus, which is too small whenever
+ 			 * there are fewer CPUs than XDP actions.
+ 			 */
+ 			sample_map_count[i] = XDP_ACTION_MAX * sample_n_cpus;
+ 			break;
+ 		case MAP_CPUMAP_ENQUEUE:
+ 			/* One row of per-CPU slots per destination CPU */
+ 			sample_map_count[i] = sample_n_cpus * sample_n_cpus;
+ 			break;
+ 		default:
+ 			return -EINVAL;
+ 		}
+ 		if (bpf_map__set_max_entries(sample_map[i], sample_map_count[i]) < 0)
+ 			return -errno;
+ 	}
+ 	sample_map[MAP_DEVMAP_XMIT_MULTI] = maps[MAP_DEVMAP_XMIT_MULTI];
+ 	return 0;
+ }
+
+ /*
+  * Map every statistics map below MAP_DEVMAP_XMIT_MULTI into this process,
+  * read/write and shared, sized for sample_map_count[] datarec entries.
+  *
+  * Returns 0 on success or -errno from the first mmap() that fails; the
+  * failing slot of sample_mmap[] is left holding MAP_FAILED.
+  */
+ static int sample_setup_maps_mappings(void)
+ {
+ 	int idx;
+
+ 	for (idx = 0; idx < MAP_DEVMAP_XMIT_MULTI; idx++) {
+ 		size_t len = sample_map_count[idx] * sizeof(struct datarec);
+ 		int fd = bpf_map__fd(sample_map[idx]);
+
+ 		sample_mmap[idx] = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ 					MAP_SHARED, fd, 0);
+ 		if (sample_mmap[idx] == MAP_FAILED)
+ 			return -errno;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * Common initialisation for the samples.
+  *
+  * Blocks SIGQUIT, SIGINT and SIGTERM for the process and creates a
+  * non-blocking, close-on-exec signalfd for them (kept in sample_sig_fd),
+  * records @mask in sample_mask, and finally mmap()s the statistics maps.
+  *
+  * Returns 0 on success or a negative errno value.
+  */
+ int __sample_init(int mask)
+ {
+ 	static const int handled[] = { SIGQUIT, SIGINT, SIGTERM };
+ 	sigset_t sigs;
+ 	size_t n;
+
+ 	sigemptyset(&sigs);
+ 	for (n = 0; n < sizeof(handled) / sizeof(handled[0]); n++)
+ 		sigaddset(&sigs, handled[n]);
+
+ 	if (sigprocmask(SIG_BLOCK, &sigs, NULL) < 0)
+ 		return -errno;
+
+ 	sample_sig_fd = signalfd(-1, &sigs, SFD_CLOEXEC | SFD_NONBLOCK);
+ 	if (sample_sig_fd < 0)
+ 		return -errno;
+
+ 	sample_mask = mask;
+
+ 	return sample_setup_maps_mappings();
+ }
+
+ /*
+  * Detach the XDP program from @ifindex.
+  *
+  * When @prog_id is non-zero the program currently attached is queried first
+  * and the detach is skipped with -ENOENT unless its id matches @prog_id.  A
+  * @prog_id of 0 detaches unconditionally.
+  *
+  * Returns the result of bpf_xdp_detach(), -errno when the query fails, or
+  * -ENOENT on an id mismatch.
+  */
+ static int __sample_remove_xdp(int ifindex, __u32 prog_id, int xdp_flags)
+ {
+ 	__u32 attached_id = 0;
+
+ 	if (!prog_id)
+ 		return bpf_xdp_detach(ifindex, xdp_flags, NULL);
+
+ 	if (bpf_xdp_query_id(ifindex, xdp_flags, &attached_id) < 0)
+ 		return -errno;
+
+ 	if (attached_id != prog_id) {
+ 		print_always(
+ 			"Program on ifindex %d does not match installed "
+ 			"program, skipping unload\n",
+ 			ifindex);
+ 		return -ENOENT;
+ 	}
+
+ 	return bpf_xdp_detach(ifindex, xdp_flags, NULL);
+ }
+
+ /*
+  * Attach @xdp_prog to @ifindex and remember it for removal in sample_exit().
+  *
+  * @generic: attach in skb mode when true, in driver mode otherwise
+  * @force:   when false, XDP_FLAGS_UPDATE_IF_NOEXIST is added to the flags
+  *
+  * At most 32 programs are tracked; further calls fail with -ENOTSUP.  If the
+  * id of the freshly attached program cannot be queried, the program is
+  * detached again before the error is returned.
+  *
+  * Returns 0 on success or a negative errno value.
+  */
+ int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic,
+ 		       bool force)
+ {
+ 	__u32 prog_id = 0;
+ 	int xdp_flags = 0;
+ 	int ret;
+
+ 	if (sample_xdp_cnt == 32) {
+ 		fprintf(stderr,
+ 			"Total limit for installed XDP programs in a sample reached\n");
+ 		return -ENOTSUP;
+ 	}
+
+ 	if (!force)
+ 		xdp_flags |= XDP_FLAGS_UPDATE_IF_NOEXIST;
+ 	if (generic)
+ 		xdp_flags |= XDP_FLAGS_SKB_MODE;
+ 	else
+ 		xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+ 	if (bpf_xdp_attach(ifindex, bpf_program__fd(xdp_prog), xdp_flags, NULL) < 0) {
+ 		ret = -errno;
+ 		fprintf(stderr,
+ 			"Failed to install program \"%s\" on ifindex %d, mode = %s, "
+ 			"force = %s: %s\n",
+ 			bpf_program__name(xdp_prog), ifindex,
+ 			generic ? "skb" : "native", force ? "true" : "false",
+ 			strerror(-ret));
+ 		return ret;
+ 	}
+
+ 	if (bpf_xdp_query_id(ifindex, xdp_flags, &prog_id) < 0) {
+ 		ret = -errno;
+ 		fprintf(stderr,
+ 			"Failed to get XDP program id for ifindex %d, removing program: %s\n",
+ 			ifindex, strerror(errno));
+ 		/* prog_id 0: detach without checking what is attached */
+ 		__sample_remove_xdp(ifindex, 0, xdp_flags);
+ 		return ret;
+ 	}
+
+ 	sample_xdp_progs[sample_xdp_cnt] =
+ 		(struct xdp_desc){ ifindex, prog_id, xdp_flags };
+ 	sample_xdp_cnt++;
+
+ 	return 0;
+ }
+
+ /*
+  * Print the accumulated totals kept in sample_out.  Lines are only emitted
+  * for counters that are non-zero; the averages divide the total by
+  * sample_out.rx_cnt.num, which sample_summary_update() increments once per
+  * call.
+  */
+ static void sample_summary_print(void)
+ {
+ 	double intervals = sample_out.rx_cnt.num;
+
+ 	if (sample_out.totals.rx) {
+ 		double received = sample_out.totals.rx;
+
+ 		print_always(" Packets received : %'-10llu\n",
+ 			     sample_out.totals.rx);
+ 		print_always(" Average packets/s : %'-10.0f\n",
+ 			     sample_round(received / intervals));
+ 	}
+
+ 	if (sample_out.totals.redir) {
+ 		double redirected = sample_out.totals.redir;
+
+ 		print_always(" Packets redirected : %'-10llu\n",
+ 			     sample_out.totals.redir);
+ 		print_always(" Average redir/s : %'-10.0f\n",
+ 			     sample_round(redirected / intervals));
+ 	}
+
+ 	if (sample_out.totals.drop) {
+ 		print_always(" Rx dropped : %'-10llu\n",
+ 			     sample_out.totals.drop);
+ 	}
+
+ 	if (sample_out.totals.drop_xmit) {
+ 		print_always(" Tx dropped : %'-10llu\n",
+ 			     sample_out.totals.drop_xmit);
+ 	}
+
+ 	if (sample_out.totals.err) {
+ 		print_always(" Errors recorded : %'-10llu\n",
+ 			     sample_out.totals.err);
+ 	}
+
+ 	if (sample_out.totals.xmit) {
+ 		double transmitted = sample_out.totals.xmit;
+
+ 		print_always(" Packets transmitted : %'-10llu\n",
+ 			     sample_out.totals.xmit);
+ 		print_always(" Average transmit/s : %'-10.0f\n",
+ 			     sample_round(transmitted / intervals));
+ 	}
+ }
+
+ /*
+  * Tear the sample down and terminate the process with @status.
+  *
+  * Unmaps all NUM_MAP entries of sample_mmap[], detaches the recorded XDP
+  * programs in reverse order of installation, prints the final summary and
+  * closes the signalfd.  This function does not return.
+  */
+ void sample_exit(int status)
+ {
+ 	int m;
+
+ 	for (m = 0; m < NUM_MAP; m++) {
+ 		size_t len = sample_map_count[m] * sizeof(**sample_mmap);
+
+ 		munmap(sample_mmap[m], len);
+ 	}
+
+ 	while (sample_xdp_cnt--) {
+ 		int slot = sample_xdp_cnt;
+ 		__u32 prog_id = sample_xdp_progs[slot].prog_id;
+ 		int ifindex = sample_xdp_progs[slot].ifindex;
+ 		int xdp_flags = sample_xdp_progs[slot].flags;
+
+ 		/* Failures are ignored; we are exiting anyway */
+ 		__sample_remove_xdp(ifindex, prog_id, xdp_flags);
+ 	}
+
+ 	sample_summary_print();
+ 	close(sample_sig_fd);
+ 	exit(status);
+ }
+
+ /*
+  * Snapshot the statistics selected by sample_mask into @rec.
+  *
+  * The mmap()ed maps are laid out as rows of sample_n_cpus entries; row i of
+  * a map is handed to map_collect_percpu() together with the i-th record of
+  * the matching array in @rec.
+  *
+  * Returns 0, or -EINVAL when collecting the multi devmap xmit map fails.
+  */
+ static int sample_stats_collect(struct stats_record *rec)
+ {
+ 	int row;
+
+ 	if (sample_mask & SAMPLE_RX_CNT) {
+ 		map_collect_percpu(sample_mmap[MAP_RX], &rec->rx_cnt);
+ 	}
+
+ 	/* Row 0 of the redirect error map counts successful redirects */
+ 	if (sample_mask & SAMPLE_REDIRECT_CNT) {
+ 		map_collect_percpu(sample_mmap[MAP_REDIRECT_ERR], &rec->redir_err[0]);
+ 	}
+
+ 	if (sample_mask & SAMPLE_REDIRECT_ERR_CNT) {
+ 		for (row = 1; row < XDP_REDIRECT_ERR_MAX; row++) {
+ 			map_collect_percpu(sample_mmap[MAP_REDIRECT_ERR] + row * sample_n_cpus,
+ 					   &rec->redir_err[row]);
+ 		}
+ 	}
+
+ 	if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) {
+ 		for (row = 0; row < sample_n_cpus; row++) {
+ 			map_collect_percpu(sample_mmap[MAP_CPUMAP_ENQUEUE] + row * sample_n_cpus,
+ 					   &rec->enq[row]);
+ 		}
+ 	}
+
+ 	if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) {
+ 		map_collect_percpu(sample_mmap[MAP_CPUMAP_KTHREAD],
+ 				   &rec->kthread);
+ 	}
+
+ 	if (sample_mask & SAMPLE_EXCEPTION_CNT) {
+ 		for (row = 0; row < XDP_ACTION_MAX; row++) {
+ 			map_collect_percpu(sample_mmap[MAP_EXCEPTION] + row * sample_n_cpus,
+ 					   &rec->exception[row]);
+ 		}
+ 	}
+
+ 	if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) {
+ 		map_collect_percpu(sample_mmap[MAP_DEVMAP_XMIT], &rec->devmap_xmit);
+ 	}
+
+ 	if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) {
+ 		int fd = bpf_map__fd(sample_map[MAP_DEVMAP_XMIT_MULTI]);
+
+ 		if (map_collect_percpu_devmap(fd, rec) < 0)
+ 			return -EINVAL;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * Fold the totals of one sampling period (@out) into the running totals in
+  * sample_out and count the period in sample_out.rx_cnt.num, which
+  * sample_summary_print() uses as the divisor for its averages.
+  */
+ static void sample_summary_update(struct sample_output *out)
+ {
+ 	sample_out.rx_cnt.num++;
+
+ 	sample_out.totals.rx        += out->totals.rx;
+ 	sample_out.totals.xmit      += out->totals.xmit;
+ 	sample_out.totals.redir     += out->totals.redir;
+ 	sample_out.totals.drop      += out->totals.drop;
+ 	sample_out.totals.drop_xmit += out->totals.drop_xmit;
+ 	sample_out.totals.err       += out->totals.err;
+ }
+
+ /*
+  * Compute the totals for one sampling period and print the report.
+  *
+  * Each stats_get_*() helper selected by @mask is first called with
+  * nr_cpus == 0 and a local output structure, so it only accumulates totals.
+  * Those totals are added to the running summary and then passed to
+  * stats_print(), which prints the report under the heading @prog_name.
+  */
+ static void sample_stats_print(int mask, struct stats_record *cur,
+ 			       struct stats_record *prev, char *prog_name)
+ {
+ 	struct sample_output totals = {};
+
+ 	if (mask & SAMPLE_RX_CNT)
+ 		stats_get_rx_cnt(cur, prev, 0, &totals);
+
+ 	if (mask & SAMPLE_REDIRECT_CNT)
+ 		stats_get_redirect_cnt(cur, prev, 0, &totals);
+
+ 	if (mask & SAMPLE_REDIRECT_ERR_CNT)
+ 		stats_get_redirect_err_cnt(cur, prev, 0, &totals);
+
+ 	if (mask & SAMPLE_EXCEPTION_CNT)
+ 		stats_get_exception_cnt(cur, prev, 0, &totals);
+
+ 	/* The multi variant only contributes when the plain one is not set */
+ 	if (mask & SAMPLE_DEVMAP_XMIT_CNT) {
+ 		stats_get_devmap_xmit(cur, prev, 0, &totals);
+ 	} else if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) {
+ 		stats_get_devmap_xmit_multi(cur, prev, 0, &totals,
+ 					    mask & SAMPLE_DEVMAP_XMIT_CNT);
+ 	}
+
+ 	sample_summary_update(&totals);
+
+ 	stats_print(prog_name, mask, cur, prev, &totals);
+ }
+
+ /*
+  * Flip every bit of sample_log_level that lies below LL_DEBUG (the mask is
+  * LL_DEBUG - 1).  Presumably LL_DEBUG is a single bit, making this a toggle
+  * between the lower log levels -- confirm against the LL_* definitions.
+  */
+ void sample_switch_mode(void)
+ {
+ 	sample_log_level = sample_log_level ^ (LL_DEBUG - 1);
+ }
+
+ /*
+  * Handle one pending signal read from sample_sig_fd.
+  *
+  * SIGQUIT switches the output mode and lets the caller carry on (returns 0);
+  * any other signal returns 1.  A newline is printed in both cases.  Returns
+  * -errno when reading the signalfd fails.
+  */
+ static int sample_signal_cb(void)
+ {
+ 	struct signalfd_siginfo info;
+
+ 	if (read(sample_sig_fd, &info, sizeof(info)) < 0)
+ 		return -errno;
+
+ 	if (info.ssi_signo == SIGQUIT) {
+ 		sample_switch_mode();
+ 		printf("\n");
+ 		return 0;
+ 	}
+
+ 	printf("\n");
+ 	return 1;
+ }
+
+ /* Exchange the two stats_record pointers that @a and @b point to */
+ static void swap(struct stats_record **a, struct stats_record **b)
+ {
+ 	struct stats_record *held = *b;
+
+ 	*b = *a;
+ 	*a = held;
+ }
+
+ /*
+  * Timer tick: collect a new snapshot and print the report.
+  *
+  * The timerfd is read to acknowledge the expiry, then @rec and @prev are
+  * exchanged so that the previous snapshot becomes *@prev and the new one is
+  * collected into *@rec.  When exactly two XDP programs are installed and
+  * SAMPLE_SKIP_HEADING is not set, the heading is "<if0>->< if1>" built from
+  * their interface names ("?" when a name cannot be resolved); otherwise it
+  * is "Summary".
+  *
+  * Returns 0 on success or a negative error code.
+  */
+ static int sample_timer_cb(int timerfd, struct stats_record **rec,
+ 			   struct stats_record **prev)
+ {
+ 	char heading[64] = "Summary";
+ 	__u64 expirations;
+ 	int ret;
+
+ 	ret = read(timerfd, &expirations, sizeof(expirations));
+ 	if (ret < 0)
+ 		return -errno;
+
+ 	swap(prev, rec);
+ 	ret = sample_stats_collect(*rec);
+ 	if (ret < 0)
+ 		return ret;
+
+ 	if (sample_xdp_cnt == 2 && !(sample_mask & SAMPLE_SKIP_HEADING)) {
+ 		char from_buf[IFNAMSIZ];
+ 		char to_buf[IFNAMSIZ];
+ 		const char *from = NULL;
+ 		const char *to = NULL;
+
+ 		if (if_indextoname(sample_xdp_progs[0].ifindex, from_buf))
+ 			from = from_buf;
+ 		if (if_indextoname(sample_xdp_progs[1].ifindex, to_buf))
+ 			to = to_buf;
+
+ 		snprintf(heading, sizeof(heading), "%s->%s", from ?: "?",
+ 			 to ?: "?");
+ 	}
+
+ 	sample_stats_print(sample_mask, *rec, *prev, heading);
+ 	return 0;
+ }
+
+ /*
+  * Main loop of a sample: print statistics every @interval seconds until a
+  * terminating signal arrives.
+  *
+  * @interval: sampling period in seconds, must be non-zero
+  * @post_cb:  optional callback invoked with @ctx after every handled event
+  * @ctx:      opaque argument for @post_cb
+  *
+  * The loop polls the signalfd created by __sample_init() and a periodic
+  * timerfd.  It ends when a callback returns non-zero (sample_signal_cb()
+  * returns 1 for any signal other than SIGQUIT) or when poll() fails with
+  * something other than EINTR.
+  *
+  * Returns the non-zero value that ended the loop, or a negative errno value
+  * on failure (-EINVAL for a zero interval, -ENOMEM when a stats record
+  * cannot be allocated).
+  */
+ int sample_run(int interval, void (*post_cb)(void *), void *ctx)
+ {
+ 	struct timespec ts = { interval, 0 };
+ 	struct itimerspec its = { ts, ts };
+ 	struct stats_record *rec, *prev;
+ 	struct pollfd pfd[2] = {};
+ 	int timerfd, ret;
+
+ 	if (!interval) {
+ 		fprintf(stderr, "Incorrect interval 0\n");
+ 		return -EINVAL;
+ 	}
+ 	sample_interval = interval;
+ 	/* Pretty print numbers */
+ 	setlocale(LC_NUMERIC, "en_US.UTF-8");
+
+ 	timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK);
+ 	if (timerfd < 0)
+ 		return -errno;
+ 	/*
+ 	 * An unarmed timer would leave the loop below waiting for signals
+ 	 * only and never printing anything, so fail loudly instead.
+ 	 */
+ 	if (timerfd_settime(timerfd, 0, &its, NULL) < 0) {
+ 		ret = -errno;
+ 		goto end;
+ 	}
+
+ 	pfd[0].fd = sample_sig_fd;
+ 	pfd[0].events = POLLIN;
+
+ 	pfd[1].fd = timerfd;
+ 	pfd[1].events = POLLIN;
+
+ 	ret = -ENOMEM;
+ 	rec = alloc_stats_record();
+ 	if (!rec)
+ 		goto end;
+ 	prev = alloc_stats_record();
+ 	if (!prev)
+ 		goto end_rec;
+
+ 	/* Baseline snapshot for the first period */
+ 	ret = sample_stats_collect(rec);
+ 	if (ret < 0)
+ 		goto end_rec_prev;
+
+ 	for (;;) {
+ 		ret = poll(pfd, 2, -1);
+ 		if (ret < 0) {
+ 			if (errno == EINTR)
+ 				continue;
+ 			/* Report the cause rather than poll()'s bare -1 */
+ 			ret = -errno;
+ 			break;
+ 		}
+
+ 		/* Signals take precedence over a simultaneous timer tick */
+ 		if (pfd[0].revents & POLLIN)
+ 			ret = sample_signal_cb();
+ 		else if (pfd[1].revents & POLLIN)
+ 			ret = sample_timer_cb(timerfd, &rec, &prev);
+
+ 		if (ret)
+ 			break;
+
+ 		if (post_cb)
+ 			post_cb(ctx);
+ 	}
+
+ end_rec_prev:
+ 	free_stats_record(prev);
+ end_rec:
+ 	free_stats_record(rec);
+ end:
+ 	close(timerfd);
+
+ 	return ret;
+ }
+
+ /*
+  * Look up the driver name of interface @ifindex with the ETHTOOL_GDRVINFO
+  * ioctl.
+  *
+  * Returns a pointer to a static buffer holding the name (overwritten by the
+  * next successful call), "loopback" when the lookup fails with EOPNOTSUPP,
+  * or "[error]" on any other failure.
+  */
+ const char *get_driver_name(int ifindex)
+ {
+ 	struct ethtool_drvinfo info = {};
+ 	char ifname[IF_NAMESIZE];
+ 	static char drvname[32];
+ 	struct ifreq req = {};
+ 	int sock, saved_errno;
+
+ 	sock = socket(AF_INET, SOCK_DGRAM, 0);
+ 	if (sock < 0)
+ 		return "[error]";
+
+ 	if (if_indextoname(ifindex, ifname)) {
+ 		info.cmd = ETHTOOL_GDRVINFO;
+ 		safe_strncpy(req.ifr_name, ifname, sizeof(req.ifr_name));
+ 		req.ifr_data = (void *)&info;
+
+ 		if (!ioctl(sock, SIOCETHTOOL, &req)) {
+ 			safe_strncpy(drvname, info.driver, sizeof(drvname));
+ 			close(sock);
+ 			return drvname;
+ 		}
+ 	}
+
+ 	/* Grab errno before close() gets a chance to change it */
+ 	saved_errno = errno;
+ 	close(sock);
+ 	return saved_errno == EOPNOTSUPP ? "loopback" : "[error]";
+ }
+
+ /*
+  * Fetch the hardware address of interface @ifindex with SIOCGIFHWADDR.
+  *
+  * @mac_addr: destination buffer; exactly 6 bytes are written on success
+  *
+  * Returns 0 on success or a negative errno value.
+  */
+ int get_mac_addr(int ifindex, void *mac_addr)
+ {
+ 	char ifname[IF_NAMESIZE];
+ 	struct ifreq req = {};
+ 	int sock, ret = 0;
+
+ 	sock = socket(AF_INET, SOCK_DGRAM, 0);
+ 	if (sock < 0)
+ 		return -errno;
+
+ 	if (!if_indextoname(ifindex, ifname)) {
+ 		ret = -errno;
+ 	} else {
+ 		safe_strncpy(req.ifr_name, ifname, sizeof(req.ifr_name));
+
+ 		if (ioctl(sock, SIOCGIFHWADDR, &req))
+ 			ret = -errno;
+ 		else
+ 			memcpy(mac_addr, req.ifr_hwaddr.sa_data, 6 * sizeof(char));
+ 	}
+
+ 	close(sock);
+ 	return ret;
+ }
+
+ /*
+  * Runs before main(): switch libbpf to strict mode, and exit with
+  * EXIT_FAIL_BPF if that is not possible.
+  */
+ __attribute__((constructor)) static void sample_ctor(void)
+ {
+ 	if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) >= 0)
+ 		return;
+
+ 	fprintf(stderr, "Failed to set libbpf strict mode: %s\n",
+ 		strerror(errno));
+ 	/* Just exit, nothing to cleanup right now */
+ 	exit(EXIT_FAIL_BPF);
+ }
diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h
new file mode 100644
index 000000000000..f45051679977
--- /dev/null
+++ b/samples/bpf/xdp_sample_user.h
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef XDP_SAMPLE_USER_H
+#define XDP_SAMPLE_USER_H
+
+#include <bpf/libbpf.h>
+#include <linux/compiler.h>
+
+#include "xdp_sample_shared.h"
+
+ /*
+  * Bit flags selecting which statistics a sample collects and prints.  The
+  * value is passed as the 'mask' argument of sample_init()/__sample_init().
+  */
+ enum stats_mask {
+ /* Modifier bit, only used through the *_MAP_CNT combinations below */
+ _SAMPLE_REDIRECT_MAP = 1U << 0,
+ SAMPLE_RX_CNT = 1U << 1,
+ SAMPLE_REDIRECT_ERR_CNT = 1U << 2,
+ SAMPLE_CPUMAP_ENQUEUE_CNT = 1U << 3,
+ SAMPLE_CPUMAP_KTHREAD_CNT = 1U << 4,
+ SAMPLE_EXCEPTION_CNT = 1U << 5,
+ SAMPLE_DEVMAP_XMIT_CNT = 1U << 6,
+ SAMPLE_REDIRECT_CNT = 1U << 7,
+ /* The base counter bit combined with the redirect-map modifier */
+ SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP,
+ SAMPLE_REDIRECT_ERR_MAP_CNT = SAMPLE_REDIRECT_ERR_CNT | _SAMPLE_REDIRECT_MAP,
+ SAMPLE_DEVMAP_XMIT_CNT_MULTI = 1U << 8,
+ /* Keep the "Summary" heading instead of the "<from>-><to>" interface pair */
+ SAMPLE_SKIP_HEADING = 1U << 9,
+ };
+
+ /*
+  * Exit return codes.  Within this library EXIT_FAIL_BPF is the status used
+  * when libbpf strict mode cannot be enabled at start-up.
+  */
+ #define EXIT_OK 0
+ #define EXIT_FAIL 1
+ #define EXIT_FAIL_OPTION 2
+ #define EXIT_FAIL_XDP 3
+ #define EXIT_FAIL_BPF 4
+ #define EXIT_FAIL_MEM 5
+
+ /*
+  * Store the sample's maps and set max_entries of the statistics maps from
+  * the number of possible CPUs.  Returns 0 or a negative errno value.
+  */
+ int sample_setup_maps(struct bpf_map **maps);
+ /*
+  * Block SIGQUIT/SIGINT/SIGTERM, create the signalfd, record the stats mask
+  * and mmap() the statistics maps.  Returns 0 or a negative errno value.
+  */
+ int __sample_init(int mask);
+ /*
+  * Unmap the statistics maps, detach the programs installed with
+  * sample_install_xdp(), print the summary and exit() with the given status.
+  */
+ void sample_exit(int status);
+ /*
+  * Print statistics every 'interval' seconds until a signal other than
+  * SIGQUIT arrives; post_cb(ctx), if set, runs after every handled event.
+  * Returns non-zero when the loop ends, negative on error.
+  */
+ int sample_run(int interval, void (*post_cb)(void *), void *ctx);
+
+ /* Toggle the log level bits below LL_DEBUG */
+ void sample_switch_mode(void);
+ /*
+  * Attach xdp_prog to ifindex (skb mode when 'generic', driver mode
+  * otherwise; replace an existing program only when 'force') and record it
+  * for removal by sample_exit().  Returns 0 or a negative errno value.
+  */
+ int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic,
+ bool force);
+ void sample_usage(char *argv[], const struct option *long_options,
+ const char *doc, int mask, bool error);
+
+ /*
+  * Driver name of the interface; returns a static buffer, "loopback" or
+  * "[error]".
+  */
+ const char *get_driver_name(int ifindex);
+ /* Copy the 6-byte hardware address into mac_addr; 0 or negative errno */
+ int get_mac_addr(int ifindex, void *mac_addr);
+
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
+#endif
+ /*
+  * Copy at most @size - 1 bytes of @src into @dst and always terminate the
+  * result.  As with strncpy(), the remainder of @dst is zero-filled when @src
+  * is shorter.  A @size of 0 leaves @dst untouched.  Returns @dst.
+  */
+ __attribute__((unused))
+ static inline char *safe_strncpy(char *dst, const char *src, size_t size)
+ {
+ 	if (size != 0) {
+ 		strncpy(dst, src, size - 1);
+ 		dst[size - 1] = '\0';
+ 	}
+
+ 	return dst;
+ }
+#pragma GCC diagnostic pop
+
+ /*
+  * Attach the skeleton program 'name' and store the link in the skeleton.
+  *
+  * Expects a variable called 'skel' in the calling scope.  The macro
+  * contains return statements: it makes the *enclosing function* return
+  * -EINVAL when the program is not of type BPF_PROG_TYPE_TRACING, or -errno
+  * when bpf_program__attach() yields no link.
+  */
+ #define __attach_tp(name) \
+ ({ \
+ if (bpf_program__type(skel->progs.name) != BPF_PROG_TYPE_TRACING)\
+ return -EINVAL; \
+ skel->links.name = bpf_program__attach(skel->progs.name); \
+ if (!skel->links.name) \
+ return -errno; \
+ })
+
+ /*
+  * Prepare a skeleton before it is loaded: store the possible-CPU count in
+  * its read-only data and hand its statistics maps to sample_setup_maps().
+  *
+  * The order of the maps in the array matters, because sample_setup_maps()
+  * uses the array index as the map type.  The macro evaluates to the return
+  * value of sample_setup_maps().
+  */
+ #define sample_init_pre_load(skel) \
+ ({ \
+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); \
+ sample_setup_maps((struct bpf_map *[]){ \
+ skel->maps.rx_cnt, skel->maps.redir_err_cnt, \
+ skel->maps.cpumap_enqueue_cnt, \
+ skel->maps.cpumap_kthread_cnt, \
+ skel->maps.exception_cnt, skel->maps.devmap_xmit_cnt, \
+ skel->maps.devmap_xmit_cnt_multi }); \
+ })
+
+ /*
+  * Define 'static int sample_init(struct name *skel, int mask)' for the
+  * skeleton type 'struct name'.
+  *
+  * The generated function calls __sample_init(mask) and then attaches one
+  * tracing program per statistic selected in 'mask'.  It returns 0 on
+  * success or the negative error produced by __sample_init() or
+  * __attach_tp().
+  *
+  * NOTE(review): the *_MAP_CNT values are two-bit combinations, so the
+  * 'mask & SAMPLE_REDIRECT_MAP_CNT' test is also true when only
+  * SAMPLE_REDIRECT_CNT is set (likewise for the _ERR_ variant) -- confirm
+  * this is intended.
+  */
+ #define DEFINE_SAMPLE_INIT(name) \
+ static int sample_init(struct name *skel, int mask) \
+ { \
+ int ret; \
+ ret = __sample_init(mask); \
+ if (ret < 0) \
+ return ret; \
+ if (mask & SAMPLE_REDIRECT_MAP_CNT) \
+ __attach_tp(tp_xdp_redirect_map); \
+ if (mask & SAMPLE_REDIRECT_CNT) \
+ __attach_tp(tp_xdp_redirect); \
+ if (mask & SAMPLE_REDIRECT_ERR_MAP_CNT) \
+ __attach_tp(tp_xdp_redirect_map_err); \
+ if (mask & SAMPLE_REDIRECT_ERR_CNT) \
+ __attach_tp(tp_xdp_redirect_err); \
+ if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) \
+ __attach_tp(tp_xdp_cpumap_enqueue); \
+ if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) \
+ __attach_tp(tp_xdp_cpumap_kthread); \
+ if (mask & SAMPLE_EXCEPTION_CNT) \
+ __attach_tp(tp_xdp_exception); \
+ if (mask & SAMPLE_DEVMAP_XMIT_CNT) \
+ __attach_tp(tp_xdp_devmap_xmit); \
+ if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) \
+ __attach_tp(tp_xdp_devmap_xmit_multi); \
+ return 0; \
+ }
+
+#endif
diff --git a/samples/bpf/xdp_tx_iptunnel_common.h b/samples/bpf/xdp_tx_iptunnel_common.h
new file mode 100644
index 000000000000..be839892caff
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_common.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2016 Facebook
+ */
+#ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
+#define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
+
+#include <linux/types.h>
+
+#define MAX_IPTNL_ENTRIES 256U
+
+ /*
+  * Lookup key identifying a virtual IP service; used as the key type of the
+  * vip2tnl hash map in the XDP program.
+  *
+  * The XDP program copies daddr and dport straight from the packet headers
+  * without byte-order conversion.
+  * NOTE(review): the program zero-initialises the whole key before filling
+  * it in; user space presumably has to do the same for lookups to match --
+  * confirm.
+  */
+ struct vip {
+ /* Destination address; which member is valid depends on 'family' */
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } daddr;
+ /* Destination port; the XDP program uses 0 for non-TCP/UDP protocols */
+ __u16 dport;
+ /* AF_INET or AF_INET6 */
+ __u16 family;
+ /* IP protocol number (IPv4 protocol / IPv6 next header) */
+ __u8 protocol;
+ };
+
+ /*
+  * Tunnel description stored as the value of the vip2tnl map: the addresses
+  * written into the outer IP header and the destination MAC of the frame.
+  */
+ struct iptnl_info {
+ /* Source address of the outer header */
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } saddr;
+ /* Destination address of the outer header */
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } daddr;
+ /* AF_INET or AF_INET6; must match the family of the packet being tunnelled */
+ __u16 family;
+ /* Destination MAC address for the transmitted frame */
+ __u8 dmac[6];
+ };
+
+#endif
diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c
new file mode 100644
index 000000000000..0e2bca3a3fff
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_kern.c
@@ -0,0 +1,237 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program shows how to use bpf_xdp_adjust_head() by
+ * encapsulating the incoming packet in an IPv4/v6 header
+ * and then XDP_TX it out.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <bpf/bpf_helpers.h>
+#include "xdp_tx_iptunnel_common.h"
+
+ /*
+  * Per-CPU packet counters with 256 slots; count_tx() uses the IP protocol
+  * number of the packet as the index.
+  */
+ struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, __u32);
+ __type(value, __u64);
+ __uint(max_entries, 256);
+ } rxcnt SEC(".maps");
+
+ /*
+  * Hash map from a virtual IP service (struct vip) to the tunnel that its
+  * packets are encapsulated into (struct iptnl_info); holds up to
+  * MAX_IPTNL_ENTRIES entries.
+  */
+ struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct vip);
+ __type(value, struct iptnl_info);
+ __uint(max_entries, MAX_IPTNL_ENTRIES);
+ } vip2tnl SEC(".maps");
+
+ /* Bump this CPU's rxcnt counter for @protocol, if the slot exists */
+ static __always_inline void count_tx(u32 protocol)
+ {
+ 	u64 *slot = bpf_map_lookup_elem(&rxcnt, &protocol);
+
+ 	if (!slot)
+ 		return;
+
+ 	*slot += 1;
+ }
+
+ /*
+  * Read the destination port of the transport header at @trans_data.
+  *
+  * Returns the port exactly as stored in the TCP/UDP header (no byte-order
+  * conversion), 0 for any other protocol, or -1 when the header would extend
+  * past @data_end.
+  */
+ static __always_inline int get_dport(void *trans_data, void *data_end,
+ 				     u8 protocol)
+ {
+ 	if (protocol == IPPROTO_TCP) {
+ 		struct tcphdr *tcp = (struct tcphdr *)trans_data;
+
+ 		if (tcp + 1 > data_end)
+ 			return -1;
+ 		return tcp->dest;
+ 	}
+
+ 	if (protocol == IPPROTO_UDP) {
+ 		struct udphdr *udp = (struct udphdr *)trans_data;
+
+ 		if (udp + 1 > data_end)
+ 			return -1;
+ 		return udp->dest;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * Fill in the Ethernet header of the encapsulated frame: the source MAC is
+  * the destination MAC of the original frame, the destination MAC comes
+  * from the tunnel entry, and the EtherType is @h_proto.
+  */
+ static __always_inline void set_ethhdr(struct ethhdr *new_eth,
+ 				       const struct ethhdr *old_eth,
+ 				       const struct iptnl_info *tnl,
+ 				       __be16 h_proto)
+ {
+ 	memcpy(new_eth->h_source, old_eth->h_dest,
+ 	       sizeof(new_eth->h_source));
+ 	memcpy(new_eth->h_dest, tnl->dmac,
+ 	       sizeof(new_eth->h_dest));
+
+ 	new_eth->h_proto = h_proto;
+ }
+
+ /*
+  * Encapsulate an IPv4 packet whose destination matches a vip2tnl entry in a
+  * new outer IPv4 header (IPPROTO_IPIP) and send it back out.
+  *
+  * Returns XDP_DROP when a header is truncated or the headroom cannot be
+  * grown, XDP_PASS when no IPv4 tunnel is configured for the destination,
+  * and XDP_TX once the packet has been encapsulated.
+  */
+ static __always_inline int handle_ipv4(struct xdp_md *xdp)
+ {
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ struct iphdr *iph = data + sizeof(struct ethhdr);
+ u16 *next_iph_u16;
+ u16 payload_len;
+ struct vip vip = {};
+ int dport;
+ u32 csum = 0;
+ int i;
+
+ if (iph + 1 > data_end)
+ return XDP_DROP;
+
+ /* The transport header is taken to start right after a struct iphdr */
+ dport = get_dport(iph + 1, data_end, iph->protocol);
+ if (dport == -1)
+ return XDP_DROP;
+
+ vip.protocol = iph->protocol;
+ vip.family = AF_INET;
+ vip.daddr.v4 = iph->daddr;
+ vip.dport = dport;
+ /* Length of the whole inner packet; it becomes the outer payload */
+ payload_len = ntohs(iph->tot_len);
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v4-in-v4 */
+ if (!tnl || tnl->family != AF_INET)
+ return XDP_PASS;
+
+ /* The vip key is found. Add an IP header and send it out */
+
+ /* A negative delta moves the packet start back by one IPv4 header */
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
+ return XDP_DROP;
+
+ /* Re-read the packet pointers after the head adjustment */
+ data = (void *)(long)xdp->data;
+ data_end = (void *)(long)xdp->data_end;
+
+ /*
+  * New layout: [new_eth][outer iph][inner packet].  old_eth points at the
+  * place where the original Ethernet header still sits.
+  */
+ new_eth = data;
+ iph = data + sizeof(*new_eth);
+ old_eth = data + sizeof(*iph);
+
+ if (new_eth + 1 > data_end ||
+ old_eth + 1 > data_end ||
+ iph + 1 > data_end)
+ return XDP_DROP;
+
+ set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));
+
+ iph->version = 4;
+ /* Header length in 32-bit words */
+ iph->ihl = sizeof(*iph) >> 2;
+ iph->frag_off = 0;
+ iph->protocol = IPPROTO_IPIP;
+ /* Must be zero while the checksum is being computed below */
+ iph->check = 0;
+ iph->tos = 0;
+ iph->tot_len = htons(payload_len + sizeof(*iph));
+ iph->daddr = tnl->daddr.v4;
+ iph->saddr = tnl->saddr.v4;
+ iph->ttl = 8;
+
+ /* Header checksum: sum the 16-bit words, fold the carry, complement */
+ next_iph_u16 = (u16 *)iph;
+ #pragma clang loop unroll(full)
+ for (i = 0; i < sizeof(*iph) >> 1; i++)
+ csum += *next_iph_u16++;
+
+ iph->check = ~((csum & 0xffff) + (csum >> 16));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+ }
+
+ /*
+  * Encapsulate an IPv6 packet whose destination matches a vip2tnl entry in a
+  * new outer IPv6 header (next header IPPROTO_IPV6) and send it back out.
+  *
+  * Returns XDP_DROP when a header is truncated or the headroom cannot be
+  * grown, XDP_PASS when no IPv6 tunnel is configured for the destination,
+  * and XDP_TX once the packet has been encapsulated.
+  */
+ static __always_inline int handle_ipv6(struct xdp_md *xdp)
+ {
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
+ __u16 payload_len;
+ struct vip vip = {};
+ int dport;
+
+ if (ip6h + 1 > data_end)
+ return XDP_DROP;
+
+ /* The transport header is taken to follow the fixed IPv6 header directly */
+ dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
+ if (dport == -1)
+ return XDP_DROP;
+
+ vip.protocol = ip6h->nexthdr;
+ vip.family = AF_INET6;
+ memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
+ vip.dport = dport;
+ /* Kept in network byte order; converted when the outer length is set */
+ payload_len = ip6h->payload_len;
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v6-in-v6 */
+ if (!tnl || tnl->family != AF_INET6)
+ return XDP_PASS;
+
+ /* The vip key is found. Add an IP header and send it out */
+
+ /* A negative delta moves the packet start back by one IPv6 header */
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
+ return XDP_DROP;
+
+ /* Re-read the packet pointers after the head adjustment */
+ data = (void *)(long)xdp->data;
+ data_end = (void *)(long)xdp->data_end;
+
+ /*
+  * New layout: [new_eth][outer ip6h][inner packet].  old_eth points at the
+  * place where the original Ethernet header still sits.
+  */
+ new_eth = data;
+ ip6h = data + sizeof(*new_eth);
+ old_eth = data + sizeof(*ip6h);
+
+ if (new_eth + 1 > data_end ||
+ old_eth + 1 > data_end ||
+ ip6h + 1 > data_end)
+ return XDP_DROP;
+
+ set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6));
+
+ ip6h->version = 6;
+ ip6h->priority = 0;
+ memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
+ /* The outer payload is the inner payload plus the inner IPv6 header */
+ ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h));
+ ip6h->nexthdr = IPPROTO_IPV6;
+ ip6h->hop_limit = 8;
+ memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
+ memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+ }
+
+ /*
+  * XDP entry point: dispatch on the EtherType.  IPv4 and IPv6 frames go to
+  * their tunnel handlers, frames too short for an Ethernet header are
+  * dropped, and everything else is passed up the stack untouched.
+  */
+ SEC("xdp.frags")
+ int _xdp_tx_iptunnel(struct xdp_md *xdp)
+ {
+ 	void *data_end = (void *)(long)xdp->data_end;
+ 	void *data = (void *)(long)xdp->data;
+ 	struct ethhdr *eth = data;
+ 	__u16 h_proto;
+
+ 	if (eth + 1 > data_end)
+ 		return XDP_DROP;
+
+ 	h_proto = eth->h_proto;
+
+ 	if (h_proto == htons(ETH_P_IP))
+ 		return handle_ipv4(xdp);
+
+ 	if (h_proto == htons(ETH_P_IPV6))
+ 		return handle_ipv6(xdp);
+
+ 	return XDP_PASS;
+ }
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
new file mode 100644
index 000000000000..7e4b2f7108a6
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/if.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "bpf_util.h"
+#include "xdp_tx_iptunnel_common.h"
+
+/* Seconds slept between two statistics passes in poll_stats(). */
+#define STATS_INTERVAL_S 2U
+
+/* Interface index chosen with -i; stays -1 until that option is parsed. */
+static int ifindex = -1;
+/* Flags handed to the bpf_xdp_* calls; -S and -F modify them in main(). */
+static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+/* File descriptor of the "rxcnt" map, read by poll_stats(). */
+static int rxcnt_map_fd;
+/* Id of the program attached by main(); int_exit() compares against it. */
+static __u32 prog_id;
+
+/*
+ * Handler installed for SIGINT and SIGTERM: detach the XDP program from the
+ * interface, but only when the program currently attached there is still the
+ * one recorded in prog_id. Always terminates the process.
+ */
+static void int_exit(int sig)
+{
+	__u32 attached_id = 0;
+
+	/* No interface was selected, so there is nothing to detach. */
+	if (ifindex <= -1)
+		exit(0);
+
+	if (bpf_xdp_query_id(ifindex, xdp_flags, &attached_id) != 0) {
+		printf("bpf_xdp_query_id failed\n");
+		exit(1);
+	}
+
+	if (attached_id == prog_id)
+		bpf_xdp_detach(ifindex, xdp_flags, NULL);
+	else if (attached_id == 0)
+		printf("couldn't find a prog id on a given iface\n");
+	else
+		printf("program on interface changed, not removing\n");
+
+	exit(0);
+}
+
+/*
+ * Simple per-protocol packet counter.
+ *
+ * Every STATS_INTERVAL_S seconds, read one value per possible CPU for each of
+ * the 256 protocol keys of the rxcnt map, add up the per-CPU deltas since the
+ * previous pass and print every protocol whose counter moved.
+ *
+ * @kill_after_s: stop once more than this many seconds have elapsed since the
+ *                call; 0 means poll forever.
+ */
+static void poll_stats(unsigned int kill_after_s)
+{
+	const unsigned int nr_protos = 256;
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	time_t started_at = time(NULL);
+	__u64 values[nr_cpus], prev[nr_protos][nr_cpus];
+	__u32 proto;
+	unsigned int i;
+
+	memset(prev, 0, sizeof(prev));
+
+	while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
+		sleep(STATS_INTERVAL_S);
+
+		for (proto = 0; proto < nr_protos; proto++) {
+			__u64 sum = 0;
+			int err;
+
+			/*
+			 * Keep the lookup outside assert(): the macro expands
+			 * to nothing when NDEBUG is defined, which would drop
+			 * the call itself and leave values[] stale.
+			 */
+			err = bpf_map_lookup_elem(rxcnt_map_fd, &proto,
+						  values);
+			assert(err == 0);
+			(void)err;
+
+			for (i = 0; i < nr_cpus; i++)
+				sum += (values[i] - prev[proto][i]);
+
+			if (sum)
+				printf("proto %u: sum:%10llu pkts, rate:%10llu pkts/s\n",
+				       proto, (unsigned long long)sum,
+				       (unsigned long long)(sum / STATS_INTERVAL_S));
+			memcpy(prev[proto], values, sizeof(values));
+		}
+	}
+}
+
+/* Print the tool description and the list of command line options. */
+static void usage(const char *cmd)
+{
+	static const char *const option_help[] = {
+		" -i <ifname|ifindex> Interface\n",
+		" -a <vip-service-address> IPv4 or IPv6\n",
+		" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n",
+		" -s <source-ip> Used in the IPTunnel header\n",
+		" -d <dest-ip> Used in the IPTunnel header\n",
+		" -m <dest-MAC> Used in sending the IP Tunneled pkt\n",
+		" -T <stop-after-X-seconds> Default: 0 (forever)\n",
+		" -P <IP-Protocol> Default is TCP\n",
+		" -S use skb-mode\n",
+		" -N enforce native mode\n",
+		" -F Force loading the XDP prog\n",
+		" -h Display this help\n",
+	};
+	size_t n;
+
+	printf("Start a XDP prog which encapsulates incoming packets\n"
+	       "in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n"
+	       "is used to select packets to encapsulate\n\n");
+	printf("Usage: %s [...]\n", cmd);
+
+	/* None of the option lines contains a conversion, so emit them as is. */
+	for (n = 0; n < sizeof(option_help) / sizeof(option_help[0]); n++)
+		fputs(option_help[n], stdout);
+}
+
+/*
+ * Convert a textual IPv6 or IPv4 address into binary form.
+ *
+ * @ipstr: address string; IPv6 is tried first, then IPv4.
+ * @addr:  destination of four 32-bit words. An IPv4 address occupies the
+ *         first word and the other three are cleared.
+ *
+ * Returns AF_INET6 or AF_INET on success. On failure an error is printed to
+ * stderr and AF_UNSPEC is returned.
+ */
+static int parse_ipstr(const char *ipstr, unsigned int *addr)
+{
+	if (inet_pton(AF_INET6, ipstr, addr) == 1)
+		return AF_INET6;
+
+	if (inet_pton(AF_INET, ipstr, addr) != 1) {
+		fprintf(stderr, "%s is an invalid IP\n", ipstr);
+		return AF_UNSPEC;
+	}
+
+	/* Only addr[0] was written for IPv4; zero the remaining words. */
+	memset(&addr[1], 0, 3 * sizeof(addr[0]));
+	return AF_INET;
+}
+
+/*
+ * Parse a single port ("80") or an inclusive port range ("433-444").
+ *
+ * @port_str: string to parse; the global optarg is no longer consulted.
+ * @min_port: receives the first port of the range.
+ * @max_port: receives the last port of the range (equal to *min_port for a
+ *            single port).
+ *
+ * Each port must lie in 1..65535, the range must be ascending and must not
+ * cover more than MAX_IPTNL_ENTRIES ports. A missing number or characters
+ * after the last number are rejected.
+ *
+ * Returns 0 on success. On failure a message is printed to stderr, 1 is
+ * returned and the outputs are left untouched.
+ */
+static int parse_ports(const char *port_str, int *min_port, int *max_port)
+{
+	char *end;
+	long tmp_min_port;
+	long tmp_max_port;
+
+	tmp_min_port = strtol(port_str, &end, 10);
+	if (end == port_str || tmp_min_port < 1 || tmp_min_port > 65535) {
+		fprintf(stderr, "Invalid port(s):%s\n", port_str);
+		return 1;
+	}
+
+	if (*end == '-') {
+		const char *max_str = end + 1;
+
+		tmp_max_port = strtol(max_str, &end, 10);
+		if (end == max_str || tmp_max_port < 1 ||
+		    tmp_max_port > 65535) {
+			fprintf(stderr, "Invalid port(s):%s\n", port_str);
+			return 1;
+		}
+	} else {
+		tmp_max_port = tmp_min_port;
+	}
+
+	/* strtol() stops at the first non-digit; anything left is garbage. */
+	if (*end != '\0') {
+		fprintf(stderr, "Invalid port(s):%s\n", port_str);
+		return 1;
+	}
+
+	if (tmp_min_port > tmp_max_port) {
+		fprintf(stderr, "Invalid port(s):%s\n", port_str);
+		return 1;
+	}
+
+	if (tmp_max_port - tmp_min_port + 1 > MAX_IPTNL_ENTRIES) {
+		fprintf(stderr, "Port range (%s) is larger than %u\n",
+			port_str, MAX_IPTNL_ENTRIES);
+		return 1;
+	}
+	*min_port = tmp_min_port;
+	*max_port = tmp_max_port;
+
+	return 0;
+}
+
+/*
+ * Parse the command line, load "<argv[0]>_kern.o", fill the vip2tnl map with
+ * one entry per port of the requested range, attach the XDP program to the
+ * selected interface and print statistics until poll_stats() returns.
+ *
+ * Every lowercase option of optstr except -h is mandatory.
+ *
+ * Returns 0 on success and 1 on any failure. If the program information
+ * cannot be read after a successful attach, the program is detached again
+ * before returning.
+ */
+int main(int argc, char **argv)
+{
+	int min_port = 0, max_port = 0, vip2tnl_map_fd;
+	const char *optstr = "i:a:p:s:d:m:T:P:FSNh";
+	unsigned char opt_flags[256] = {};
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	unsigned int kill_after_s = 0;
+	struct iptnl_info tnl = {};
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	struct vip vip = {};
+	char filename[256];
+	int opt, prog_fd;
+	int i, err;
+
+	tnl.family = AF_UNSPEC;
+	vip.protocol = IPPROTO_TCP;
+
+	/* Mark the mandatory options; each one is cleared once it is seen. */
+	for (i = 0; i < strlen(optstr); i++)
+		if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
+			opt_flags[(unsigned char)optstr[i]] = 1;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+		unsigned short family;
+		unsigned int *v6;
+
+		switch (opt) {
+		case 'i':
+			/* Accept an interface name or a numeric index. */
+			ifindex = if_nametoindex(optarg);
+			if (!ifindex)
+				ifindex = atoi(optarg);
+			break;
+		case 'a':
+			vip.family = parse_ipstr(optarg, vip.daddr.v6);
+			if (vip.family == AF_UNSPEC)
+				return 1;
+			break;
+		case 'p':
+			if (parse_ports(optarg, &min_port, &max_port))
+				return 1;
+			break;
+		case 'P':
+			vip.protocol = atoi(optarg);
+			break;
+		case 's':
+		case 'd':
+			if (opt == 's')
+				v6 = tnl.saddr.v6;
+			else
+				v6 = tnl.daddr.v6;
+
+			family = parse_ipstr(optarg, v6);
+			if (family == AF_UNSPEC)
+				return 1;
+			/* Both tunnel endpoints must share one IP version. */
+			if (tnl.family == AF_UNSPEC) {
+				tnl.family = family;
+			} else if (tnl.family != family) {
+				fprintf(stderr,
+					"The IP version of the src and dst addresses used in the IP encapsulation does not match\n");
+				return 1;
+			}
+			break;
+		case 'm':
+			if (!ether_aton_r(optarg,
+					  (struct ether_addr *)tnl.dmac)) {
+				fprintf(stderr, "Invalid mac address:%s\n",
+					optarg);
+				return 1;
+			}
+			break;
+		case 'T':
+			kill_after_s = atoi(optarg);
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'N':
+			/* default, set below */
+			break;
+		case 'F':
+			xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+			break;
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+		opt_flags[opt] = 0;
+	}
+
+	if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
+		xdp_flags |= XDP_FLAGS_DRV_MODE;
+
+	for (i = 0; i < strlen(optstr); i++) {
+		if (opt_flags[(unsigned char)optstr[i]]) {
+			fprintf(stderr, "Missing argument -%c\n", optstr[i]);
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (!ifindex) {
+		fprintf(stderr, "Invalid ifname\n");
+		return 1;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	obj = bpf_object__open_file(filename, NULL);
+	if (libbpf_get_error(obj))
+		return 1;
+
+	/* An object without any program would hand NULL to the calls below. */
+	prog = bpf_object__next_program(obj, NULL);
+	if (!prog) {
+		printf("no program found in %s\n", filename);
+		return 1;
+	}
+	bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
+
+	err = bpf_object__load(obj);
+	if (err) {
+		printf("bpf_object__load(): %s\n", strerror(errno));
+		return 1;
+	}
+	prog_fd = bpf_program__fd(prog);
+
+	rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
+	vip2tnl_map_fd = bpf_object__find_map_fd_by_name(obj, "vip2tnl");
+	if (vip2tnl_map_fd < 0 || rxcnt_map_fd < 0) {
+		printf("bpf_object__find_map_fd_by_name failed\n");
+		return 1;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	/* One vip2tnl entry per port, all pointing at the same tunnel. */
+	while (min_port <= max_port) {
+		vip.dport = htons(min_port++);
+		if (bpf_map_update_elem(vip2tnl_map_fd, &vip, &tnl,
+					BPF_NOEXIST)) {
+			perror("bpf_map_update_elem(&vip2tnl)");
+			return 1;
+		}
+	}
+
+	if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) {
+		printf("link set xdp fd failed\n");
+		return 1;
+	}
+
+	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
+	if (err) {
+		printf("can't get prog info - %s\n", strerror(errno));
+		/* Do not leave the program attached when bailing out. */
+		bpf_xdp_detach(ifindex, xdp_flags, NULL);
+		return 1;
+	}
+	prog_id = info.id;
+
+	poll_stats(kill_after_s);
+
+	bpf_xdp_detach(ifindex, xdp_flags, NULL);
+
+	return 0;
+}
diff --git a/samples/cgroup/.gitignore b/samples/cgroup/.gitignore
new file mode 100644
index 000000000000..3a0161194cce
--- /dev/null
+++ b/samples/cgroup/.gitignore
@@ -0,0 +1,3 @@
+/cgroup_event_listener
+/memcg_event_listener
+
diff --git a/samples/cgroup/Makefile b/samples/cgroup/Makefile
new file mode 100644
index 000000000000..526c8569707c
--- /dev/null
+++ b/samples/cgroup/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+userprogs-always-y += cgroup_event_listener memcg_event_listener
+
+userccflags += -I usr/include
diff --git a/samples/cgroup/cgroup_event_listener.c b/samples/cgroup/cgroup_event_listener.c
new file mode 100644
index 000000000000..3d70dc831a76
--- /dev/null
+++ b/samples/cgroup/cgroup_event_listener.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * cgroup_event_listener.c - Simple listener of cgroup events
+ *
+ * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
+ */
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/eventfd.h>
+
+#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>"
+
+/*
+ * Register an eventfd for the control file argv[1] with the threshold or
+ * arguments argv[2] by writing "<efd> <cfd> <args>" to the cgroup.event_control
+ * file of the same directory, then report every notification until that file
+ * disappears.
+ *
+ * Returns 0 once cgroup.event_control no longer exists; every other failure
+ * terminates the process through err()/errx() with status 1.
+ */
+int main(int argc, char **argv)
+{
+	int efd = -1;
+	int cfd = -1;
+	int event_control = -1;
+	char control_path[PATH_MAX];
+	char event_control_path[PATH_MAX];
+	char line[LINE_MAX];
+	int ret;
+
+	if (argc != 3)
+		errx(1, "%s", USAGE_STR);
+
+	cfd = open(argv[1], O_RDONLY);
+	if (cfd == -1)
+		err(1, "Cannot open %s", argv[1]);
+
+	/*
+	 * dirname() may modify the string it is given, so hand it a copy:
+	 * argv[1] is printed again below and must keep naming the control
+	 * file rather than its directory.
+	 */
+	ret = snprintf(control_path, PATH_MAX, "%s", argv[1]);
+	if (ret >= PATH_MAX)
+		errx(1, "Path to cgroup.event_control is too long");
+
+	ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
+		       dirname(control_path));
+	if (ret >= PATH_MAX)
+		errx(1, "Path to cgroup.event_control is too long");
+
+	event_control = open(event_control_path, O_WRONLY);
+	if (event_control == -1)
+		err(1, "Cannot open %s", event_control_path);
+
+	efd = eventfd(0, 0);
+	if (efd == -1)
+		err(1, "eventfd() failed");
+
+	ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
+	if (ret >= LINE_MAX)
+		errx(1, "Arguments string is too long");
+
+	/* The terminating NUL is written along with the text. */
+	ret = write(event_control, line, strlen(line) + 1);
+	if (ret == -1)
+		err(1, "Cannot write to cgroup.event_control");
+
+	while (1) {
+		uint64_t result;
+
+		ret = read(efd, &result, sizeof(result));
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+			err(1, "Cannot read from eventfd");
+		}
+		assert(ret == sizeof(result));
+
+		/* A missing control file ends the loop instead of reporting. */
+		ret = access(event_control_path, W_OK);
+		if ((ret == -1) && (errno == ENOENT)) {
+			puts("The cgroup seems to have removed.");
+			break;
+		}
+
+		if (ret == -1)
+			err(1, "cgroup.event_control is not accessible any more");
+
+		printf("%s %s: crossed\n", argv[1], argv[2]);
+	}
+
+	return 0;
+}
diff --git a/samples/cgroup/memcg_event_listener.c b/samples/cgroup/memcg_event_listener.c
new file mode 100644
index 000000000000..41425edbd88a
--- /dev/null
+++ b/samples/cgroup/memcg_event_listener.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * memcg_event_listener.c - Simple listener of memcg memory.events
+ *
+ * Copyright (c) 2023, SaluteDevices. All Rights Reserved.
+ *
+ * Author: Dmitry Rokosov <ddrokosov@salutedevices.com>
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <limits.h>
+#include <poll.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/inotify.h>
+#include <unistd.h>
+
+/* Size of buffer to use when reading inotify events */
+#define INOTIFY_BUFFER_SIZE 8192
+
+/*
+ * Step over one inotify record: subtract its full size (fixed header plus the
+ * variable-length name of event->len bytes) from length, point event at the
+ * following record and evaluate to that new pointer. event must be an lvalue
+ * of type struct inotify_event *.
+ */
+#define INOTIFY_EVENT_NEXT(event, length) ({				\
+	const size_t _rec_size = sizeof(*(event)) + (event)->len;	\
+	(length) -= (ssize_t)_rec_size;					\
+	(event) = (struct inotify_event *)((char *)(event) + _rec_size); \
+	(event);							\
+})
+
+/*
+ * True when length bytes starting at event hold a complete record. The header
+ * is checked first so that event->len is only read when it is available.
+ */
+#define INOTIFY_EVENT_OK(event, length)					\
+	((length) >= (ssize_t)sizeof(*(event)) &&			\
+	 (length) >= (ssize_t)(sizeof(*(event)) + (event)->len))
+
+/* Number of elements of a true array (not of a pointer). */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+/*
+ * One value per counter line parsed by read_memcg_events(); each field is
+ * named after the line it is read from.
+ */
+struct memcg_counters {
+ long low;
+ long high;
+ long max;
+ long oom;
+ long oom_kill;
+ long oom_group_kill;
+};
+
+/* State of one monitored memory.events file. */
+struct memcg_events {
+ /* Counter values seen so far; updated by read_memcg_events(). */
+ struct memcg_counters counters;
+ /* Path built by initialize_memcg_events(): /sys/fs/cgroup/<cgroup>/memory.events */
+ char path[PATH_MAX];
+ /* Descriptor returned by inotify_init(). */
+ int inotify_fd;
+ /* Watch descriptor returned by inotify_add_watch() for path. */
+ int inotify_wd;
+};
+
+/* Print all six counters to stdout, one per line, below a heading. */
+static void print_memcg_counters(const struct memcg_counters *counters)
+{
+	printf("MEMCG events:\n"
+	       "\tlow: %ld\n"
+	       "\thigh: %ld\n"
+	       "\tmax: %ld\n"
+	       "\toom: %ld\n"
+	       "\toom_kill: %ld\n"
+	       "\toom_group_kill: %ld\n",
+	       counters->low, counters->high, counters->max,
+	       counters->oom, counters->oom_kill,
+	       counters->oom_group_kill);
+}
+
+/*
+ * Parse one "<name> <value>" line.
+ *
+ * @line:    NUL-terminated line, optionally ending in a single '\n'.
+ * @name:    counter name the line has to start with.
+ * @counter: receives the parsed value; untouched on failure.
+ *
+ * Returns 0 on success, -EINVAL when the name does not match, the delimiter
+ * or the digits are missing, or characters follow the number, and -ERANGE
+ * when strtol() reports an out-of-range value. A warning is printed for each
+ * failure.
+ */
+static int get_memcg_counter(char *line, const char *name, long *counter)
+{
+	size_t len = strlen(name);
+	char *endptr;
+	long tmp;
+
+	/* strncmp() stops at the end of a short line; memcmp() would not. */
+	if (strncmp(line, name, len)) {
+		warnx("Counter line %s has wrong name, %s is expected",
+		      line, name);
+		return -EINVAL;
+	}
+
+	/*
+	 * Make sure there really is a delimiter before skipping it: a line
+	 * consisting of the bare name would otherwise be read past its
+	 * terminating NUL.
+	 */
+	if (line[len] != ' ' && line[len] != '\t') {
+		warnx("Counter line %s has no value after name %s",
+		      line, name);
+		return -EINVAL;
+	}
+
+	/* skip the whitespace delimiter */
+	len += 1;
+
+	errno = 0;
+	tmp = strtol(&line[len], &endptr, 10);
+	if (((tmp == LONG_MAX || tmp == LONG_MIN) && errno == ERANGE) ||
+	    (errno && !tmp)) {
+		warnx("Failed to parse: %s", &line[len]);
+		return -ERANGE;
+	}
+
+	if (endptr == &line[len]) {
+		warnx("Not digits were found in line %s", &line[len]);
+		return -EINVAL;
+	}
+
+	/* Only an optional single newline may follow the number. */
+	if (!(*endptr == '\0' || (*endptr == '\n' && *++endptr == '\0'))) {
+		warnx("Further characters after number: %s", endptr);
+		return -EINVAL;
+	}
+
+	*counter = tmp;
+
+	return 0;
+}
+
+/*
+ * Re-read events->path and fold increased counters into events->counters.
+ *
+ * The file is expected to list the counters in the order of the table below,
+ * one per line. Reading stops at end of file; counters whose line is missing
+ * keep their previous value.
+ *
+ * @events:    state holding the path and the last known counters.
+ * @show_diff: when true, print one line per counter that grew, or a note when
+ *             none did.
+ *
+ * Returns 0 on success, -EBADF when the file cannot be opened, -EIO on a read
+ * error, or the error returned by get_memcg_counter() for a malformed line.
+ */
+static int read_memcg_events(struct memcg_events *events, bool show_diff)
+{
+	FILE *fp = fopen(events->path, "re");
+	size_t i;
+	int ret = 0;
+	bool any_new_events = false;
+	char *line = NULL;
+	size_t len = 0;
+	struct memcg_counters *counters = &events->counters;
+	/*
+	 * Start from the current values: a file with fewer lines than the
+	 * table then yields "no change" for the missing counters instead of
+	 * comparing against uninitialized memory.
+	 */
+	struct memcg_counters new_counters = *counters;
+	struct {
+		const char *name;
+		long *new;
+		long *old;
+	} map[] = {
+		{
+			.name = "low",
+			.new = &new_counters.low,
+			.old = &counters->low,
+		},
+		{
+			.name = "high",
+			.new = &new_counters.high,
+			.old = &counters->high,
+		},
+		{
+			.name = "max",
+			.new = &new_counters.max,
+			.old = &counters->max,
+		},
+		{
+			.name = "oom",
+			.new = &new_counters.oom,
+			.old = &counters->oom,
+		},
+		{
+			.name = "oom_kill",
+			.new = &new_counters.oom_kill,
+			.old = &counters->oom_kill,
+		},
+		{
+			.name = "oom_group_kill",
+			.new = &new_counters.oom_group_kill,
+			.old = &counters->oom_group_kill,
+		},
+	};
+
+	if (!fp) {
+		warn("Failed to open memcg events file %s", events->path);
+		return -EBADF;
+	}
+
+	/* Read new values for memcg counters */
+	for (i = 0; i < ARRAY_SIZE(map); ++i) {
+		ssize_t nread;
+
+		/* getline() returns -1 for both EOF and error; errno tells. */
+		errno = 0;
+		nread = getline(&line, &len, fp);
+		if (nread == -1) {
+			if (errno) {
+				warn("Failed to read line for counter %s",
+				     map[i].name);
+				ret = -EIO;
+				goto exit;
+			}
+
+			break;
+		}
+
+		ret = get_memcg_counter(line, map[i].name, map[i].new);
+		if (ret) {
+			warnx("Failed to get counter value from line %s", line);
+			goto exit;
+		}
+	}
+
+	/* Only increases are reported and stored. */
+	for (i = 0; i < ARRAY_SIZE(map); ++i) {
+		long diff;
+
+		if (*map[i].new > *map[i].old) {
+			diff = *map[i].new - *map[i].old;
+
+			if (show_diff)
+				printf("*** %ld MEMCG %s event%s, "
+				       "change counter %ld => %ld\n",
+				       diff, map[i].name,
+				       (diff == 1) ? "" : "s",
+				       *map[i].old, *map[i].new);
+
+			*map[i].old += diff;
+			any_new_events = true;
+		}
+	}
+
+	if (show_diff && !any_new_events)
+		printf("*** No new untracked memcg events available\n");
+
+exit:
+	free(line);
+	fclose(fp);
+
+	return ret;
+}
+
+/*
+ * Handle a single inotify record: ignore records for a foreign watch
+ * descriptor or without IN_MODIFY, otherwise re-read the counters and print
+ * what changed.
+ */
+static void process_memcg_events(struct memcg_events *events,
+				 struct inotify_event *event)
+{
+	if (event->wd != events->inotify_wd) {
+		warnx("Unknown inotify event %d, should be %d", event->wd,
+		      events->inotify_wd);
+		return;
+	}
+
+	printf("Received event in %s:\n", events->path);
+
+	if ((event->mask & IN_MODIFY) == 0) {
+		warnx("No IN_MODIFY event, skip it");
+		return;
+	}
+
+	if (read_memcg_events(events, /* show_diff = */true) != 0)
+		warnx("Can't read memcg events");
+}
+
+/*
+ * Wait for inotify records on events->inotify_fd and pass each one to
+ * process_memcg_events(). The loop has no exit: the function only leaves by
+ * terminating the process when poll() fails or reports POLLERR.
+ */
+static void monitor_events(struct memcg_events *events)
+{
+	struct pollfd fds[1];
+	int ret;
+
+	printf("Started monitoring memory events from '%s'...\n", events->path);
+
+	fds[0].fd = events->inotify_fd;
+	fds[0].events = POLLIN;
+
+	for (;;) {
+		fds[0].revents = 0;
+
+		ret = poll(fds, ARRAY_SIZE(fds), -1);
+		if (ret < 0) {
+			/* Interrupted or temporarily out of resources: retry. */
+			if (errno == EINTR || errno == EAGAIN)
+				continue;
+			err(EXIT_FAILURE, "Can't poll memcg events (%d)", ret);
+		}
+
+		/* errno is not set for POLLERR, hence errx() and not err(). */
+		if (fds[0].revents & POLLERR)
+			errx(EXIT_FAILURE, "Got POLLERR during monitor events");
+
+		if (fds[0].revents & POLLIN) {
+			struct inotify_event *event;
+			/* The buffer is accessed as struct inotify_event. */
+			char buffer[INOTIFY_BUFFER_SIZE]
+				__attribute__((aligned(__alignof__(struct inotify_event))));
+			ssize_t length;
+
+			length = read(fds[0].fd, buffer, INOTIFY_BUFFER_SIZE);
+			if (length <= 0)
+				continue;
+
+			event = (struct inotify_event *)buffer;
+			while (INOTIFY_EVENT_OK(event, length)) {
+				process_memcg_events(events, event);
+				event = INOTIFY_EVENT_NEXT(event, length);
+			}
+		}
+	}
+}
+
+/*
+ * Prepare monitoring of one cgroup.
+ *
+ * Builds /sys/fs/cgroup/<cgroup>/memory.events, reads the initial counter
+ * values, creates an inotify instance and adds an IN_MODIFY watch on the
+ * file, then prints the initial counters.
+ *
+ * @events: state to initialize; zeroed first.
+ * @cgroup: cgroup path relative to /sys/fs/cgroup.
+ *
+ * Returns 0 on success or a negative value on failure. No inotify descriptor
+ * is left open on failure.
+ */
+static int initialize_memcg_events(struct memcg_events *events,
+				   const char *cgroup)
+{
+	int ret;
+
+	memset(events, 0, sizeof(struct memcg_events));
+
+	ret = snprintf(events->path, PATH_MAX,
+		       "/sys/fs/cgroup/%s/memory.events", cgroup);
+	if (ret >= PATH_MAX) {
+		warnx("Path to cgroup memory.events is too long");
+		return -EMSGSIZE;
+	} else if (ret < 0) {
+		warn("Can't generate cgroup event full name");
+		return ret;
+	}
+
+	ret = read_memcg_events(events, /* show_diff = */false);
+	if (ret) {
+		warnx("Failed to read initial memcg events state (%d)", ret);
+		return ret;
+	}
+
+	events->inotify_fd = inotify_init();
+	if (events->inotify_fd < 0) {
+		warn("Failed to setup new inotify device");
+		return -EMFILE;
+	}
+
+	events->inotify_wd = inotify_add_watch(events->inotify_fd,
+					       events->path, IN_MODIFY);
+	if (events->inotify_wd < 0) {
+		/* Report first: close() may overwrite errno. */
+		warn("Couldn't add monitor in dir %s", events->path);
+		close(events->inotify_fd);
+		events->inotify_fd = -1;
+		return -EIO;
+	}
+
+	printf("Initialized MEMCG events with counters:\n");
+	print_memcg_counters(&events->counters);
+
+	return 0;
+}
+
+/* Remove the watch and close the inotify descriptor. */
+static void cleanup_memcg_events(struct memcg_events *events)
+{
+	const int fd = events->inotify_fd;
+
+	inotify_rm_watch(fd, events->inotify_wd);
+	close(fd);
+}
+
+/*
+ * Entry point: expects exactly one argument, the cgroup to monitor, sets up
+ * the monitoring state and hands over to monitor_events().
+ */
+int main(int argc, const char **argv)
+{
+	struct memcg_events events;
+	ssize_t rc;
+
+	if (argc != 2)
+		errx(EXIT_FAILURE, "Usage: %s <cgroup>", argv[0]);
+
+	rc = initialize_memcg_events(&events, argv[1]);
+	if (rc != 0)
+		errx(EXIT_FAILURE, "Can't initialize memcg events (%zd)", rc);
+
+	monitor_events(&events);
+
+	cleanup_memcg_events(&events);
+
+	printf("Exiting memcg event listener...\n");
+
+	return EXIT_SUCCESS;
+}
diff --git a/samples/check-exec/.gitignore b/samples/check-exec/.gitignore
new file mode 100644
index 000000000000..cd759a19dacd
--- /dev/null
+++ b/samples/check-exec/.gitignore
@@ -0,0 +1,2 @@
+/inc
+/set-exec
diff --git a/samples/check-exec/Makefile b/samples/check-exec/Makefile
new file mode 100644
index 000000000000..c4f08ad0f8e3
--- /dev/null
+++ b/samples/check-exec/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+userprogs-always-y := \
+ inc \
+ set-exec
+
+userccflags += -I usr/include
+
+.PHONY: all clean
+
+all:
+ $(MAKE) -C ../.. samples/check-exec/
+
+clean:
+ $(MAKE) -C ../.. M=samples/check-exec/ clean
diff --git a/samples/check-exec/inc.c b/samples/check-exec/inc.c
new file mode 100644
index 000000000000..7f6ef06a2f06
--- /dev/null
+++ b/samples/check-exec/inc.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * Very simple script interpreter that can evaluate two different commands (one
+ * per line):
+ * - "?" to initialize a counter from user's input;
+ * - "+" to increment the counter (which is set to 0 by default).
+ *
+ * See tools/testing/selftests/exec/check-exec-tests.sh and
+ * Documentation/userspace-api/check_exec.rst
+ *
+ * Copyright © 2024 Microsoft Corporation
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/fcntl.h>
+#include <linux/prctl.h>
+#include <linux/securebits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+/* Issue the execveat system call through syscall(2). */
+static int sys_execveat(int dirfd, const char *pathname, char *const argv[],
+			char *const envp[], int flags)
+{
+	const long rc = syscall(__NR_execveat, dirfd, pathname, argv, envp,
+				flags);
+
+	return rc;
+}
+
+/*
+ * Interpret a NUL-terminated script held in buffer, one command per line.
+ * The buffer is split in place. buffer_size is not consulted.
+ *
+ * Returns 1 on error, 0 otherwise.
+ */
+static int interpret_buffer(char *buffer, size_t buffer_size)
+{
+	char *cmd, *state = NULL;
+	long long number = 0;
+
+	/* Each command is the first character of a line. */
+	for (cmd = strtok_r(buffer, "\n", &state); cmd;
+	     cmd = strtok_r(NULL, "\n", &state)) {
+		/* Apart from comments, a line must be exactly one character. */
+		if (!(*cmd == '#' || cmd[1] == '\0')) {
+			fprintf(stderr, "# ERROR: Unknown string\n");
+			return 1;
+		}
+
+		switch (*cmd) {
+		case '#':
+			/* Skips shebang and comments. */
+			break;
+		case '+':
+			/* Increments and prints the number. */
+			number++;
+			printf("%lld\n", number);
+			break;
+		case '?':
+			/* Reads integer from stdin. */
+			fprintf(stderr, "> Enter new number: \n");
+			if (scanf("%lld", &number) != 1) {
+				fprintf(stderr,
+					"# WARNING: Failed to read number from stdin\n");
+			}
+			break;
+		default:
+			fprintf(stderr, "# ERROR: Unknown character '%c'\n",
+				*cmd);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Ask the kernel whether the open script may be executed, then interpret it.
+ * A failed check is fatal only when restrict_stream is set. At most the first
+ * 127 bytes of the stream are read and interpreted.
+ *
+ * Returns 1 on error, 0 otherwise.
+ */
+static int interpret_stream(FILE *script, char *const script_name,
+			    char *const *const envp, const bool restrict_stream)
+{
+	char *const script_argv[] = { script_name, NULL };
+	char buf[128] = {};
+	size_t nread;
+	int check;
+
+	/*
+	 * We pass a valid argv and envp to the kernel to emulate a native
+	 * script execution. We must use the script file descriptor instead of
+	 * the script path name to avoid race conditions.
+	 */
+	check = sys_execveat(fileno(script), "", script_argv, envp,
+			     AT_EMPTY_PATH | AT_EXECVE_CHECK);
+	if (restrict_stream && check) {
+		perror("ERROR: Script execution check");
+		return 1;
+	}
+
+	/* Reads script, keeping the last byte of buf as terminator. */
+	nread = fread(buf, 1, sizeof(buf) - 1, script);
+	return interpret_buffer(buf, nread);
+}
+
+/* Print the command line synopsis and an example invocation to stderr. */
+static void print_usage(const char *argv0)
+{
+	fprintf(stderr,
+		"usage: %s <script.inc> | -i | -c <command>\n\n"
+		"Example:\n"
+		" ./set-exec -fi -- ./inc -i < script-exec.inc\n",
+		argv0);
+}
+
+/*
+ * Interpret a script given as a file argument, on stdin (-i) or inline (-c).
+ * Exactly one of the three sources must be used.
+ *
+ * The process securebits decide what is refused: SECBIT_EXEC_DENY_INTERACTIVE
+ * denies -c and makes a failed execution check on stdin fatal, while
+ * SECBIT_EXEC_RESTRICT_FILE makes a failed execution check on a script file
+ * fatal.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+int main(const int argc, char *const argv[], char *const *const envp)
+{
+	int opt;
+	char *cmd = NULL;
+	char *script_name = NULL;
+	bool interpret_stdin = false;
+	FILE *script_file = NULL;
+	int secbits;
+	bool deny_interactive, restrict_file;
+	size_t arg_nb;
+
+	secbits = prctl(PR_GET_SECUREBITS);
+	if (secbits == -1) {
+		/*
+		 * This should never happen, except with a buggy seccomp
+		 * filter.
+		 */
+		perror("ERROR: Failed to get securebits");
+		return 1;
+	}
+
+	deny_interactive = !!(secbits & SECBIT_EXEC_DENY_INTERACTIVE);
+	restrict_file = !!(secbits & SECBIT_EXEC_RESTRICT_FILE);
+
+	while ((opt = getopt(argc, argv, "c:i")) != -1) {
+		switch (opt) {
+		case 'c':
+			if (cmd) {
+				fprintf(stderr, "ERROR: Command already set\n");
+				return 1;
+			}
+			cmd = optarg;
+			break;
+		case 'i':
+			interpret_stdin = true;
+			break;
+		default:
+			print_usage(argv[0]);
+			return 1;
+		}
+	}
+
+	/* Checks that only one argument is used, or read stdin. */
+	arg_nb = !!cmd + !!interpret_stdin;
+	if (arg_nb == 0 && argc == 2) {
+		script_name = argv[1];
+	} else if (arg_nb != 1) {
+		print_usage(argv[0]);
+		return 1;
+	}
+
+	if (cmd) {
+		/*
+		 * Other kind of interactive interpretations should be denied
+		 * as well (e.g. CLI arguments passing script snippets,
+		 * environment variables interpreted as script). However, any
+		 * way to pass script files should only be restricted according
+		 * to restrict_file.
+		 */
+		if (deny_interactive) {
+			fprintf(stderr,
+				"ERROR: Interactive interpretation denied.\n");
+			return 1;
+		}
+
+		return interpret_buffer(cmd, strlen(cmd));
+	}
+
+	if (interpret_stdin && !script_name) {
+		script_file = stdin;
+		/*
+		 * As for any execve(2) call, this path may be logged by the
+		 * kernel.
+		 */
+		script_name = "/proc/self/fd/0";
+		/*
+		 * When stdin is used, it can point to a regular file or a
+		 * pipe. Restrict stdin execution according to
+		 * SECBIT_EXEC_DENY_INTERACTIVE but always allow executable
+		 * files (which are not considered as interactive inputs).
+		 */
+		return interpret_stream(script_file, script_name, envp,
+					deny_interactive);
+	} else if (script_name && !interpret_stdin) {
+		int ret;
+
+		/*
+		 * In this sample, we don't pass any argument to scripts, but
+		 * otherwise we would have to forge an argv with such
+		 * arguments.
+		 */
+		script_file = fopen(script_name, "r");
+		if (!script_file) {
+			perror("ERROR: Failed to open script");
+			return 1;
+		}
+		/*
+		 * Restricts file execution according to
+		 * SECBIT_EXEC_RESTRICT_FILE.
+		 */
+		ret = interpret_stream(script_file, script_name, envp,
+				       restrict_file);
+		/* The stream was opened here, so release it here as well. */
+		fclose(script_file);
+		return ret;
+	}
+
+	print_usage(argv[0]);
+	return 1;
+}
diff --git a/samples/check-exec/run-script-ask.sh b/samples/check-exec/run-script-ask.sh
new file mode 100755
index 000000000000..8ef0fdc37266
--- /dev/null
+++ b/samples/check-exec/run-script-ask.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env sh
+# SPDX-License-Identifier: BSD-3-Clause
+
+# Directory this script was invoked from.
+DIR="$(dirname -- "$0")"
+
+# Append that directory to PATH; presumably this lets "env" find the "inc"
+# interpreter named in the shebang of script-ask.inc (to be confirmed).
+PATH="${PATH}:${DIR}"
+
+# Trace the command, then run the sample script located next to this one.
+set -x
+"${DIR}/script-ask.inc"
diff --git a/samples/check-exec/script-ask.inc b/samples/check-exec/script-ask.inc
new file mode 100755
index 000000000000..720a8e649225
--- /dev/null
+++ b/samples/check-exec/script-ask.inc
@@ -0,0 +1,5 @@
+#!/usr/bin/env inc
+# SPDX-License-Identifier: BSD-3-Clause
+
+?
++
diff --git a/samples/check-exec/script-exec.inc b/samples/check-exec/script-exec.inc
new file mode 100755
index 000000000000..3245cb9d8dd1
--- /dev/null
+++ b/samples/check-exec/script-exec.inc
@@ -0,0 +1,4 @@
+#!/usr/bin/env inc
+# SPDX-License-Identifier: BSD-3-Clause
+
++
diff --git a/samples/check-exec/script-noexec.inc b/samples/check-exec/script-noexec.inc
new file mode 100644
index 000000000000..3245cb9d8dd1
--- /dev/null
+++ b/samples/check-exec/script-noexec.inc
@@ -0,0 +1,4 @@
+#!/usr/bin/env inc
+# SPDX-License-Identifier: BSD-3-Clause
+
++
diff --git a/samples/check-exec/set-exec.c b/samples/check-exec/set-exec.c
new file mode 100644
index 000000000000..ba86a60a20dd
--- /dev/null
+++ b/samples/check-exec/set-exec.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * Simple tool to set SECBIT_EXEC_RESTRICT_FILE, SECBIT_EXEC_DENY_INTERACTIVE,
+ * before executing a command.
+ *
+ * Copyright © 2024 Microsoft Corporation
+ */
+
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <errno.h>
+#include <linux/prctl.h>
+#include <linux/securebits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+/* Print the command line synopsis and the meaning of -f and -i to stderr. */
+static void print_usage(const char *argv0)
+{
+	fprintf(stderr,
+		"usage: %s -f|-i -- <cmd> [args]...\n\n"
+		"Execute a command with\n"
+		"- SECBIT_EXEC_RESTRICT_FILE set: -f\n"
+		"- SECBIT_EXEC_DENY_INTERACTIVE set: -i\n",
+		argv0);
+}
+
+/*
+ * Set (and lock) the securebits selected with -f and/or -i on top of the
+ * current ones, then execute the command that follows the options.
+ *
+ * Returns 1 on every error path; on success execvpe() does not return.
+ */
+int main(const int argc, char *const argv[], char *const *const envp)
+{
+	char *const *cmd_argv;
+	int opt, current_bits, wanted_bits;
+	bool policy_requested = false;
+
+	current_bits = prctl(PR_GET_SECUREBITS);
+	if (current_bits == -1) {
+		/*
+		 * This should never happen, except with a buggy seccomp
+		 * filter.
+		 */
+		perror("ERROR: Failed to get securebits");
+		return 1;
+	}
+
+	wanted_bits = current_bits;
+	while ((opt = getopt(argc, argv, "fi")) != -1) {
+		switch (opt) {
+		case 'f':
+			wanted_bits |= SECBIT_EXEC_RESTRICT_FILE |
+				       SECBIT_EXEC_RESTRICT_FILE_LOCKED;
+			policy_requested = true;
+			break;
+		case 'i':
+			wanted_bits |= SECBIT_EXEC_DENY_INTERACTIVE |
+				       SECBIT_EXEC_DENY_INTERACTIVE_LOCKED;
+			policy_requested = true;
+			break;
+		default:
+			print_usage(argv[0]);
+			return 1;
+		}
+	}
+
+	/* Both a command and at least one of -f/-i are required. */
+	cmd_argv = argv + optind;
+	if (!cmd_argv[0] || !policy_requested) {
+		print_usage(argv[0]);
+		return 1;
+	}
+
+	/* Skip the prctl() call when the requested bits are already set. */
+	if (wanted_bits != current_bits &&
+	    prctl(PR_SET_SECUREBITS, wanted_bits) != 0) {
+		perror("Failed to set secure bit(s).");
+		fprintf(stderr,
+			"Hint: The running kernel may not support this feature.\n");
+		return 1;
+	}
+
+	fprintf(stderr, "Executing command...\n");
+	execvpe(cmd_argv[0], cmd_argv, envp);
+	/* Only reached when execvpe() failed. */
+	fprintf(stderr, "Failed to execute \"%s\": %s\n", cmd_argv[0],
+		strerror(errno));
+	return 1;
+}
diff --git a/samples/configfs/Makefile b/samples/configfs/Makefile
new file mode 100644
index 000000000000..92d661fcba6d
--- /dev/null
+++ b/samples/configfs/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs_sample.o
diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c
new file mode 100644
index 000000000000..fd5d163828c5
--- /dev/null
+++ b/samples/configfs/configfs_sample.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * configfs_sample.c - This file is a demonstration module
+ * containing a number of configfs subsystems. It uses the helper
+ * macros defined by configfs.h
+ *
+ * Based on sysfs:
+ * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle. All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/configfs.h>
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem. It cannot create
+ * any config_items. It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem. See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+/*
+ * Container for the 01-childless subsystem: the configfs_subsystem itself
+ * plus the two integers backing its "showme" and "storeme" attributes.
+ */
+struct childless {
+ struct configfs_subsystem subsys;
+ int showme;
+ int storeme;
+};
+
+/* Map a config_item back to the struct childless that embeds its subsystem. */
+static inline struct childless *to_childless(struct config_item *item)
+{
+	struct config_group *grp = to_config_group(item);
+	struct configfs_subsystem *su = to_configfs_subsystem(grp);
+
+	return container_of(su, struct childless, subsys);
+}
+
+/*
+ * Print the current "showme" value into @page and bump it afterwards, so
+ * every read reports a value one higher than the previous read.
+ * Returns the number of characters written.
+ */
+static ssize_t childless_showme_show(struct config_item *item, char *page)
+{
+	struct childless *c = to_childless(item);
+
+	/* Post-increment: the old value is what gets printed. */
+	return sprintf(page, "%d\n", c->showme++);
+}
+
+/* Print the stored "storeme" integer into @page; returns the length written. */
+static ssize_t childless_storeme_show(struct config_item *item, char *page)
+{
+	const struct childless *c = to_childless(item);
+
+	return sprintf(page, "%d\n", c->storeme);
+}
+
+/*
+ * Parse @page as a base-10 integer into "storeme".
+ * Returns @count on success or the kstrtoint() error code.
+ */
+static ssize_t childless_storeme_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	int err = kstrtoint(page, 10, &to_childless(item)->storeme);
+
+	return err ? err : (ssize_t)count;
+}
+
+/* Emit the fixed help text of the 01-childless subsystem into @page. */
+static ssize_t childless_description_show(struct config_item *item, char *page)
+{
+	static const char text[] =
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs. It does not support the creation of child config_items.\n"
+"It only has a few attributes. In fact, it isn't much different\n"
+"than a directory in /proc.\n";
+
+	return sprintf(page, "%s", text);
+}
+
+/* Attribute objects built from the childless_*_show/_store handlers. */
+CONFIGFS_ATTR_RO(childless_, showme);
+CONFIGFS_ATTR(childless_, storeme);
+CONFIGFS_ATTR_RO(childless_, description);
+
+/* NULL-terminated attribute list referenced by childless_type. */
+static struct configfs_attribute *childless_attrs[] = {
+ &childless_attr_showme,
+ &childless_attr_storeme,
+ &childless_attr_description,
+ NULL,
+};
+
+/* Item type of the subsystem root: attributes only, no item or group ops. */
+static const struct config_item_type childless_type = {
+ .ct_attrs = childless_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/* Statically allocated subsystem instance, named "01-childless". */
+static struct childless childless_subsys = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "01-childless",
+ .ci_type = &childless_type,
+ },
+ },
+ },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child. Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go. Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+/* A leaf config_item carrying a single integer attribute, "storeme". */
+struct simple_child {
+ struct config_item item;
+ int storeme;
+};
+
+/* Recover the enclosing simple_child from its embedded config_item. */
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+	struct simple_child *child;
+
+	child = container_of(item, struct simple_child, item);
+	return child;
+}
+
+/* Print the child's "storeme" integer into @page; returns the length written. */
+static ssize_t simple_child_storeme_show(struct config_item *item, char *page)
+{
+	const struct simple_child *child = to_simple_child(item);
+
+	return sprintf(page, "%d\n", child->storeme);
+}
+
+/*
+ * Parse @page as a base-10 integer into the child's "storeme".
+ * Returns @count on success or the kstrtoint() error code.
+ */
+static ssize_t simple_child_storeme_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	int err = kstrtoint(page, 10, &to_simple_child(item)->storeme);
+
+	return err ? err : (ssize_t)count;
+}
+
+/* Read/write attribute object built from the simple_child_storeme handlers. */
+CONFIGFS_ATTR(simple_child_, storeme);
+
+/* NULL-terminated attribute list referenced by simple_child_type. */
+static struct configfs_attribute *simple_child_attrs[] = {
+ &simple_child_attr_storeme,
+ NULL,
+};
+
+/* ->release() hook: free the simple_child allocated by make_item. */
+static void simple_child_release(struct config_item *item)
+{
+	struct simple_child *child = to_simple_child(item);
+
+	kfree(child);
+}
+
+/* Item operations of a simple_child: only a release hook. */
+static struct configfs_item_operations simple_child_item_ops = {
+ .release = simple_child_release,
+};
+
+/* Item type shared by every simple_child created through make_item. */
+static const struct config_item_type simple_child_type = {
+ .ct_item_ops = &simple_child_item_ops,
+ .ct_attrs = simple_child_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/* A group whose children are simple_child items; it adds no data of its own. */
+struct simple_children {
+ struct config_group group;
+};
+
+/* Map a group's config_item back to its enclosing simple_children. */
+static inline struct simple_children *to_simple_children(struct config_item *item)
+{
+	struct config_group *grp = to_config_group(item);
+
+	return container_of(grp, struct simple_children, group);
+}
+
+/*
+ * ->make_item() hook: allocate a zeroed simple_child called @name and
+ * initialise its config_item with simple_child_type.
+ * Returns the new item, or ERR_PTR(-ENOMEM) if allocation fails.
+ */
+static struct config_item *simple_children_make_item(struct config_group *group,
+		const char *name)
+{
+	struct simple_child *child = kzalloc(sizeof(*child), GFP_KERNEL);
+
+	if (child) {
+		config_item_init_type_name(&child->item, name,
+					   &simple_child_type);
+		return &child->item;
+	}
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/* Emit the fixed help text of the 02-simple-children subsystem into @page. */
+static ssize_t simple_children_description_show(struct config_item *item,
+		char *page)
+{
+	static const char text[] =
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items. These\n"
+"items have only one attribute that is readable and writeable.\n";
+
+	return sprintf(page, "%s", text);
+}
+
+/* Read-only attribute object built from simple_children_description_show. */
+CONFIGFS_ATTR_RO(simple_children_, description);
+
+/* NULL-terminated attribute list referenced by simple_children_type. */
+static struct configfs_attribute *simple_children_attrs[] = {
+ &simple_children_attr_description,
+ NULL,
+};
+
+/* ->release() hook: free the simple_children that embeds @item's group. */
+static void simple_children_release(struct config_item *item)
+{
+	struct simple_children *children = to_simple_children(item);
+
+	kfree(children);
+}
+
+/* Item operations of a simple_children group: only a release hook. */
+static struct configfs_item_operations simple_children_item_ops = {
+ .release = simple_children_release,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+ .make_item = simple_children_make_item,
+};
+
+/*
+ * Type used both by the static 02-simple-children subsystem root and by the
+ * groups that group_children_make_group() allocates.
+ */
+static const struct config_item_type simple_children_type = {
+ .ct_item_ops = &simple_children_item_ops,
+ .ct_group_ops = &simple_children_group_ops,
+ .ct_attrs = simple_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/* Statically allocated subsystem instance, named "02-simple-children". */
+static struct configfs_subsystem simple_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "02-simple-children",
+ .ci_type = &simple_children_type,
+ },
+ },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above. However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem. Creation of a group in the subsystem creates
+ * a new simple_children group. That group can then have simple_child
+ * children of its own.
+ */
+
+/*
+ * ->make_group() hook: allocate a zeroed simple_children called @name and
+ * initialise its group with simple_children_type.
+ * Returns the new group, or ERR_PTR(-ENOMEM) if allocation fails.
+ */
+static struct config_group *group_children_make_group(
+		struct config_group *group, const char *name)
+{
+	struct simple_children *children;
+
+	children = kzalloc(sizeof(*children), GFP_KERNEL);
+	if (children) {
+		config_group_init_type_name(&children->group, name,
+					    &simple_children_type);
+		return &children->group;
+	}
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/* Emit the fixed help text of the 03-group-children subsystem into @page. */
+static ssize_t group_children_description_show(struct config_item *item,
+		char *page)
+{
+	static const char text[] =
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups. These\n"
+"groups are like the subsystem simple-children.\n";
+
+	return sprintf(page, "%s", text);
+}
+
+/* Read-only attribute object built from group_children_description_show. */
+CONFIGFS_ATTR_RO(group_children_, description);
+
+/* NULL-terminated attribute list referenced by group_children_type. */
+static struct configfs_attribute *group_children_attrs[] = {
+ &group_children_attr_description,
+ NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+ .make_group = group_children_make_group,
+};
+
+/* Type of the subsystem root: it creates child groups and has no item ops. */
+static const struct config_item_type group_children_type = {
+ .ct_group_ops = &group_children_group_ops,
+ .ct_attrs = group_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/* Statically allocated subsystem instance, named "03-group-children". */
+static struct configfs_subsystem group_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "03-group-children",
+ .ci_type = &group_children_type,
+ },
+ },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all. It
+ * allows the init function to easily register them. Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+/* NULL-terminated list walked by the module init and exit functions. */
+static struct configfs_subsystem *example_subsys[] = {
+ &childless_subsys.subsys,
+ &simple_children_subsys,
+ &group_children_subsys,
+ NULL,
+};
+
+/*
+ * Initialise and register every subsystem listed in example_subsys[].
+ * If one registration fails, the subsystems registered so far are
+ * unregistered in reverse order and the error is returned.
+ */
+static int __init configfs_example_init(void)
+{
+	int idx = 0;
+	int err;
+
+	while (example_subsys[idx]) {
+		struct configfs_subsystem *s = example_subsys[idx];
+
+		config_group_init(&s->su_group);
+		mutex_init(&s->su_mutex);
+		err = configfs_register_subsystem(s);
+		if (err) {
+			pr_err("Error %d while registering subsystem %s\n",
+			       err, s->su_group.cg_item.ci_namebuf);
+			/* Undo entries idx-1 .. 0; entry idx never registered. */
+			while (idx-- > 0)
+				configfs_unregister_subsystem(example_subsys[idx]);
+			return err;
+		}
+		idx++;
+	}
+
+	return 0;
+}
+
+/* Unregister every subsystem listed in example_subsys[], in list order. */
+static void __exit configfs_example_exit(void)
+{
+	struct configfs_subsystem **cursor;
+
+	for (cursor = example_subsys; *cursor; cursor++)
+		configfs_unregister_subsystem(*cursor);
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_DESCRIPTION("Sample configfs module");
+MODULE_LICENSE("GPL");
diff --git a/samples/connector/.gitignore b/samples/connector/.gitignore
new file mode 100644
index 000000000000..0e26039f39b5
--- /dev/null
+++ b/samples/connector/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/ucon
diff --git a/samples/connector/Makefile b/samples/connector/Makefile
new file mode 100644
index 000000000000..d98a9e047c11
--- /dev/null
+++ b/samples/connector/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o
+
+userprogs-always-$(CONFIG_CC_CAN_LINK) += ucon
+
+userccflags += -I usr/include
diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c
new file mode 100644
index 000000000000..73d50b4aebb6
--- /dev/null
+++ b/samples/connector/cn_test.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * cn_test.c
+ *
+ * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net>
+ * All rights reserved.
+ */
+
+#define pr_fmt(fmt) "cn_test: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+
+#include <linux/connector.h>
+
+/* Connector id {idx, val} under which this module registers its callback. */
+static struct cb_id cn_test_id = { CN_NETLINK_USERS + 3, 0x456 };
+static char cn_test_name[] = "cn_test";
+/* NOTE(review): nothing in this file assigns nls, so it appears to stay NULL. */
+static struct sock *nls;
+/* Periodic timer driving cn_test_timer_func(). */
+static struct timer_list cn_test_timer;
+
+/*
+ * Connector callback: log the id, sequence numbers, length and payload of
+ * every message delivered to the ids this module registered.
+ *
+ * The payload is printed with an explicit precision of msg->len so the
+ * print never reads past the message when the sender did not
+ * NUL-terminate its data.
+ */
+static void cn_test_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+{
+	pr_info("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %.*s.\n",
+		__func__, jiffies, msg->id.idx, msg->id.val,
+		msg->seq, msg->ack, msg->len,
+		(int)msg->len, (char *)msg->data);
+}
+
+/*
+ * Do not remove this function even if no one is using it as
+ * this is an example of how to get notifications about new
+ * connector user registration
+ */
+#if 0
+/*
+ * Build a netlink message holding a cn_msg, a cn_ctl_msg and three
+ * cn_notify_req ranges (one idx range, two val ranges) and unicast it.
+ * Returns 0 once the message has been handed to netlink_unicast(),
+ * -ENOMEM if the skb cannot be allocated, or -EMSGSIZE if nlmsg_put()
+ * fails.  Currently compiled out by the surrounding #if 0.
+ */
+static int cn_test_want_notify(void)
+{
+ struct cn_ctl_msg *ctl;
+ struct cn_notify_req *req;
+ struct cn_msg *msg = NULL;
+ int size, size0;
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ u32 group = 1;
+
+ /* Payload: cn_msg header, control header, then three request ranges. */
+ size0 = sizeof(*msg) + sizeof(*ctl) + 3 * sizeof(*req);
+
+ size = NLMSG_SPACE(size0);
+
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb) {
+ pr_err("failed to allocate new skb with size=%u\n", size);
+ return -ENOMEM;
+ }
+
+ nlh = nlmsg_put(skb, 0, 0x123, NLMSG_DONE, size - sizeof(*nlh), 0);
+ if (!nlh) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ msg = nlmsg_data(nlh);
+
+ memset(msg, 0, size0);
+
+ msg->id.idx = -1;
+ msg->id.val = -1;
+ msg->seq = 0x123;
+ msg->ack = 0x345;
+ msg->len = size0 - sizeof(*msg);
+
+ /* The control header sits directly after the cn_msg header. */
+ ctl = (struct cn_ctl_msg *)(msg + 1);
+
+ ctl->idx_notify_num = 1;
+ ctl->val_notify_num = 2;
+ ctl->group = group;
+ ctl->len = msg->len - sizeof(*ctl);
+
+ /* The request ranges follow the control header. */
+ req = (struct cn_notify_req *)(ctl + 1);
+
+ /*
+ * Idx.
+ */
+ req->first = cn_test_id.idx;
+ req->range = 10;
+
+ /*
+ * Val 0.
+ */
+ req++;
+ req->first = cn_test_id.val;
+ req->range = 10;
+
+ /*
+ * Val 1.
+ */
+ req++;
+ req->first = cn_test_id.val + 20;
+ req->range = 10;
+
+ NETLINK_CB(skb).dst_group = ctl->group;
+ //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC);
+ /* NOTE(review): nls is never assigned in this file — confirm before enabling. */
+ netlink_unicast(nls, skb, 0, 0);
+
+ pr_info("request was sent: group=0x%x\n", ctl->group);
+
+ return 0;
+}
+#endif
+
+static u32 cn_test_timer_counter;
+/*
+ * Timer callback: send one connector message carrying the text
+ * "counter = <n>" (including its terminating NUL) under cn_test_id, bump
+ * the counter and re-arm the timer to fire again in one second.
+ * An allocation failure skips the send; the counter and timer still advance.
+ */
+static void cn_test_timer_func(struct timer_list *unused)
+{
+	struct cn_msg *m;
+	char data[32];
+
+	pr_debug("%s: timer fired\n", __func__);
+
+	m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC);
+	if (m) {
+		memcpy(&m->id, &cn_test_id, sizeof(m->id));
+		m->seq = cn_test_timer_counter;
+
+		/* +1 so that the terminating NUL is part of the payload. */
+		m->len = scnprintf(data, sizeof(data), "counter = %u",
+				   cn_test_timer_counter) + 1;
+
+		/* The payload lives directly behind the cn_msg header. */
+		memcpy(m + 1, data, m->len);
+
+		cn_netlink_send(m, 0, 0, GFP_ATOMIC);
+		kfree(m);
+	}
+
+	cn_test_timer_counter++;
+
+	mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
+}
+
+/*
+ * Module init: register cn_test_callback for two consecutive ids
+ * ({idx, val} and {idx, val + 1}) and start the one-second timer.
+ *
+ * Returns 0 on success or the cn_add_callback() error.  When the second
+ * registration fails, the first one is removed again so that no callback
+ * stays registered after a failed load.
+ */
+static int cn_test_init(void)
+{
+	int err;
+
+	err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback);
+	if (err)
+		goto err_out;
+	cn_test_id.val++;
+	err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback);
+	if (err) {
+		/*
+		 * The id with the incremented val was never registered;
+		 * step back to the first id, which is the one to delete.
+		 */
+		cn_test_id.val--;
+		cn_del_callback(&cn_test_id);
+		goto err_out;
+	}
+
+	timer_setup(&cn_test_timer, cn_test_timer_func, 0);
+	mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
+
+	pr_info("initialized with id={%u.%u}\n",
+		cn_test_id.idx, cn_test_id.val);
+
+	return 0;
+
+      err_out:
+	if (nls && nls->sk_socket)
+		sock_release(nls->sk_socket);
+
+	return err;
+}
+
+/*
+ * Module exit: stop the timer, then remove both registered callbacks,
+ * the {idx, val + 1} id first and the original {idx, val} id second.
+ */
+static void cn_test_fini(void)
+{
+	timer_delete_sync(&cn_test_timer);
+
+	cn_del_callback(&cn_test_id);
+	cn_test_id.val--;
+	cn_del_callback(&cn_test_id);
+
+	if (!nls || !nls->sk_socket)
+		return;
+
+	sock_release(nls->sk_socket);
+}
+
+module_init(cn_test_init);
+module_exit(cn_test_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
+MODULE_DESCRIPTION("Connector's test module");
diff --git a/samples/connector/ucon.c b/samples/connector/ucon.c
new file mode 100644
index 000000000000..fa17f864200e
--- /dev/null
+++ b/samples/connector/ucon.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * ucon.c
+ *
+ * Copyright (c) 2004+ Evgeniy Polyakov <zbr@ioremap.net>
+ */
+
+#include <asm/types.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <arpa/inet.h>
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <getopt.h>
+
+#include <linux/connector.h>
+
+#define DEBUG
+#define NETLINK_CONNECTOR 11
+
+/* Hopefully your userspace connector.h matches this kernel */
+/* Parenthesized so the sum stays intact inside any larger expression. */
+#define CN_TEST_IDX (CN_NETLINK_USERS + 3)
+#define CN_TEST_VAL 0x456
+
+/* ulog() prints to stdout when DEBUG is defined and expands to nothing otherwise. */
+#ifdef DEBUG
+#define ulog(f, a...) fprintf(stdout, f, ##a)
+#else
+#define ulog(f, a...) do {} while (0)
+#endif
+
+/* Set by main()'s receive loop to leave the loop. */
+static int need_exit;
+/* Running sequence number, incremented for every message and netlink header. */
+static __u32 seq;
+
+/*
+ * Wrap @msg (the cn_msg header plus msg->len payload bytes) in a netlink
+ * header and send it on socket @s.
+ *
+ * Returns the send() result, or -1 with errno set to EMSGSIZE when the
+ * framed message would not fit into the local staging buffer.
+ */
+static int netlink_send(int s, struct cn_msg *msg)
+{
+	struct nlmsghdr *nlh;
+	unsigned int size;
+	int err;
+	char buf[128];
+	struct cn_msg *m;
+
+	size = NLMSG_SPACE(sizeof(struct cn_msg) + msg->len);
+	if (size > sizeof(buf)) {
+		/* Refuse rather than overflow buf in the memcpy() below. */
+		ulog("Message too large: %u > %zu bytes.\n",
+		     size, sizeof(buf));
+		errno = EMSGSIZE;
+		return -1;
+	}
+
+	/* Keep uninitialized stack bytes out of the alignment padding. */
+	memset(buf, 0, sizeof(buf));
+
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_seq = seq++;
+	nlh->nlmsg_pid = getpid();
+	nlh->nlmsg_type = NLMSG_DONE;
+	nlh->nlmsg_len = size;
+	nlh->nlmsg_flags = 0;
+
+	m = NLMSG_DATA(nlh);
+#if 0
+	ulog("%s: [%08x.%08x] len=%u, seq=%u, ack=%u.\n",
+	       __func__, msg->id.idx, msg->id.val, msg->len, msg->seq, msg->ack);
+#endif
+	memcpy(m, msg, sizeof(*m) + msg->len);
+
+	err = send(s, nlh, size, 0);
+	if (err == -1)
+		ulog("Failed to send: %s [%d].\n",
+			strerror(errno), errno);
+
+	return err;
+}
+
+/* Print the command-line help, including the expected test-module id, to stdout. */
+static void usage(void)
+{
+	static const char help_fmt[] =
+		"Usage: ucon [options] [output file]\n"
+		"\n"
+		"\t-h\tthis help screen\n"
+		"\t-s\tsend buffers to the test module\n"
+		"\n"
+		"The default behavior of ucon is to subscribe to the test module\n"
+		"and wait for state messages.  Any ones received are dumped to the\n"
+		"specified output file (or stdout).  The test module is assumed to\n"
+		"have an id of {%u.%u}\n"
+		"\n"
+		"If you get no output, then verify the cn_test module id matches\n"
+		"the expected id above.\n";
+
+	printf(help_fmt, CN_TEST_IDX, CN_TEST_VAL);
+}
+
+/*
+ * ucon entry point.
+ *
+ * With -s, send 10 batches of 1000 empty connector messages to the test
+ * module id and exit.  Otherwise bind to all netlink groups on the
+ * connector socket and append one line per received NLMSG_DONE or
+ * NLMSG_ERROR message to the output file given as the first non-option
+ * argument (stdout when absent or when it cannot be opened).
+ *
+ * Returns 0 on normal exit, 1 on bad options, -1 on socket errors.
+ */
+int main(int argc, char *argv[])
+{
+	int s;
+	char buf[1024];
+	int len;
+	struct nlmsghdr *reply;
+	struct sockaddr_nl l_local;
+	struct cn_msg *data;
+	FILE *out;
+	time_t tm;
+	struct pollfd pfd;
+	bool send_msgs = false;
+
+	while ((s = getopt(argc, argv, "hs")) != -1) {
+		switch (s) {
+		case 's':
+			send_msgs = true;
+			break;
+
+		case 'h':
+			usage();
+			return 0;
+
+		default:
+			/* getopt() outputs an error for us */
+			usage();
+			return 1;
+		}
+	}
+
+	if (argc != optind) {
+		out = fopen(argv[optind], "a+");
+		if (!out) {
+			/* Report the file we tried to open, not argv[1]. */
+			ulog("Unable to open %s for writing: %s\n",
+				argv[optind], strerror(errno));
+			out = stdout;
+		}
+	} else
+		out = stdout;
+
+	memset(buf, 0, sizeof(buf));
+
+	s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+	if (s == -1) {
+		perror("socket");
+		return -1;
+	}
+
+	/* Zero first so that nl_pad is not passed to bind() uninitialized. */
+	memset(&l_local, 0, sizeof(l_local));
+	l_local.nl_family = AF_NETLINK;
+	l_local.nl_groups = -1; /* bitmask of requested groups */
+	l_local.nl_pid = 0;
+
+	ulog("subscribing to %u.%u\n", CN_TEST_IDX, CN_TEST_VAL);
+
+	if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
+		perror("bind");
+		close(s);
+		return -1;
+	}
+
+#if 0
+	{
+		int on = 0x57;	/* Additional group number */
+		setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on));
+	}
+#endif
+	if (send_msgs) {
+		int i, j;
+
+		memset(buf, 0, sizeof(buf));
+
+		data = (struct cn_msg *)buf;
+
+		data->id.idx = CN_TEST_IDX;
+		data->id.val = CN_TEST_VAL;
+		data->seq = seq++;
+		data->ack = 0;
+		data->len = 0;
+
+		for (j=0; j<10; ++j) {
+			for (i=0; i<1000; ++i) {
+				len = netlink_send(s, data);
+			}
+
+			ulog("%d messages have been sent to %08x.%08x.\n", i, data->id.idx, data->id.val);
+		}
+
+		/* Release the socket on this exit path as well. */
+		close(s);
+		return 0;
+	}
+
+
+	pfd.fd = s;
+
+	while (!need_exit) {
+		pfd.events = POLLIN;
+		pfd.revents = 0;
+		switch (poll(&pfd, 1, -1)) {
+			case 0:
+				need_exit = 1;
+				break;
+			case -1:
+				/* Retry when interrupted; give up on any other error. */
+				if (errno != EINTR) {
+					need_exit = 1;
+					break;
+				}
+				continue;
+		}
+		if (need_exit)
+			break;
+
+		memset(buf, 0, sizeof(buf));
+		len = recv(s, buf, sizeof(buf), 0);
+		if (len == -1) {
+			perror("recv buf");
+			close(s);
+			return -1;
+		}
+		reply = (struct nlmsghdr *)buf;
+
+		switch (reply->nlmsg_type) {
+		case NLMSG_ERROR:
+			fprintf(out, "Error message received.\n");
+			fflush(out);
+			break;
+		case NLMSG_DONE:
+			data = (struct cn_msg *)NLMSG_DATA(reply);
+
+			time(&tm);
+			fprintf(out, "%.24s : [%x.%x] [%08u.%08u].\n",
+				ctime(&tm), data->id.idx, data->id.val, data->seq, data->ack);
+			fflush(out);
+			break;
+		default:
+			break;
+		}
+	}
+
+	close(s);
+	return 0;
+}
diff --git a/samples/coresight/Makefile b/samples/coresight/Makefile
new file mode 100644
index 000000000000..b3fce4af2347
--- /dev/null
+++ b/samples/coresight/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight-cfg-sample.o
+ccflags-y += -I$(srctree)/drivers/hwtracing/coresight
diff --git a/samples/coresight/coresight-cfg-sample.c b/samples/coresight/coresight-cfg-sample.c
new file mode 100644
index 000000000000..25485c80b5e3
--- /dev/null
+++ b/samples/coresight/coresight-cfg-sample.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright(C) 2020 Linaro Limited. All rights reserved.
+ * Author: Mike Leach <mike.leach@linaro.org>
+ */
+
+#include "coresight-config.h"
+#include "coresight-syscfg.h"
+
+/* create an alternate autofdo configuration */
+
+/* we will provide 4 sets of preset parameter values */
+#define AFDO2_NR_PRESETS 4
+/* the total number of parameters in used features - strobing has 2 */
+#define AFDO2_NR_PARAM_SUM 2
+
+/* Names of the features this configuration references. */
+static const char *afdo2_ref_names[] = {
+ "strobing",
+};
+
+/*
+ * set of presets leaves strobing window constant while varying period to allow
+ * experimentation with mark / space ratios for various workloads
+ */
+static u64 afdo2_presets[AFDO2_NR_PRESETS][AFDO2_NR_PARAM_SUM] = {
+ { 1000, 100 },
+ { 1000, 1000 },
+ { 1000, 5000 },
+ { 1000, 10000 },
+};
+
+/* The "autofdo2" configuration descriptor tying the tables above together. */
+struct cscfg_config_desc afdo2 = {
+ .name = "autofdo2",
+ .description = "Setup ETMs with strobing for autofdo\n"
+ "Supplied presets allow experimentation with mark-space ratio for various loads\n",
+ .nr_feat_refs = ARRAY_SIZE(afdo2_ref_names),
+ .feat_ref_names = afdo2_ref_names,
+ .nr_presets = AFDO2_NR_PRESETS,
+ .nr_total_params = AFDO2_NR_PARAM_SUM,
+ .presets = &afdo2_presets[0][0],
+};
+
+/* This sample provides no features of its own: the list holds only NULL. */
+static struct cscfg_feature_desc *sample_feats[] = {
+ NULL
+};
+
+/* NULL-terminated list of configurations handed to the load call. */
+static struct cscfg_config_desc *sample_cfgs[] = {
+ &afdo2,
+ NULL
+};
+
+/* Owner record passed to both the load and the unload call. */
+static struct cscfg_load_owner_info mod_owner = {
+ .type = CSCFG_OWNER_MODULE,
+ .owner_handle = THIS_MODULE,
+};
+
+/* module init and exit - just load and unload configs */
+/* Module init: load the sample configuration set on behalf of this module. */
+static int __init cscfg_sample_init(void)
+{
+	int err;
+
+	err = cscfg_load_config_sets(sample_cfgs, sample_feats, &mod_owner);
+	return err;
+}
+
+/* Module exit: unload whatever was loaded under mod_owner. */
+static void __exit cscfg_sample_exit(void)
+{
+ cscfg_unload_config_sets(&mod_owner);
+}
+
+module_init(cscfg_sample_init);
+module_exit(cscfg_sample_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Mike Leach <mike.leach@linaro.org>");
+MODULE_DESCRIPTION("CoreSight Syscfg Example");
diff --git a/samples/damon/Kconfig b/samples/damon/Kconfig
new file mode 100644
index 000000000000..cbf96fd8a8bf
--- /dev/null
+++ b/samples/damon/Kconfig
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "DAMON Samples"
+
+config SAMPLE_DAMON_WSSE
+ bool "DAMON sample module for working set size estimation"
+ depends on DAMON && DAMON_VADDR
+ help
+ This builds DAMON sample module for working set size estimation.
+
+ The module receives a pid, monitors accesses to the virtual address
+ space of the process, estimates the working set size of the process,
+ and repeatedly prints the size on the kernel log.
+
+ If unsure, say N.
+
+config SAMPLE_DAMON_PRCL
+ bool "DAMON sample module for access-aware proactive reclamation"
+ depends on DAMON && DAMON_VADDR
+ help
+ This builds DAMON sample module for access-aware proactive
+ reclamation.
+
+ The module receives a pid, monitors accesses to the virtual address
+ space of the process, finds memory regions that are not accessed, and
+ proactively reclaims the regions.
+
+ If unsure, say N.
+
+config SAMPLE_DAMON_MTIER
+ bool "DAMON sample module for memory tiering"
+ depends on DAMON && DAMON_PADDR
+ help
+ This builds DAMON sample module for memory tiering.
+
+ The module assumes the system is constructed with two NUMA nodes,
+ which are seen as local and remote nodes by all CPUs. For example,
+ node0 is for DDR5 DRAMs connected via DIMM, while node1 is for DDR4
+ DRAMs connected via CXL.
+
+ If unsure, say N.
+
+endmenu
diff --git a/samples/damon/Makefile b/samples/damon/Makefile
new file mode 100644
index 000000000000..72f68cbf422a
--- /dev/null
+++ b/samples/damon/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_SAMPLE_DAMON_WSSE) += wsse.o
+obj-$(CONFIG_SAMPLE_DAMON_PRCL) += prcl.o
+obj-$(CONFIG_SAMPLE_DAMON_MTIER) += mtier.o
diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c
new file mode 100644
index 000000000000..775838a23d93
--- /dev/null
+++ b/samples/damon/mtier.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * memory tiering: migrate cold pages in node 0 and hot pages in node 1 to node
+ * 1 and node 0, respectively. Adjust the hotness/coldness threshold aiming
+ * resulting 99.6 % node 0 utilization ratio.
+ */
+
+#define pr_fmt(fmt) "damon_sample_mtier: " fmt
+
+#include <linux/damon.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_sample_mtier."
+
+/*
+ * Address range of node 0, used when detect_node_addresses is false.
+ */
+static unsigned long node0_start_addr __read_mostly;
+module_param(node0_start_addr, ulong, 0600);
+
+static unsigned long node0_end_addr __read_mostly;
+module_param(node0_end_addr, ulong, 0600);
+
+/*
+ * Address range of node 1, used when detect_node_addresses is false.
+ */
+static unsigned long node1_start_addr __read_mostly;
+module_param(node1_start_addr, ulong, 0600);
+
+static unsigned long node1_end_addr __read_mostly;
+module_param(node1_end_addr, ulong, 0600);
+
+/* Quota-goal target value for the promoting (hot page) scheme. */
+static unsigned long node0_mem_used_bp __read_mostly = 9970;
+module_param(node0_mem_used_bp, ulong, 0600);
+
+/* Quota-goal target value for the demoting (cold page) scheme. */
+static unsigned long node0_mem_free_bp __read_mostly = 50;
+module_param(node0_mem_free_bp, ulong, 0600);
+
+static int damon_sample_mtier_enable_store(
+ const char *val, const struct kernel_param *kp);
+
+/* Writes to "enabled" go through the store handler; reads use the bool getter. */
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_sample_mtier_enable_store,
+ .get = param_get_bool,
+};
+
+static bool enabled __read_mostly;
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled, "Enable or disable DAMON_SAMPLE_MTIER");
+
+/* When true, the ranges come from nid_to_phys() instead of the parameters. */
+static bool detect_node_addresses __read_mostly;
+module_param(detect_node_addresses, bool, 0600);
+
+/* ctxs[0] is the promoting context, ctxs[1] the demoting one. */
+static struct damon_ctx *ctxs[2];
+
+/* A physical address range as [start, end) or [start, end] — TODO confirm. */
+struct region_range {
+ phys_addr_t start;
+ phys_addr_t end;
+};
+
+/*
+ * Fill @range with the physical addresses of the first and end PFN of NUMA
+ * node @target_node.  Returns 0, or -EINVAL if the node is not online.
+ */
+static int nid_to_phys(int target_node, struct region_range *range)
+{
+	if (node_online(target_node)) {
+		range->start = PFN_PHYS(node_start_pfn(target_node));
+		range->end = PFN_PHYS(node_end_pfn(target_node));
+		return 0;
+	}
+
+	pr_err("NUMA node %d is not online\n", target_node);
+	return -EINVAL;
+}
+
+/*
+ * Build one physical-address DAMON context with a single migration scheme.
+ *
+ * @promote: true builds the context that watches node 1's range and uses
+ * DAMOS_MIGRATE_HOT with target node 0; false builds the one that watches
+ * node 0's range and uses DAMOS_MIGRATE_COLD with target node 1.
+ *
+ * Returns the new context, or NULL on any failure (the partially built
+ * context is destroyed before returning).
+ */
+static struct damon_ctx *damon_sample_mtier_build_ctx(bool promote)
+{
+ struct damon_ctx *ctx;
+ struct damon_attrs attrs;
+ struct damon_target *target;
+ struct damon_region *region;
+ struct damos *scheme;
+ struct damos_quota_goal *quota_goal;
+ struct damos_filter *filter;
+ struct region_range addr;
+ int ret;
+
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return NULL;
+ /* Initial intervals: 5 ms sampling, 100 ms aggregation, 60 s ops update. */
+ attrs = (struct damon_attrs) {
+ .sample_interval = 5 * USEC_PER_MSEC,
+ .aggr_interval = 100 * USEC_PER_MSEC,
+ .ops_update_interval = 60 * USEC_PER_MSEC * MSEC_PER_SEC,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+ };
+
+ /*
+ * auto-tune sampling and aggregation interval aiming 4% DAMON-observed
+ * accesses ratio, keeping sampling interval in [5ms, 10s] range.
+ */
+ attrs.intervals_goal = (struct damon_intervals_goal) {
+ .access_bp = 400, .aggrs = 3,
+ .min_sample_us = 5000, .max_sample_us = 10000000,
+ };
+ if (damon_set_attrs(ctx, &attrs))
+ goto free_out;
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR))
+ goto free_out;
+
+ target = damon_new_target();
+ if (!target)
+ goto free_out;
+ damon_add_target(ctx, target);
+
+ /* Monitored range: node 1 when promoting, node 0 when demoting. */
+ if (detect_node_addresses) {
+ ret = promote ? nid_to_phys(1, &addr) : nid_to_phys(0, &addr);
+ if (ret)
+ goto free_out;
+ } else {
+ addr.start = promote ? node1_start_addr : node0_start_addr;
+ addr.end = promote ? node1_end_addr : node0_end_addr;
+ }
+
+ region = damon_new_region(addr.start, addr.end);
+ if (!region)
+ goto free_out;
+ damon_add_region(region, target);
+
+ scheme = damon_new_scheme(
+ /* access pattern */
+ &(struct damos_access_pattern) {
+ .min_sz_region = PAGE_SIZE,
+ .max_sz_region = ULONG_MAX,
+ .min_nr_accesses = promote ? 1 : 0,
+ .max_nr_accesses = promote ? UINT_MAX : 0,
+ .min_age_region = 0,
+ .max_age_region = UINT_MAX},
+ /* action */
+ promote ? DAMOS_MIGRATE_HOT : DAMOS_MIGRATE_COLD,
+ 1000000, /* apply interval (1s) */
+ &(struct damos_quota){
+ /* 200 MiB per sec by most */
+ .reset_interval = 1000,
+ .sz = 200 * 1024 * 1024,
+ /* ignore size of region when prioritizing */
+ .weight_sz = 0,
+ .weight_nr_accesses = 100,
+ .weight_age = 100,
+ },
+ &(struct damos_watermarks){},
+ promote ? 0 : 1); /* migrate target node id */
+ if (!scheme)
+ goto free_out;
+ damon_set_schemes(ctx, &scheme, 1);
+ /* Both directions steer on a node 0 metric: used bp or free bp. */
+ quota_goal = damos_new_quota_goal(
+ promote ? DAMOS_QUOTA_NODE_MEM_USED_BP :
+ DAMOS_QUOTA_NODE_MEM_FREE_BP,
+ promote ? node0_mem_used_bp : node0_mem_free_bp);
+ if (!quota_goal)
+ goto free_out;
+ quota_goal->nid = 0;
+ damos_add_quota_goal(&scheme->quota, quota_goal);
+ /* NOTE(review): meaning of the (true, promote) arguments not shown here — confirm against damon.h. */
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true, promote);
+ if (!filter)
+ goto free_out;
+ damos_add_filter(scheme, filter);
+ return ctx;
+free_out:
+ damon_destroy_ctx(ctx);
+ return NULL;
+}
+
+/*
+ * Build the promoting and demoting contexts and start both.
+ *
+ * Returns 0 on success, -ENOMEM if either context cannot be built, or the
+ * damon_start() error.  On every failure path the contexts built so far
+ * are destroyed, since the caller only clears "enabled" and never calls
+ * the stop function afterwards.
+ */
+static int damon_sample_mtier_start(void)
+{
+	struct damon_ctx *ctx;
+	int err;
+
+	ctx = damon_sample_mtier_build_ctx(true);
+	if (!ctx)
+		return -ENOMEM;
+	ctxs[0] = ctx;
+	ctx = damon_sample_mtier_build_ctx(false);
+	if (!ctx) {
+		damon_destroy_ctx(ctxs[0]);
+		return -ENOMEM;
+	}
+	ctxs[1] = ctx;
+
+	err = damon_start(ctxs, 2, true);
+	if (err) {
+		/*
+		 * damon_start() may have launched ctxs[0] before failing on
+		 * ctxs[1]; stop whatever is running before freeing both.
+		 */
+		damon_stop(ctxs, 2);
+		damon_destroy_ctx(ctxs[0]);
+		damon_destroy_ctx(ctxs[1]);
+	}
+	return err;
+}
+
+/* Stop both contexts, then destroy them in index order. */
+static void damon_sample_mtier_stop(void)
+{
+	int i;
+
+	damon_stop(ctxs, 2);
+	for (i = 0; i < 2; i++)
+		damon_destroy_ctx(ctxs[i]);
+}
+
+/*
+ * Setter of the "enabled" parameter: parse @val into "enabled" and start
+ * or stop the sample when the value actually changed.  Nothing is started
+ * or stopped while damon_initialized() is false.  A failed start resets
+ * "enabled" to false and returns the error.
+ */
+static int damon_sample_mtier_enable_store(
+		const char *val, const struct kernel_param *kp)
+{
+	const bool was_enabled = enabled;
+	int err = kstrtobool(val, &enabled);
+
+	if (err)
+		return err;
+
+	/* Unchanged value, or DAMON not ready yet: only the flag is kept. */
+	if (enabled == was_enabled || !damon_initialized())
+		return 0;
+
+	if (!enabled) {
+		damon_sample_mtier_stop();
+		return 0;
+	}
+
+	err = damon_sample_mtier_start();
+	if (err)
+		enabled = false;
+	return err;
+}
+
+/*
+ * Module init: start the sample if "enabled" was already set.
+ *
+ * Returns -ENOMEM (and clears "enabled") when DAMON is not initialized,
+ * the start error (and clears "enabled") when starting fails, 0 otherwise.
+ */
+static int __init damon_sample_mtier_init(void)
+{
+	int err;
+
+	if (!damon_initialized()) {
+		enabled = false;
+		return -ENOMEM;
+	}
+
+	if (!enabled)
+		return 0;
+
+	err = damon_sample_mtier_start();
+	if (err)
+		enabled = false;
+	/* Report the failure instead of silently dropping it. */
+	return err;
+}
+
+module_init(damon_sample_mtier_init);
diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c
new file mode 100644
index 000000000000..b7c50f2656ce
--- /dev/null
+++ b/samples/damon/prcl.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * proactive reclamation: monitor access pattern of a given process, find
+ * regions that seems not accessed, and proactively page out the regions.
+ */
+
+#define pr_fmt(fmt) "damon_sample_prcl: " fmt
+
+#include <linux/damon.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_sample_prcl."
+
+/* Pid of the process to monitor; looked up with find_get_pid() at start. */
+static int target_pid __read_mostly;
+module_param(target_pid, int, 0600);
+
+static int damon_sample_prcl_enable_store(
+ const char *val, const struct kernel_param *kp);
+
+/* Writes to "enabled" go through the store handler; reads use the bool getter. */
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_sample_prcl_enable_store,
+ .get = param_get_bool,
+};
+
+static bool enabled __read_mostly;
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled, "Enable or disable DAMON_SAMPLE_PRCL");
+
+/* The single context of this sample and the struct pid of its target. */
+static struct damon_ctx *ctx;
+static struct pid *target_pidp;
+
+/*
+ * damon_call() callback: for each target of the context passed in @data,
+ * sum the sizes of all regions with a non-zero access count and log the
+ * total as "wss".  Always returns 0.
+ */
+static int damon_sample_prcl_repeat_call_fn(void *data)
+{
+	struct damon_ctx *c = data;
+	struct damon_target *t;
+	struct damon_region *r;
+
+	damon_for_each_target(t, c) {
+		unsigned long wss = 0;
+
+		damon_for_each_region(r, t) {
+			unsigned long sz = r->ar.end - r->ar.start;
+
+			if (r->nr_accesses > 0)
+				wss += sz;
+		}
+		pr_info("wss: %lu\n", wss);
+	}
+	return 0;
+}
+
+/* Repeating damon_call() request; .data is set to the context at start. */
+static struct damon_call_control repeat_call_control = {
+ .fn = damon_sample_prcl_repeat_call_fn,
+ .repeat = true,
+};
+
+/*
+ * Build a virtual-address DAMON context for target_pid with one
+ * DAMOS_PAGEOUT scheme, start it and register the repeating callback.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL if the ops
+ * cannot be selected or the pid is not found, or the damon_start() /
+ * damon_call() error.  The context is destroyed on every failure up to and
+ * including a failed damon_start(), because the caller never invokes the
+ * stop function after a failed start.
+ */
+static int damon_sample_prcl_start(void)
+{
+	struct damon_target *target;
+	struct damos *scheme;
+	int err;
+
+	pr_info("start\n");
+
+	ctx = damon_new_ctx();
+	if (!ctx)
+		return -ENOMEM;
+	if (damon_select_ops(ctx, DAMON_OPS_VADDR)) {
+		damon_destroy_ctx(ctx);
+		return -EINVAL;
+	}
+
+	target = damon_new_target();
+	if (!target) {
+		damon_destroy_ctx(ctx);
+		return -ENOMEM;
+	}
+	damon_add_target(ctx, target);
+	target_pidp = find_get_pid(target_pid);
+	if (!target_pidp) {
+		damon_destroy_ctx(ctx);
+		return -EINVAL;
+	}
+	target->pid = target_pidp;
+
+	/* Match regions with zero observed accesses and an age of at least 50. */
+	scheme = damon_new_scheme(
+			&(struct damos_access_pattern) {
+			.min_sz_region = PAGE_SIZE,
+			.max_sz_region = ULONG_MAX,
+			.min_nr_accesses = 0,
+			.max_nr_accesses = 0,
+			.min_age_region = 50,
+			.max_age_region = UINT_MAX},
+			DAMOS_PAGEOUT,
+			0,
+			&(struct damos_quota){},
+			&(struct damos_watermarks){},
+			NUMA_NO_NODE);
+	if (!scheme) {
+		damon_destroy_ctx(ctx);
+		return -ENOMEM;
+	}
+	damon_set_schemes(ctx, &scheme, 1);
+
+	err = damon_start(&ctx, 1, true);
+	if (err) {
+		/* The context never started running; free it here. */
+		damon_destroy_ctx(ctx);
+		return err;
+	}
+
+	repeat_call_control.data = ctx;
+	return damon_call(ctx, &repeat_call_control);
+}
+
+/* Log "stop", then stop and destroy the context if one exists. */
+static void damon_sample_prcl_stop(void)
+{
+	pr_info("stop\n");
+
+	if (!ctx)
+		return;
+
+	damon_stop(&ctx, 1);
+	damon_destroy_ctx(ctx);
+}
+
+/*
+ * Setter of the "enabled" parameter: parse @val into "enabled" and start
+ * or stop the sample when the value actually changed.  Nothing is started
+ * or stopped while damon_initialized() is false.  A failed start resets
+ * "enabled" to false and returns the error.
+ */
+static int damon_sample_prcl_enable_store(
+		const char *val, const struct kernel_param *kp)
+{
+	const bool was_enabled = enabled;
+	int err = kstrtobool(val, &enabled);
+
+	if (err)
+		return err;
+
+	/* Unchanged value, or DAMON not ready yet: only the flag is kept. */
+	if (enabled == was_enabled || !damon_initialized())
+		return 0;
+
+	if (!enabled) {
+		damon_sample_prcl_stop();
+		return 0;
+	}
+
+	err = damon_sample_prcl_start();
+	if (err)
+		enabled = false;
+	return err;
+}
+
+/*
+ * Module init: start the sample if "enabled" was already set.
+ *
+ * Returns -ENOMEM (and clears "enabled") when DAMON is not initialized,
+ * the start error (and clears "enabled") when starting fails, 0 otherwise.
+ */
+static int __init damon_sample_prcl_init(void)
+{
+	int err;
+
+	if (!damon_initialized()) {
+		enabled = false;
+		return -ENOMEM;
+	}
+
+	if (!enabled)
+		return 0;
+
+	err = damon_sample_prcl_start();
+	if (err)
+		enabled = false;
+	/* Report the failure instead of silently dropping it. */
+	return err;
+}
+
+module_init(damon_sample_prcl_init);
diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c
new file mode 100644
index 000000000000..799ad4443943
--- /dev/null
+++ b/samples/damon/wsse.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * working set size estimation: monitor the access pattern of a given process
+ * and print the estimated working set size (total size of the regions that
+ * show some access).
+ */
+
+#define pr_fmt(fmt) "damon_sample_wsse: " fmt
+
+#include <linux/damon.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_sample_wsse."
+
+static int target_pid __read_mostly;	/* pid to monitor; resolved with find_get_pid() when the sample starts */
+module_param(target_pid, int, 0600);
+
+static int damon_sample_wsse_enable_store(
+ const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_sample_wsse_enable_store,	/* writes to "enabled" start or stop the sample */
+ .get = param_get_bool,
+};
+
+static bool enabled __read_mostly;
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled, "Enable or disable DAMON_SAMPLE_WSSE");
+
+static struct damon_ctx *ctx;	/* context created by damon_sample_wsse_start() */
+static struct pid *target_pidp;	/* struct pid looked up for target_pid at start */
+
+/*
+ * damon_call() callback: for every target of the context passed in @data, add
+ * up the sizes of the regions whose nr_accesses is above zero and log the sum
+ * as "wss".  Always returns 0.
+ */
+static int damon_sample_wsse_repeat_call_fn(void *data)
+{
+	struct damon_ctx *dctx = data;
+	struct damon_target *target;
+	struct damon_region *region;
+	unsigned long total;
+
+	damon_for_each_target(target, dctx) {
+		total = 0;
+
+		damon_for_each_region(region, target) {
+			if (region->nr_accesses > 0)
+				total += region->ar.end - region->ar.start;
+		}
+
+		pr_info("wss: %lu\n", total);
+	}
+
+	return 0;
+}
+
+static struct damon_call_control repeat_call_control = {	/* handed to damon_call() by damon_sample_wsse_start(), which also fills in .data */
+ .fn = damon_sample_wsse_repeat_call_fn,	/* logs the per-target working set size */
+ .repeat = true,	/* NOTE(review): presumably keeps .fn being invoked repeatedly - confirm against damon_call() */
+};
+
+/*
+ * Start the sample: create a DAMON context using the DAMON_OPS_VADDR
+ * operations, add one target for the process identified by target_pid, start
+ * the context and register repeat_call_control on it.
+ *
+ * Returns 0 on success.  On any failure the context created here is destroyed
+ * (after being stopped, if it had already been started), the file-level ctx
+ * pointer is reset to NULL so that it never dangles, and a negative error code
+ * is returned: -ENOMEM for allocation failures, -EINVAL when the operations
+ * set or the pid cannot be resolved, or the error reported by damon_start() /
+ * damon_call().
+ */
+static int damon_sample_wsse_start(void)
+{
+	struct damon_target *target;
+	int err;
+
+	pr_info("start\n");
+
+	ctx = damon_new_ctx();
+	if (!ctx)
+		return -ENOMEM;
+	if (damon_select_ops(ctx, DAMON_OPS_VADDR)) {
+		err = -EINVAL;
+		goto out_destroy;
+	}
+
+	target = damon_new_target();
+	if (!target) {
+		err = -ENOMEM;
+		goto out_destroy;
+	}
+	damon_add_target(ctx, target);
+	target_pidp = find_get_pid(target_pid);
+	if (!target_pidp) {
+		err = -EINVAL;
+		goto out_destroy;
+	}
+	target->pid = target_pidp;
+
+	err = damon_start(&ctx, 1, true);
+	if (err)
+		goto out_destroy;
+
+	repeat_call_control.data = ctx;
+	err = damon_call(ctx, &repeat_call_control);
+	if (err) {
+		/*
+		 * The context was started above; stop it before freeing it,
+		 * the same way damon_sample_wsse_stop() tears it down.
+		 */
+		damon_stop(&ctx, 1);
+		goto out_destroy;
+	}
+	return 0;
+
+out_destroy:
+	damon_destroy_ctx(ctx);
+	ctx = NULL;
+	return err;
+}
+
+/*
+ * Stop the sample: log "stop" and, when a DAMON context pointer is set, stop
+ * that context and destroy it.
+ */
+static void damon_sample_wsse_stop(void)
+{
+	pr_info("stop\n");
+
+	if (!ctx)
+		return;
+
+	damon_stop(&ctx, 1);
+	damon_destroy_ctx(ctx);
+}
+
+/*
+ * "enabled" parameter setter.
+ *
+ * Parses @val as a boolean into "enabled".  When the value actually changed
+ * and DAMON is initialized, the sample is started (new value true) or stopped
+ * (new value false).
+ *
+ * Returns the kstrtobool() error for unparsable input, the error from
+ * damon_sample_wsse_start() when starting fails (in which case "enabled" is
+ * reset to false), and 0 otherwise.
+ */
+static int damon_sample_wsse_enable_store(
+		const char *val, const struct kernel_param *kp)
+{
+	const bool was_enabled = enabled;
+	int ret = kstrtobool(val, &enabled);
+
+	if (ret)
+		return ret;
+
+	/* Nothing to do for an unchanged value or before DAMON is ready. */
+	if (enabled == was_enabled || !damon_initialized())
+		return 0;
+
+	if (!enabled) {
+		damon_sample_wsse_stop();
+		return 0;
+	}
+
+	ret = damon_sample_wsse_start();
+	if (ret)
+		enabled = false;
+	return ret;
+}
+
+/*
+ * Module init: start the sample immediately when "enabled" is already true.
+ *
+ * Returns -ENOMEM (and leaves "enabled" false) when DAMON is not initialized,
+ * the error from damon_sample_wsse_start() when starting fails (again with
+ * "enabled" reset to false), and 0 otherwise.
+ */
+static int __init damon_sample_wsse_init(void)
+{
+	int ret;
+
+	if (!damon_initialized()) {
+		enabled = false;
+		return -ENOMEM;
+	}
+
+	if (!enabled)
+		return 0;
+
+	ret = damon_sample_wsse_start();
+	if (ret)
+		enabled = false;
+	return ret;
+}
+
+module_init(damon_sample_wsse_init);
diff --git a/samples/fanotify/.gitignore b/samples/fanotify/.gitignore
new file mode 100644
index 000000000000..d74593e8b2de
--- /dev/null
+++ b/samples/fanotify/.gitignore
@@ -0,0 +1 @@
+fs-monitor
diff --git a/samples/fanotify/Makefile b/samples/fanotify/Makefile
new file mode 100644
index 000000000000..e20db1bdde3b
--- /dev/null
+++ b/samples/fanotify/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+userprogs-always-y += fs-monitor
+
+userccflags += -I usr/include -Wall
+
diff --git a/samples/fanotify/fs-monitor.c b/samples/fanotify/fs-monitor.c
new file mode 100644
index 000000000000..28c0a652ffeb
--- /dev/null
+++ b/samples/fanotify/fs-monitor.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021, Collabora Ltd.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <err.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/fanotify.h>
+#include <sys/types.h>
+#include <unistd.h>
+#ifndef __GLIBC__
+#include <asm-generic/int-ll64.h>
+#endif
+
+#ifndef FAN_FS_ERROR
+#define FAN_FS_ERROR 0x00008000
+#define FAN_EVENT_INFO_TYPE_ERROR 5
+
+struct fanotify_event_info_error {
+ struct fanotify_event_info_header hdr;
+ __s32 error;
+ __u32 error_count;
+};
+#endif
+
+#ifndef FILEID_INO32_GEN
+#define FILEID_INO32_GEN 1
+#endif
+
+#ifndef FILEID_INVALID
+#define FILEID_INVALID 0xff
+#endif
+
+/*
+ * Print a file handle: first every byte of f_handle as two hex digits, then a
+ * decoded form chosen by handle_type.
+ *
+ * A FILEID_INO32_GEN handle is decoded as two 32-bit words (printed as inode
+ * and generation); this is only attempted when the handle really holds that
+ * many bytes.  A FILEID_INVALID handle with no bytes is reported as a
+ * superblock error; anything else is reported as unknown.
+ */
+static void print_fh(struct file_handle *fh)
+{
+	unsigned int i;
+	uint32_t *h = (uint32_t *) fh->f_handle;
+
+	printf("\tfh: ");
+	/* Zero-pad each byte so the dump can be split back into bytes. */
+	for (i = 0; i < fh->handle_bytes; i++)
+		printf("%02hhx", fh->f_handle[i]);
+	printf("\n");
+
+	printf("\tdecoded fh: ");
+	if (fh->handle_type == FILEID_INO32_GEN &&
+	    fh->handle_bytes >= 2 * sizeof(*h))
+		printf("inode=%u gen=%u\n", h[0], h[1]);
+	else if (fh->handle_type == FILEID_INVALID && !fh->handle_bytes)
+		printf("Type %d (Superblock error)\n", fh->handle_type);
+	else
+		printf("Type %d (Unknown)\n", fh->handle_type);
+}
+
+/*
+ * Walk the fanotify events contained in the first @len bytes of @buffer and
+ * print each one.
+ *
+ * Only FAN_FS_ERROR events without a file descriptor are decoded; for those,
+ * every info record that follows the event metadata is printed according to
+ * its info_type (generic error record, file identifier, or unknown).  A "---"
+ * separator is printed after every event, decoded or not.
+ */
+static void handle_notifications(char *buffer, int len)
+{
+	struct fanotify_event_metadata *event =
+		(struct fanotify_event_metadata *) buffer;
+	struct fanotify_event_info_header *info;
+	struct fanotify_event_info_error *err;
+	struct fanotify_event_info_fid *fid;
+	unsigned int off;
+
+	for (; FAN_EVENT_OK(event, len); event = FAN_EVENT_NEXT(event, len)) {
+
+		if (event->mask != FAN_FS_ERROR) {
+			printf("unexpected FAN MARK: %llx\n",
+			       (unsigned long long)event->mask);
+			goto next_event;
+		}
+
+		if (event->fd != FAN_NOFD) {
+			printf("Unexpected fd (!= FAN_NOFD)\n");
+			goto next_event;
+		}
+
+		printf("FAN_FS_ERROR (len=%u)\n", event->event_len);
+
+		for (off = sizeof(*event); off < event->event_len;
+		     off += info->len) {
+			info = (struct fanotify_event_info_header *)
+				((char *) event + off);
+
+			/*
+			 * A record shorter than its own header (in particular
+			 * len == 0) would never advance "off"; stop parsing
+			 * this event instead of looping forever.
+			 */
+			if (info->len < sizeof(*info)) {
+				printf("\tMalformed info record: len=%d\n",
+				       info->len);
+				break;
+			}
+
+			switch (info->info_type) {
+			case FAN_EVENT_INFO_TYPE_ERROR:
+				err = (struct fanotify_event_info_error *) info;
+
+				printf("\tGeneric Error Record: len=%d\n",
+				       err->hdr.len);
+				printf("\terror: %d\n", err->error);
+				/* error_count is unsigned (__u32). */
+				printf("\terror_count: %u\n", err->error_count);
+				break;
+
+			case FAN_EVENT_INFO_TYPE_FID:
+				fid = (struct fanotify_event_info_fid *) info;
+
+				printf("\tfsid: %x%x\n",
+#if defined(__GLIBC__)
+				       fid->fsid.val[0], fid->fsid.val[1]);
+#else
+				       fid->fsid.__val[0], fid->fsid.__val[1]);
+#endif
+				print_fh((struct file_handle *) &fid->handle);
+				break;
+
+			default:
+				printf("\tUnknown info type=%d len=%d:\n",
+				       info->info_type, info->len);
+				break;
+			}
+		}
+next_event:
+		printf("---\n\n");
+	}
+}
+
+/*
+ * Usage: fs-monitor <path>
+ *
+ * Opens a fanotify group that reports file identifiers, marks the filesystem
+ * containing <path> for FAN_FS_ERROR events, then reads and prints events
+ * forever.  Exits with status 1 if the path argument is missing or if any of
+ * the system calls fails; in the latter case err() also prints the errno
+ * description, so the cause of the failure is visible.
+ */
+int main(int argc, char **argv)
+{
+	int fd;
+	/* Events are accessed as structs, so give the buffer their alignment. */
+	char buffer[BUFSIZ]
+		__attribute__((aligned(__alignof__(struct fanotify_event_metadata))));
+
+	if (argc < 2) {
+		printf("Missing path argument\n");
+		return 1;
+	}
+
+	fd = fanotify_init(FAN_CLASS_NOTIF|FAN_REPORT_FID, O_RDONLY);
+	if (fd < 0)
+		err(1, "fanotify_init");
+
+	if (fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM,
+			  FAN_FS_ERROR, AT_FDCWD, argv[1])) {
+		err(1, "fanotify_mark");
+	}
+
+	while (1) {
+		int n = read(fd, buffer, sizeof(buffer));
+
+		if (n < 0)
+			err(1, "read");
+
+		handle_notifications(buffer, n);
+	}
+
+	return 0;
+}
diff --git a/samples/fprobe/Makefile b/samples/fprobe/Makefile
new file mode 100644
index 000000000000..ecccbfa6e99b
--- /dev/null
+++ b/samples/fprobe/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_SAMPLE_FPROBE) += fprobe_example.o
diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
new file mode 100644
index 000000000000..bfe98ce826f3
--- /dev/null
+++ b/samples/fprobe/fprobe_example.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Here's a sample kernel module showing the use of fprobe to dump a
+ * stack trace and selected registers when kernel_clone() is called.
+ *
+ * For more information on theory of operation of kprobes, see
+ * Documentation/trace/kprobes.rst
+ *
+ * You will see the trace data in /var/log/messages and on the console
+ * whenever kernel_clone() is invoked to create a new process.
+ */
+
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fprobe.h>
+#include <linux/sched/debug.h>
+#include <linux/slab.h>
+
+#define BACKTRACE_DEPTH 16
+#define MAX_SYMBOL_LEN 4096
+static struct fprobe sample_probe;
+static unsigned long nhit;
+
+static char symbol[MAX_SYMBOL_LEN] = "kernel_clone";
+module_param_string(symbol, symbol, sizeof(symbol), 0644);
+MODULE_PARM_DESC(symbol, "Probed symbol(s), given by comma separated symbols or a wildcard pattern.");
+
+static char nosymbol[MAX_SYMBOL_LEN] = "";
+module_param_string(nosymbol, nosymbol, sizeof(nosymbol), 0644);
+MODULE_PARM_DESC(nosymbol, "Not-probed symbols, given by a wildcard pattern.");
+
+static bool stackdump = true;
+module_param(stackdump, bool, 0644);
+MODULE_PARM_DESC(stackdump, "Enable stackdump.");
+
+static bool use_trace = false;
+module_param(use_trace, bool, 0644);
+MODULE_PARM_DESC(use_trace, "Use trace_printk instead of printk. This is only for debugging.");
+
+/*
+ * Capture up to BACKTRACE_DEPTH stack entries and print them.
+ *
+ * NOTE(review): the literal 2 and 24 are taken to be stack_trace_save()'s
+ * skip count and stack_trace_print()'s indentation - confirm against their
+ * prototypes.
+ */
+static void show_backtrace(void)
+{
+	unsigned long entries[BACKTRACE_DEPTH];
+	unsigned int nr_entries = stack_trace_save(entries, BACKTRACE_DEPTH, 2);
+
+	stack_trace_print(entries, nr_entries, 24);
+}
+
+/*
+ * Entry handler: report the probed address @ip (through trace_printk() when
+ * use_trace is set, pr_info() otherwise), count the hit and, if stackdump is
+ * set, print a backtrace.  Always returns 0.
+ */
+static int sample_entry_handler(struct fprobe *fp, unsigned long ip,
+				unsigned long ret_ip,
+				struct ftrace_regs *fregs, void *data)
+{
+	void *addr = (void *)ip;
+
+	if (!use_trace) {
+		pr_info("Enter <%pS> ip = 0x%p\n", addr, addr);
+	} else {
+		/*
+		 * This is just an example, no kernel code should call
+		 * trace_printk() except when actively debugging.
+		 */
+		trace_printk("Enter <%pS> ip = 0x%p\n", addr, addr);
+	}
+
+	nhit++;
+	if (stackdump)
+		show_backtrace();
+
+	return 0;
+}
+
+/*
+ * Exit handler: report the probed address @ip and the return address @ret_ip
+ * (through trace_printk() when use_trace is set, pr_info() otherwise), count
+ * the hit and, if stackdump is set, print a backtrace.
+ */
+static void sample_exit_handler(struct fprobe *fp, unsigned long ip,
+				unsigned long ret_ip, struct ftrace_regs *regs,
+				void *data)
+{
+	void *addr = (void *)ip;
+	void *ret_addr = (void *)ret_ip;
+
+	if (!use_trace) {
+		pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n",
+			addr, addr, ret_addr, ret_addr);
+	} else {
+		/*
+		 * This is just an example, no kernel code should call
+		 * trace_printk() except when actively debugging.
+		 */
+		trace_printk("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n",
+			     addr, addr, ret_addr, ret_addr);
+	}
+
+	nhit++;
+	if (stackdump)
+		show_backtrace();
+}
+
+/*
+ * Module init: install sample_probe on the functions named by the "symbol"
+ * parameter.
+ *
+ * Three forms of "symbol" are handled:
+ *  - it contains '*': it is used as a filter pattern together with the
+ *    optional "nosymbol" pattern;
+ *  - it contains no ',': it is registered as a single symbol;
+ *  - otherwise it is split at every ',' and the resulting names are
+ *    registered together.  Empty entries (as in ",a", "a,,b" or "a,") are
+ *    dropped, so every named symbol is kept whatever the comma placement.
+ *
+ * Returns 0 on success, -ENOMEM if a temporary buffer cannot be allocated,
+ * -EINVAL if the list holds nothing but commas, or the error returned by the
+ * registration call.
+ */
+static int __init fprobe_init(void)
+{
+	char *p, *tok, *symbuf = NULL;
+	const char **syms;
+	int ret, count;
+
+	sample_probe.entry_handler = sample_entry_handler;
+	sample_probe.exit_handler = sample_exit_handler;
+
+	if (strchr(symbol, '*')) {
+		/* filter based fprobe */
+		ret = register_fprobe(&sample_probe, symbol,
+				      nosymbol[0] == '\0' ? NULL : nosymbol);
+		goto out;
+	} else if (!strchr(symbol, ',')) {
+		symbuf = symbol;
+		ret = register_fprobe_syms(&sample_probe, (const char **)&symbuf, 1);
+		goto out;
+	}
+
+	/* Comma separated symbols */
+	symbuf = kstrdup(symbol, GFP_KERNEL);
+	if (!symbuf)
+		return -ENOMEM;
+
+	/* Upper bound for the array: one more entry than there are commas. */
+	count = 1;
+	for (p = symbuf; (p = strchr(p, ',')) != NULL; p++)
+		count++;
+
+	syms = kcalloc(count, sizeof(*syms), GFP_KERNEL);
+	if (!syms) {
+		kfree(symbuf);
+		return -ENOMEM;
+	}
+
+	/* Split in place, keeping only the non-empty names. */
+	p = symbuf;
+	count = 0;
+	while ((tok = strsep(&p, ",")) != NULL) {
+		if (*tok != '\0')
+			syms[count++] = tok;
+	}
+
+	pr_info("%d symbols found\n", count);
+
+	if (count)
+		ret = register_fprobe_syms(&sample_probe, syms, count);
+	else
+		ret = -EINVAL;
+	kfree(syms);
+	kfree(symbuf);
+out:
+	if (ret < 0)
+		pr_err("register_fprobe failed, returned %d\n", ret);
+	else
+		pr_info("Planted fprobe at %s\n", symbol);
+
+	return ret;
+}
+
+/*
+ * Module exit: unregister the probe and report how often it was hit and how
+ * often it was missed.
+ */
+static void __exit fprobe_exit(void)
+{
+	unregister_fprobe(&sample_probe);
+
+	/*
+	 * nhit is an unsigned long, so it is printed with %lu.
+	 * NOTE(review): nmissed is assumed to be unsigned long as well -
+	 * confirm against struct fprobe in <linux/fprobe.h>.
+	 */
+	pr_info("fprobe at %s unregistered. %lu times hit, %lu times missed\n",
+		symbol, nhit, sample_probe.nmissed);
+}
+
+module_init(fprobe_init)
+module_exit(fprobe_exit)
+MODULE_DESCRIPTION("sample kernel module showing the use of fprobe");
+MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile
new file mode 100644
index 000000000000..589baf2ec4e3
--- /dev/null
+++ b/samples/ftrace/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi.o
+obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi-modify.o
+obj-$(CONFIG_SAMPLE_FTRACE_OPS) += ftrace-ops.o
+
+CFLAGS_sample-trace-array.o := -I$(src)
+obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o
diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c
new file mode 100644
index 000000000000..da3a9f2091f5
--- /dev/null
+++ b/samples/ftrace/ftrace-direct-modify.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/ftrace.h>
+#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
+#include <asm/asm-offsets.h>
+#endif
+
+extern void my_direct_func1(void);	/* non-static: the asm trampolines below call these two by name */
+extern void my_direct_func2(void);
+
+void my_direct_func1(void)	/* C handler reached through my_tramp1 */
+{
+ trace_printk("my direct func1\n");
+}
+
+void my_direct_func2(void)	/* C handler reached through my_tramp2 */
+{
+ trace_printk("my direct func2\n");
+}
+
+extern void my_tramp1(void *);	/* defined in the per-architecture asm blocks below */
+extern void my_tramp2(void *);
+
+static unsigned long my_ip = (unsigned long)schedule;	/* address handed to ftrace_set_filter_ip() at init */
+
+#ifdef CONFIG_RISCV
+#include <asm/asm.h>
+
+asm (	/* RISC-V: my_tramp1/my_tramp2 call their C handler with t0 and ra preserved, then jump to t0 */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:\n"
+" addi sp,sp,-2*"SZREG"\n"	/* reserve two SZREG-sized stack slots */
+" "REG_S" t0,0*"SZREG"(sp)\n"	/* slot 0: t0 (REG_S/REG_L are presumably the register-width store/load) */
+" "REG_S" ra,1*"SZREG"(sp)\n"	/* slot 1: ra */
+" call my_direct_func1\n"
+" "REG_L" t0,0*"SZREG"(sp)\n"	/* reload t0 and ra from the same slots */
+" "REG_L" ra,1*"SZREG"(sp)\n"
+" addi sp,sp,2*"SZREG"\n"
+" jr t0\n"	/* leave through t0, not ra */
+" .size my_tramp1, .-my_tramp1\n"
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+
+" my_tramp2:\n"	/* same sequence as my_tramp1, calling my_direct_func2 */
+" addi sp,sp,-2*"SZREG"\n"
+" "REG_S" t0,0*"SZREG"(sp)\n"
+" "REG_S" ra,1*"SZREG"(sp)\n"
+" call my_direct_func2\n"
+" "REG_L" t0,0*"SZREG"(sp)\n"
+" "REG_L" ra,1*"SZREG"(sp)\n"
+" addi sp,sp,2*"SZREG"\n"
+" jr t0\n"
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_RISCV */
+
+#ifdef CONFIG_X86_64
+
+#include <asm/ibt.h>
+#include <asm/nospec-branch.h>
+
+asm (	/* x86-64: my_tramp1/my_tramp2 set up a frame, call their C handler and return */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:"
+ ASM_ENDBR	/* NOTE(review): ASM_ENDBR, CALL_DEPTH_ACCOUNT and ASM_RET presumably come from the <asm/ibt.h> and <asm/nospec-branch.h> includes above */
+" pushq %rbp\n"	/* frame setup; undone by "leave" */
+" movq %rsp, %rbp\n"
+ CALL_DEPTH_ACCOUNT
+" call my_direct_func1\n"
+" leave\n"
+ ASM_RET
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:"	/* same sequence as my_tramp1, calling my_direct_func2 */
+ ASM_ENDBR
+" pushq %rbp\n"
+" movq %rsp, %rbp\n"
+ CALL_DEPTH_ACCOUNT
+" call my_direct_func2\n"
+" leave\n"
+ ASM_RET
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_S390
+
+asm (	/* s390: my_tramp1/my_tramp2 call their C handler with r0-r5 and r14 preserved, then branch to the address in r0 */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:"
+" lgr %r1,%r15\n"	/* keep the incoming stack pointer in r1 */
+" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"	/* save r0-r5 */
+" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"	/* save r14 */
+" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"	/* open a new stack frame */
+" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"	/* store the old stack pointer at the back-chain offset */
+" brasl %r14,my_direct_func1\n"
+" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"	/* drop the frame again */
+" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"	/* restore r0-r5 and r14 */
+" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" lgr %r1,%r0\n"	/* continue at the address held in r0 */
+" br %r1\n"
+" .size my_tramp1, .-my_tramp1\n"
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:"	/* same sequence as my_tramp1, calling my_direct_func2 */
+" lgr %r1,%r15\n"
+" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+" brasl %r14,my_direct_func2\n"
+" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" lgr %r1,%r0\n"
+" br %r1\n"
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_S390 */
+
+#ifdef CONFIG_ARM64
+
+asm (	/* arm64: my_tramp1/my_tramp2 call their C handler, then return to the address that arrived in x30 */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:"
+" hint 34\n" // bti c
+" sub sp, sp, #16\n"
+" stp x9, x30, [sp]\n"	/* save x9 then x30 */
+" bl my_direct_func1\n"
+" ldp x30, x9, [sp]\n"	/* reload swapped on purpose: x30 gets the saved x9, x9 gets the saved x30 */
+" add sp, sp, #16\n"
+" ret x9\n"	/* branch to the value that was in x30 on entry */
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:"	/* same sequence as my_tramp1, calling my_direct_func2 */
+" hint 34\n" // bti c
+" sub sp, sp, #16\n"
+" stp x9, x30, [sp]\n"
+" bl my_direct_func2\n"
+" ldp x30, x9, [sp]\n"
+" add sp, sp, #16\n"
+" ret x9\n"
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_ARM64 */
+
+#ifdef CONFIG_LOONGARCH
+
+asm (	/* LoongArch: my_tramp1/my_tramp2 call their C handler with $t0 and $ra preserved, then jump to $t0 */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:\n"
+" addi.d $sp, $sp, -16\n"	/* reserve 16 bytes of stack */
+" st.d $t0, $sp, 0\n"	/* save $t0 and $ra */
+" st.d $ra, $sp, 8\n"
+" bl my_direct_func1\n"
+" ld.d $t0, $sp, 0\n"	/* reload both from the same slots */
+" ld.d $ra, $sp, 8\n"
+" addi.d $sp, $sp, 16\n"
+" jr $t0\n"	/* leave through $t0, not $ra */
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:\n"	/* same sequence as my_tramp1, calling my_direct_func2 */
+" addi.d $sp, $sp, -16\n"
+" st.d $t0, $sp, 0\n"
+" st.d $ra, $sp, 8\n"
+" bl my_direct_func2\n"
+" ld.d $t0, $sp, 0\n"
+" ld.d $ra, $sp, 8\n"
+" addi.d $sp, $sp, 16\n"
+" jr $t0\n"
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_LOONGARCH */
+
+#ifdef CONFIG_PPC
+#include <asm/ppc_asm.h>
+
+#ifdef CONFIG_PPC64
+#define STACK_FRAME_SIZE 48
+#else
+#define STACK_FRAME_SIZE 24
+#endif
+
+#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL)
+#define PPC64_TOC_SAVE_AND_UPDATE \
+" std 2, 24(1)\n" \
+" bcl 20, 31, 1f\n" \
+" 1: mflr 12\n" \
+" ld 2, (99f - 1b)(12)\n"	/* r2 = the .quad stored at label 99, addressed relative to label 1 */
+#define PPC64_TOC_RESTORE \
+" ld 2, 24(1)\n"	/* reload the r2 value saved at 24(r1) */
+#define PPC64_TOC \
+" 99: .quad .TOC.@tocbase\n"
+#else	/* otherwise the three TOC macros expand to empty strings */
+#define PPC64_TOC_SAVE_AND_UPDATE ""
+#define PPC64_TOC_RESTORE ""
+#define PPC64_TOC ""
+#endif
+
+#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtlr 0\n"	/* loaded value goes to LR; paired with "blr" below */
+#define PPC_FTRACE_RET \
+" blr\n"
+#else	/* !CONFIG_PPC_FTRACE_OUT_OF_LINE */
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtctr 0\n"	/* loaded value goes to CTR; paired with "mtlr 0; bctr" below */
+#define PPC_FTRACE_RET \
+" mtlr 0\n" \
+" bctr\n"
+#endif	/* CONFIG_PPC_FTRACE_OUT_OF_LINE */
+
+asm (	/* powerpc: my_tramp1/my_tramp2 push two stack frames, call their C handler, then unwind with the macros above */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"	/* first frame: store r0 at the LR slot, then push STACK_FRAME_MIN_SIZE */
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mflr 0\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"	/* second frame: store LR, then push STACK_FRAME_SIZE */
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+ PPC64_TOC_SAVE_AND_UPDATE
+" bl my_direct_func1\n"
+ PPC64_TOC_RESTORE
+" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n"	/* pop the second frame */
+ PPC_FTRACE_RESTORE_LR
+" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"	/* pop the first frame */
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_FTRACE_RET
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:\n"	/* same sequence as my_tramp1, calling my_direct_func2 */
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mflr 0\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+ PPC64_TOC_SAVE_AND_UPDATE
+" bl my_direct_func2\n"
+ PPC64_TOC_RESTORE
+" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n"
+ PPC_FTRACE_RESTORE_LR
+" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_FTRACE_RET
+ PPC64_TOC	/* the single "99:" literal that both trampolines reference as 99f */
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_PPC */
+
+static struct ftrace_ops direct;	/* the ops registered with register_ftrace_direct() at init */
+
+static unsigned long my_tramp = (unsigned long)my_tramp1;	/* trampoline currently in use; updated by simple_thread() */
+static unsigned long tramps[2] = {	/* the two trampolines simple_thread() alternates between */
+ (unsigned long)my_tramp1,
+ (unsigned long)my_tramp2,
+};
+
+/*
+ * Kthread body: until asked to stop, sleep for up to 2 * HZ jiffies and then
+ * switch the direct call of "direct" to the other entry of tramps[].
+ *
+ * my_tramp is updated only after a successful switch.  The first failure
+ * triggers a one-time warning, after which the thread stops switching and
+ * merely keeps sleeping until it is stopped.  Always returns 0.
+ */
+static int simple_thread(void *arg)
+{
+	static int idx;
+	int err = 0;
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(2 * HZ);
+
+		/* A previous switch failed: do not try again. */
+		if (err)
+			continue;
+
+		idx ^= 1;
+		err = modify_ftrace_direct(&direct, tramps[idx]);
+		if (WARN_ON_ONCE(err))
+			continue;
+
+		my_tramp = tramps[idx];
+	}
+
+	return 0;
+}
+
+static struct task_struct *simple_tsk;
+
+/*
+ * Module init: attach "direct" to my_ip, register my_tramp as its direct
+ * call, and start the thread that keeps switching trampolines.
+ *
+ * Returns 0 on success, the register_ftrace_direct() error if registration
+ * fails, or the kthread_run() error if the thread cannot be created.  In the
+ * last case the direct call is unregistered again, so a failed load leaves
+ * nothing attached and simple_tsk never holds an error pointer.
+ */
+static int __init ftrace_direct_init(void)
+{
+	int ret;
+
+	ftrace_set_filter_ip(&direct, (unsigned long) my_ip, 0, 0);
+	ret = register_ftrace_direct(&direct, my_tramp);
+	if (ret)
+		return ret;
+
+	simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn");
+	if (IS_ERR(simple_tsk)) {
+		/* kthread_run() reports failure through an ERR_PTR value. */
+		ret = PTR_ERR(simple_tsk);
+		simple_tsk = NULL;
+		unregister_ftrace_direct(&direct, my_tramp, true);
+	}
+	return ret;
+}
+
+static void __exit ftrace_direct_exit(void)	/* module exit: stop the switcher thread, then detach the direct call */
+{
+ kthread_stop(simple_tsk);	/* first, because simple_thread() is what updates my_tramp */
+ unregister_ftrace_direct(&direct, my_tramp, true);	/* my_tramp no longer changes here */
+}
+
+module_init(ftrace_direct_init);
+module_exit(ftrace_direct_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()");
+MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c
new file mode 100644
index 000000000000..8f7986d698d8
--- /dev/null
+++ b/samples/ftrace/ftrace-direct-multi-modify.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/ftrace.h>
+#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
+#include <asm/asm-offsets.h>
+#endif
+
+extern void my_direct_func1(unsigned long ip);	/* non-static: the asm trampolines below call these two by name */
+extern void my_direct_func2(unsigned long ip);
+
+void my_direct_func1(unsigned long ip)	/* C handler reached through my_tramp1; logs the value it was given */
+{
+ trace_printk("my direct func1 ip %lx\n", ip);
+}
+
+void my_direct_func2(unsigned long ip)	/* C handler reached through my_tramp2; logs the value it was given */
+{
+ trace_printk("my direct func2 ip %lx\n", ip);
+}
+
+extern void my_tramp1(void *);	/* defined in the per-architecture asm blocks below */
+extern void my_tramp2(void *);
+
+#ifdef CONFIG_RISCV
+#include <asm/asm.h>
+
+asm (	/* RISC-V: my_tramp1/my_tramp2 pass t0 to their C handler with a0, t0 and ra preserved, then jump to t0 */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:\n"
+" addi sp,sp,-3*"SZREG"\n"	/* reserve three SZREG-sized stack slots */
+" "REG_S" a0,0*"SZREG"(sp)\n"	/* save a0, which is overwritten below */
+" "REG_S" t0,1*"SZREG"(sp)\n"
+" "REG_S" ra,2*"SZREG"(sp)\n"
+" mv a0,t0\n"	/* hand the value in t0 to the handler as its ip argument */
+" call my_direct_func1\n"
+" "REG_L" a0,0*"SZREG"(sp)\n"	/* reload a0, t0 and ra from the same slots */
+" "REG_L" t0,1*"SZREG"(sp)\n"
+" "REG_L" ra,2*"SZREG"(sp)\n"
+" addi sp,sp,3*"SZREG"\n"
+" jr t0\n"	/* leave through t0, not ra */
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:\n"	/* same sequence as my_tramp1, calling my_direct_func2 */
+" addi sp,sp,-3*"SZREG"\n"
+" "REG_S" a0,0*"SZREG"(sp)\n"
+" "REG_S" t0,1*"SZREG"(sp)\n"
+" "REG_S" ra,2*"SZREG"(sp)\n"
+" mv a0,t0\n"
+" call my_direct_func2\n"
+" "REG_L" a0,0*"SZREG"(sp)\n"
+" "REG_L" t0,1*"SZREG"(sp)\n"
+" "REG_L" ra,2*"SZREG"(sp)\n"
+" addi sp,sp,3*"SZREG"\n"
+" jr t0\n"
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_RISCV */
+
+#ifdef CONFIG_X86_64
+
+#include <asm/ibt.h>
+#include <asm/nospec-branch.h>
+
+asm (	/* x86-64: my_tramp1/my_tramp2 pass their own return address to the C handler, preserving %rdi */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:"
+ ASM_ENDBR	/* NOTE(review): ASM_ENDBR, CALL_DEPTH_ACCOUNT and ASM_RET presumably come from the <asm/ibt.h> and <asm/nospec-branch.h> includes above */
+" pushq %rbp\n"	/* frame setup; undone by "leave" */
+" movq %rsp, %rbp\n"
+ CALL_DEPTH_ACCOUNT
+" pushq %rdi\n"	/* save %rdi, which is overwritten below */
+" movq 8(%rbp), %rdi\n"	/* argument = the word just above the saved %rbp, i.e. this trampoline's return address */
+" call my_direct_func1\n"
+" popq %rdi\n"
+" leave\n"
+ ASM_RET
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:"	/* same sequence as my_tramp1, calling my_direct_func2 */
+ ASM_ENDBR
+" pushq %rbp\n"
+" movq %rsp, %rbp\n"
+ CALL_DEPTH_ACCOUNT
+" pushq %rdi\n"
+" movq 8(%rbp), %rdi\n"
+" call my_direct_func2\n"
+" popq %rdi\n"
+" leave\n"
+ ASM_RET
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_S390
+
+asm (	/* s390: my_tramp1/my_tramp2 pass r0 to their C handler with r0-r5 and r14 preserved, then branch to the address in r0 */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:"
+" lgr %r1,%r15\n"	/* keep the incoming stack pointer in r1 */
+" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"	/* save r0-r5 */
+" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"	/* save r14 */
+" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"	/* open a new stack frame */
+" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"	/* store the old stack pointer at the back-chain offset */
+" lgr %r2,%r0\n"	/* hand the value in r0 to the handler as its ip argument */
+" brasl %r14,my_direct_func1\n"
+" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"	/* drop the frame again */
+" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"	/* restore r0-r5 and r14 */
+" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" lgr %r1,%r0\n"	/* continue at the address held in r0 */
+" br %r1\n"
+" .size my_tramp1, .-my_tramp1\n"
+"\n"
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:"	/* same sequence as my_tramp1, calling my_direct_func2 */
+" lgr %r1,%r15\n"
+" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+" lgr %r2,%r0\n"
+" brasl %r14,my_direct_func2\n"
+" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" lgr %r1,%r0\n"
+" br %r1\n"
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_S390 */
+
+#ifdef CONFIG_ARM64
+
+asm (	/* arm64: my_tramp1/my_tramp2 pass x30 to their C handler, then return to the address that arrived in x30 */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:"
+" hint 34\n" // bti c
+" sub sp, sp, #32\n"
+" stp x9, x30, [sp]\n"	/* save x9 then x30 */
+" str x0, [sp, #16]\n"	/* save x0, which is overwritten below */
+" mov x0, x30\n"	/* hand the value in x30 to the handler as its ip argument */
+" bl my_direct_func1\n"
+" ldp x30, x9, [sp]\n"	/* reload swapped on purpose: x30 gets the saved x9, x9 gets the saved x30 */
+" ldr x0, [sp, #16]\n"
+" add sp, sp, #32\n"
+" ret x9\n"	/* branch to the value that was in x30 on entry */
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:"	/* same sequence as my_tramp1, calling my_direct_func2 */
+" hint 34\n" // bti c
+" sub sp, sp, #32\n"
+" stp x9, x30, [sp]\n"
+" str x0, [sp, #16]\n"
+" mov x0, x30\n"
+" bl my_direct_func2\n"
+" ldp x30, x9, [sp]\n"
+" ldr x0, [sp, #16]\n"
+" add sp, sp, #32\n"
+" ret x9\n"
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_ARM64 */
+
+#ifdef CONFIG_LOONGARCH
+#include <asm/asm.h>
+
+asm (	/* LoongArch: my_tramp1/my_tramp2 pass $t0 to their C handler with $a0, $t0 and $ra preserved, then jump to $t0 */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:\n"
+" addi.d $sp, $sp, -32\n"	/* reserve 32 bytes of stack */
+" st.d $a0, $sp, 0\n"	/* save $a0, which is overwritten below */
+" st.d $t0, $sp, 8\n"
+" st.d $ra, $sp, 16\n"
+" move $a0, $t0\n"	/* hand the value in $t0 to the handler as its ip argument */
+" bl my_direct_func1\n"
+" ld.d $a0, $sp, 0\n"	/* reload all three from the same slots */
+" ld.d $t0, $sp, 8\n"
+" ld.d $ra, $sp, 16\n"
+" addi.d $sp, $sp, 32\n"
+" jr $t0\n"	/* leave through $t0, not $ra */
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:\n"	/* same sequence as my_tramp1, calling my_direct_func2 */
+" addi.d $sp, $sp, -32\n"
+" st.d $a0, $sp, 0\n"
+" st.d $t0, $sp, 8\n"
+" st.d $ra, $sp, 16\n"
+" move $a0, $t0\n"
+" bl my_direct_func2\n"
+" ld.d $a0, $sp, 0\n"
+" ld.d $t0, $sp, 8\n"
+" ld.d $ra, $sp, 16\n"
+" addi.d $sp, $sp, 32\n"
+" jr $t0\n"
+" .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_LOONGARCH */
+
+#ifdef CONFIG_PPC
+#include <asm/ppc_asm.h>
+
+#ifdef CONFIG_PPC64
+#define STACK_FRAME_SIZE 48
+#else
+#define STACK_FRAME_SIZE 24
+#endif
+
+#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL)
+#define PPC64_TOC_SAVE_AND_UPDATE \
+" std 2, 24(1)\n" \
+" bcl 20, 31, 1f\n" \
+" 1: mflr 12\n" \
+" ld 2, (99f - 1b)(12)\n"	/* r2 = the .quad stored at label 99, addressed relative to label 1 */
+#define PPC64_TOC_RESTORE \
+" ld 2, 24(1)\n"	/* reload the r2 value saved at 24(r1) */
+#define PPC64_TOC \
+" 99: .quad .TOC.@tocbase\n"
+#else	/* otherwise the three TOC macros expand to empty strings */
+#define PPC64_TOC_SAVE_AND_UPDATE ""
+#define PPC64_TOC_RESTORE ""
+#define PPC64_TOC ""
+#endif
+
+#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtlr 0\n"	/* loaded value goes to LR; paired with "blr" below */
+#define PPC_FTRACE_RET \
+" blr\n"
+#define PPC_FTRACE_RECOVER_IP /* r3 += 4 + the low 26 bits, sign-extended, of the word at 4(r3) */ \
+" lwz 8, 4(3)\n" \
+" li 9, 6\n" \
+" slw 8, 8, 9\n" \
+" sraw 8, 8, 9\n" \
+" add 3, 3, 8\n" \
+" addi 3, 3, 4\n"
+#else	/* !CONFIG_PPC_FTRACE_OUT_OF_LINE */
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtctr 0\n"	/* loaded value goes to CTR; paired with "mtlr 0; bctr" below */
+#define PPC_FTRACE_RET \
+" mtlr 0\n" \
+" bctr\n"
+#define PPC_FTRACE_RECOVER_IP ""
+#endif	/* CONFIG_PPC_FTRACE_OUT_OF_LINE */
+
+asm (	/* powerpc: my_tramp1/my_tramp2 push two stack frames, pass the entry-time LR value to their C handler, then unwind */
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp1, @function\n"
+" .globl my_tramp1\n"
+" my_tramp1:\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"	/* first frame: store r0 at the LR slot, then push STACK_FRAME_MIN_SIZE */
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mflr 0\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"	/* second frame: store LR, then push STACK_FRAME_SIZE */
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+ PPC64_TOC_SAVE_AND_UPDATE
+ PPC_STL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"	/* save r3, which is overwritten below */
+" mr 3, 0\n"	/* argument = r0, which holds the LR value read by mflr above */
+ PPC_FTRACE_RECOVER_IP
+" bl my_direct_func1\n"
+ PPC_LL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"	/* restore r3 */
+ PPC64_TOC_RESTORE
+" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n"	/* pop the second frame */
+ PPC_FTRACE_RESTORE_LR
+" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"	/* pop the first frame */
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_FTRACE_RET
+" .size my_tramp1, .-my_tramp1\n"
+
+" .type my_tramp2, @function\n"
+" .globl my_tramp2\n"
+" my_tramp2:\n"	/* same sequence as my_tramp1, calling my_direct_func2 */
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mflr 0\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+ PPC64_TOC_SAVE_AND_UPDATE
+ PPC_STL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mr 3, 0\n"
+ PPC_FTRACE_RECOVER_IP
+" bl my_direct_func2\n"
+ PPC_LL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+ PPC64_TOC_RESTORE
+" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n"
+ PPC_FTRACE_RESTORE_LR
+" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_FTRACE_RET
+ PPC64_TOC	/* the single "99:" literal that both trampolines reference as 99f */
+ " .size my_tramp2, .-my_tramp2\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_PPC */
+
+static unsigned long my_tramp = (unsigned long)my_tramp1;	/* trampoline currently in use; updated by simple_thread() */
+static unsigned long tramps[2] = {	/* the two trampolines simple_thread() alternates between */
+ (unsigned long)my_tramp1,
+ (unsigned long)my_tramp2,
+};
+
+static struct ftrace_ops direct;	/* the ops registered with register_ftrace_direct() at init */
+
+/*
+ * Kthread body: until asked to stop, sleep for up to 2 * HZ jiffies and then
+ * switch the direct call of "direct" to the other entry of tramps[].
+ *
+ * my_tramp is updated only after a successful switch.  The first failure
+ * triggers a one-time warning, after which the thread stops switching and
+ * merely keeps sleeping until it is stopped.  Always returns 0.
+ */
+static int simple_thread(void *arg)
+{
+	static int idx;
+	int err = 0;
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(2 * HZ);
+
+		/* A previous switch failed: do not try again. */
+		if (err)
+			continue;
+
+		idx ^= 1;
+		err = modify_ftrace_direct(&direct, tramps[idx]);
+		if (WARN_ON_ONCE(err))
+			continue;
+
+		my_tramp = tramps[idx];
+	}
+
+	return 0;
+}
+
+static struct task_struct *simple_tsk;
+
+/*
+ * Module init: attach "direct" to wake_up_process() and schedule(), register
+ * my_tramp as its direct call, and start the thread that keeps switching
+ * trampolines.
+ *
+ * Returns 0 on success, the register_ftrace_direct() error if registration
+ * fails, or the kthread_run() error if the thread cannot be created.  In the
+ * last case the direct call is unregistered again, so a failed load leaves
+ * nothing attached and simple_tsk never holds an error pointer.
+ */
+static int __init ftrace_direct_multi_init(void)
+{
+	int ret;
+
+	ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0);
+	ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0);
+
+	ret = register_ftrace_direct(&direct, my_tramp);
+	if (ret)
+		return ret;
+
+	simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn");
+	if (IS_ERR(simple_tsk)) {
+		/* kthread_run() reports failure through an ERR_PTR value. */
+		ret = PTR_ERR(simple_tsk);
+		simple_tsk = NULL;
+		unregister_ftrace_direct(&direct, my_tramp, true);
+	}
+	return ret;
+}
+
+static void __exit ftrace_direct_multi_exit(void)	/* module exit: stop the switcher thread, then detach the direct call */
+{
+ kthread_stop(simple_tsk);	/* first, because simple_thread() is what updates my_tramp */
+ unregister_ftrace_direct(&direct, my_tramp, true);	/* my_tramp no longer changes here */
+}
+
+module_init(ftrace_direct_multi_init);
+module_exit(ftrace_direct_multi_exit);
+
+MODULE_AUTHOR("Jiri Olsa");
+MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()");
+MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c
new file mode 100644
index 000000000000..db326c81a27d
--- /dev/null
+++ b/samples/ftrace/ftrace-direct-multi.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+
+#include <linux/mm.h> /* for handle_mm_fault() */
+#include <linux/ftrace.h>
+#include <linux/sched/stat.h>
+#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
+#include <asm/asm-offsets.h>
+#endif
+
+extern void my_direct_func(unsigned long ip);
+
+/*
+ * C handler called by the my_tramp trampoline: logs the value it receives in
+ * @ip through trace_printk().
+ */
+void my_direct_func(unsigned long ip)
+{
+ trace_printk("ip %lx\n", ip);
+}
+
+extern void my_tramp(void *);
+
+#ifdef CONFIG_RISCV
+#include <asm/asm.h>
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+" addi sp,sp,-3*"SZREG"\n"
+" "REG_S" a0,0*"SZREG"(sp)\n"
+" "REG_S" t0,1*"SZREG"(sp)\n"
+" "REG_S" ra,2*"SZREG"(sp)\n"
+" mv a0,t0\n"
+" call my_direct_func\n"
+" "REG_L" a0,0*"SZREG"(sp)\n"
+" "REG_L" t0,1*"SZREG"(sp)\n"
+" "REG_L" ra,2*"SZREG"(sp)\n"
+" addi sp,sp,3*"SZREG"\n"
+" jr t0\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_RISCV */
+
+#ifdef CONFIG_X86_64
+
+#include <asm/ibt.h>
+#include <asm/nospec-branch.h>
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+ ASM_ENDBR
+" pushq %rbp\n"
+" movq %rsp, %rbp\n"
+ CALL_DEPTH_ACCOUNT
+" pushq %rdi\n"
+" movq 8(%rbp), %rdi\n"
+" call my_direct_func\n"
+" popq %rdi\n"
+" leave\n"
+ ASM_RET
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_S390
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+" lgr %r1,%r15\n"
+" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+" lgr %r2,%r0\n"
+" brasl %r14,my_direct_func\n"
+" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" lgr %r1,%r0\n"
+" br %r1\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_S390 */
+
+#ifdef CONFIG_ARM64
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+" hint 34\n" // bti c
+" sub sp, sp, #32\n"
+" stp x9, x30, [sp]\n"
+" str x0, [sp, #16]\n"
+" mov x0, x30\n"
+" bl my_direct_func\n"
+" ldp x30, x9, [sp]\n"
+" ldr x0, [sp, #16]\n"
+" add sp, sp, #32\n"
+" ret x9\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_ARM64 */
+
+#ifdef CONFIG_LOONGARCH
+
+#include <asm/asm.h>
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+" addi.d $sp, $sp, -32\n"
+" st.d $a0, $sp, 0\n"
+" st.d $t0, $sp, 8\n"
+" st.d $ra, $sp, 16\n"
+" move $a0, $t0\n"
+" bl my_direct_func\n"
+" ld.d $a0, $sp, 0\n"
+" ld.d $t0, $sp, 8\n"
+" ld.d $ra, $sp, 16\n"
+" addi.d $sp, $sp, 32\n"
+" jr $t0\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_LOONGARCH */
+
+#ifdef CONFIG_PPC
+#include <asm/ppc_asm.h>
+
+#ifdef CONFIG_PPC64
+#define STACK_FRAME_SIZE 48
+#else
+#define STACK_FRAME_SIZE 24
+#endif
+
+#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL)
+#define PPC64_TOC_SAVE_AND_UPDATE \
+" std 2, 24(1)\n" \
+" bcl 20, 31, 1f\n" \
+" 1: mflr 12\n" \
+" ld 2, (99f - 1b)(12)\n"
+#define PPC64_TOC_RESTORE \
+" ld 2, 24(1)\n"
+#define PPC64_TOC \
+" 99: .quad .TOC.@tocbase\n"
+#else
+#define PPC64_TOC_SAVE_AND_UPDATE ""
+#define PPC64_TOC_RESTORE ""
+#define PPC64_TOC ""
+#endif
+
+#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtlr 0\n"
+#define PPC_FTRACE_RET \
+" blr\n"
+#define PPC_FTRACE_RECOVER_IP \
+" lwz 8, 4(3)\n" \
+" li 9, 6\n" \
+" slw 8, 8, 9\n" \
+" sraw 8, 8, 9\n" \
+" add 3, 3, 8\n" \
+" addi 3, 3, 4\n"
+#else
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtctr 0\n"
+#define PPC_FTRACE_RET \
+" mtlr 0\n" \
+" bctr\n"
+#define PPC_FTRACE_RECOVER_IP ""
+#endif
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mflr 0\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+ PPC64_TOC_SAVE_AND_UPDATE
+ PPC_STL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mr 3, 0\n"
+ PPC_FTRACE_RECOVER_IP
+" bl my_direct_func\n"
+ PPC_LL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+ PPC64_TOC_RESTORE
+" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n"
+ PPC_FTRACE_RESTORE_LR
+" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_FTRACE_RET
+ PPC64_TOC
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_PPC */
+
+static struct ftrace_ops direct;
+
+/*
+ * Attach the my_tramp direct trampoline to both wake_up_process() and
+ * schedule().
+ *
+ * Returns 0 on success or a negative errno. A failure to add either function
+ * to the filter is reported instead of silently registering a partial (or
+ * empty) filter, and the filter is released on every error path.
+ */
+static int __init ftrace_direct_multi_init(void)
+{
+	int ret;
+
+	ret = ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0);
+	if (ret)
+		goto err_free_filter;
+
+	ret = ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0);
+	if (ret)
+		goto err_free_filter;
+
+	ret = register_ftrace_direct(&direct, (unsigned long) my_tramp);
+	if (ret)
+		goto err_free_filter;
+
+	return 0;
+
+err_free_filter:
+	ftrace_free_filter(&direct);
+	return ret;
+}
+
+/*
+ * Module teardown: detach my_tramp from @direct. The last argument (true) is
+ * the same value every sample in this directory passes on unload.
+ */
+static void __exit ftrace_direct_multi_exit(void)
+{
+ unregister_ftrace_direct(&direct, (unsigned long) my_tramp, true);
+}
+
+module_init(ftrace_direct_multi_init);
+module_exit(ftrace_direct_multi_exit);
+
+MODULE_AUTHOR("Jiri Olsa");
+MODULE_DESCRIPTION("Example use case of using register_ftrace_direct_multi()");
+MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c
new file mode 100644
index 000000000000..3d0fa260332d
--- /dev/null
+++ b/samples/ftrace/ftrace-direct-too.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+
+#include <linux/mm.h> /* for handle_mm_fault() */
+#include <linux/ftrace.h>
+#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
+#include <asm/asm-offsets.h>
+#endif
+
+extern void my_direct_func(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, struct pt_regs *regs);
+
+/*
+ * C handler called by the my_tramp trampoline, which this module attaches to
+ * handle_mm_fault(); logs the four arguments it receives via trace_printk().
+ */
+void my_direct_func(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, struct pt_regs *regs)
+{
+ trace_printk("handle mm fault vma=%p address=%lx flags=%x regs=%p\n",
+ vma, address, flags, regs);
+}
+
+extern void my_tramp(void *);
+
+#ifdef CONFIG_RISCV
+#include <asm/asm.h>
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+" addi sp,sp,-5*"SZREG"\n"
+" "REG_S" a0,0*"SZREG"(sp)\n"
+" "REG_S" a1,1*"SZREG"(sp)\n"
+" "REG_S" a2,2*"SZREG"(sp)\n"
+" "REG_S" t0,3*"SZREG"(sp)\n"
+" "REG_S" ra,4*"SZREG"(sp)\n"
+" call my_direct_func\n"
+" "REG_L" a0,0*"SZREG"(sp)\n"
+" "REG_L" a1,1*"SZREG"(sp)\n"
+" "REG_L" a2,2*"SZREG"(sp)\n"
+" "REG_L" t0,3*"SZREG"(sp)\n"
+" "REG_L" ra,4*"SZREG"(sp)\n"
+" addi sp,sp,5*"SZREG"\n"
+" jr t0\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_RISCV */
+
+#ifdef CONFIG_X86_64
+
+#include <asm/ibt.h>
+#include <asm/nospec-branch.h>
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+ ASM_ENDBR
+" pushq %rbp\n"
+" movq %rsp, %rbp\n"
+ CALL_DEPTH_ACCOUNT
+" pushq %rdi\n"
+" pushq %rsi\n"
+" pushq %rdx\n"
+" pushq %rcx\n"
+" call my_direct_func\n"
+" popq %rcx\n"
+" popq %rdx\n"
+" popq %rsi\n"
+" popq %rdi\n"
+" leave\n"
+ ASM_RET
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_S390
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+" lgr %r1,%r15\n"
+" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+" brasl %r14,my_direct_func\n"
+" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" lgr %r1,%r0\n"
+" br %r1\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_S390 */
+
+#ifdef CONFIG_ARM64
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+" hint 34\n" // bti c
+" sub sp, sp, #48\n"
+" stp x9, x30, [sp]\n"
+" stp x0, x1, [sp, #16]\n"
+" stp x2, x3, [sp, #32]\n"
+" bl my_direct_func\n"
+" ldp x30, x9, [sp]\n"
+" ldp x0, x1, [sp, #16]\n"
+" ldp x2, x3, [sp, #32]\n"
+" add sp, sp, #48\n"
+" ret x9\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_ARM64 */
+
+#ifdef CONFIG_LOONGARCH
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+" addi.d $sp, $sp, -48\n"
+" st.d $a0, $sp, 0\n"
+" st.d $a1, $sp, 8\n"
+" st.d $a2, $sp, 16\n"
+" st.d $t0, $sp, 24\n"
+" st.d $ra, $sp, 32\n"
+" bl my_direct_func\n"
+" ld.d $a0, $sp, 0\n"
+" ld.d $a1, $sp, 8\n"
+" ld.d $a2, $sp, 16\n"
+" ld.d $t0, $sp, 24\n"
+" ld.d $ra, $sp, 32\n"
+" addi.d $sp, $sp, 48\n"
+" jr $t0\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_LOONGARCH */
+
+#ifdef CONFIG_PPC
+#include <asm/ppc_asm.h>
+
+#ifdef CONFIG_PPC64
+#define STACK_FRAME_SIZE 64
+#define STACK_FRAME_ARG1 32
+#define STACK_FRAME_ARG2 40
+#define STACK_FRAME_ARG3 48
+#define STACK_FRAME_ARG4 56
+#else
+#define STACK_FRAME_SIZE 32
+#define STACK_FRAME_ARG1 16
+#define STACK_FRAME_ARG2 20
+#define STACK_FRAME_ARG3 24
+#define STACK_FRAME_ARG4 28
+#endif
+
+#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL)
+#define PPC64_TOC_SAVE_AND_UPDATE \
+" std 2, 24(1)\n" \
+" bcl 20, 31, 1f\n" \
+" 1: mflr 12\n" \
+" ld 2, (99f - 1b)(12)\n"
+#define PPC64_TOC_RESTORE \
+" ld 2, 24(1)\n"
+#define PPC64_TOC \
+" 99: .quad .TOC.@tocbase\n"
+#else
+#define PPC64_TOC_SAVE_AND_UPDATE ""
+#define PPC64_TOC_RESTORE ""
+#define PPC64_TOC ""
+#endif
+
+#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtlr 0\n"
+#define PPC_FTRACE_RET \
+" blr\n"
+#else
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtctr 0\n"
+#define PPC_FTRACE_RET \
+" mtlr 0\n" \
+" bctr\n"
+#endif
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mflr 0\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+ PPC64_TOC_SAVE_AND_UPDATE
+ PPC_STL" 3, "__stringify(STACK_FRAME_ARG1)"(1)\n"
+ PPC_STL" 4, "__stringify(STACK_FRAME_ARG2)"(1)\n"
+ PPC_STL" 5, "__stringify(STACK_FRAME_ARG3)"(1)\n"
+ PPC_STL" 6, "__stringify(STACK_FRAME_ARG4)"(1)\n"
+" bl my_direct_func\n"
+ PPC_LL" 6, "__stringify(STACK_FRAME_ARG4)"(1)\n"
+ PPC_LL" 5, "__stringify(STACK_FRAME_ARG3)"(1)\n"
+ PPC_LL" 4, "__stringify(STACK_FRAME_ARG2)"(1)\n"
+ PPC_LL" 3, "__stringify(STACK_FRAME_ARG1)"(1)\n"
+ PPC64_TOC_RESTORE
+" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n"
+ PPC_FTRACE_RESTORE_LR
+" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_FTRACE_RET
+ PPC64_TOC
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_PPC */
+
+static struct ftrace_ops direct;
+
+/*
+ * Attach the my_tramp direct trampoline to handle_mm_fault().
+ *
+ * Returns 0 on success or a negative errno. The error from setting the
+ * filter is propagated as-is, and the filter is released again if the
+ * registration itself fails.
+ */
+static int __init ftrace_direct_init(void)
+{
+	int ret;
+
+	ret = ftrace_set_filter_ip(&direct, (unsigned long) handle_mm_fault, 0, 0);
+	if (ret)
+		return ret;
+
+	ret = register_ftrace_direct(&direct, (unsigned long) my_tramp);
+	if (ret)
+		ftrace_free_filter(&direct);
+
+	return ret;
+}
+
+/* Module teardown: detach the my_tramp trampoline from @direct. */
+static void __exit ftrace_direct_exit(void)
+{
+ unregister_ftrace_direct(&direct, (unsigned long)my_tramp, true);
+}
+
+module_init(ftrace_direct_init);
+module_exit(ftrace_direct_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("Another example use case of using register_ftrace_direct()");
+MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c
new file mode 100644
index 000000000000..956834b0d19a
--- /dev/null
+++ b/samples/ftrace/ftrace-direct.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+
+#include <linux/sched.h> /* for wake_up_process() */
+#include <linux/ftrace.h>
+#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
+#include <asm/asm-offsets.h>
+#endif
+
+extern void my_direct_func(struct task_struct *p);
+
+/*
+ * C handler called by the my_tramp trampoline, which this module attaches to
+ * wake_up_process(); logs the comm and pid of @p via trace_printk().
+ */
+void my_direct_func(struct task_struct *p)
+{
+ trace_printk("waking up %s-%d\n", p->comm, p->pid);
+}
+
+extern void my_tramp(void *);
+
+#ifdef CONFIG_RISCV
+#include <asm/asm.h>
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+" addi sp,sp,-3*"SZREG"\n"
+" "REG_S" a0,0*"SZREG"(sp)\n"
+" "REG_S" t0,1*"SZREG"(sp)\n"
+" "REG_S" ra,2*"SZREG"(sp)\n"
+" call my_direct_func\n"
+" "REG_L" a0,0*"SZREG"(sp)\n"
+" "REG_L" t0,1*"SZREG"(sp)\n"
+" "REG_L" ra,2*"SZREG"(sp)\n"
+" addi sp,sp,3*"SZREG"\n"
+" jr t0\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_RISCV */
+
+#ifdef CONFIG_X86_64
+
+#include <asm/ibt.h>
+#include <asm/nospec-branch.h>
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+ ASM_ENDBR
+" pushq %rbp\n"
+" movq %rsp, %rbp\n"
+ CALL_DEPTH_ACCOUNT
+" pushq %rdi\n"
+" call my_direct_func\n"
+" popq %rdi\n"
+" leave\n"
+ ASM_RET
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_S390
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+" lgr %r1,%r15\n"
+" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+" brasl %r14,my_direct_func\n"
+" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+" lgr %r1,%r0\n"
+" br %r1\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_S390 */
+
+#ifdef CONFIG_ARM64
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:"
+" hint 34\n" // bti c
+" sub sp, sp, #32\n"
+" stp x9, x30, [sp]\n"
+" str x0, [sp, #16]\n"
+" bl my_direct_func\n"
+" ldp x30, x9, [sp]\n"
+" ldr x0, [sp, #16]\n"
+" add sp, sp, #32\n"
+" ret x9\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_ARM64 */
+
+#ifdef CONFIG_LOONGARCH
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+" addi.d $sp, $sp, -32\n"
+" st.d $a0, $sp, 0\n"
+" st.d $t0, $sp, 8\n"
+" st.d $ra, $sp, 16\n"
+" bl my_direct_func\n"
+" ld.d $a0, $sp, 0\n"
+" ld.d $t0, $sp, 8\n"
+" ld.d $ra, $sp, 16\n"
+" addi.d $sp, $sp, 32\n"
+" jr $t0\n"
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_LOONGARCH */
+
+#ifdef CONFIG_PPC
+#include <asm/ppc_asm.h>
+
+#ifdef CONFIG_PPC64
+#define STACK_FRAME_SIZE 48
+#else
+#define STACK_FRAME_SIZE 24
+#endif
+
+#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL)
+#define PPC64_TOC_SAVE_AND_UPDATE \
+" std 2, 24(1)\n" \
+" bcl 20, 31, 1f\n" \
+" 1: mflr 12\n" \
+" ld 2, (99f - 1b)(12)\n"
+#define PPC64_TOC_RESTORE \
+" ld 2, 24(1)\n"
+#define PPC64_TOC \
+" 99: .quad .TOC.@tocbase\n"
+#else
+#define PPC64_TOC_SAVE_AND_UPDATE ""
+#define PPC64_TOC_RESTORE ""
+#define PPC64_TOC ""
+#endif
+
+#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtlr 0\n"
+#define PPC_FTRACE_RET \
+" blr\n"
+#else
+#define PPC_FTRACE_RESTORE_LR \
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \
+" mtctr 0\n"
+#define PPC_FTRACE_RET \
+" mtlr 0\n" \
+" bctr\n"
+#endif
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" .type my_tramp, @function\n"
+" .globl my_tramp\n"
+" my_tramp:\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" mflr 0\n"
+ PPC_STL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+ PPC64_TOC_SAVE_AND_UPDATE
+ PPC_STL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+" bl my_direct_func\n"
+ PPC_LL" 3, "__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+ PPC64_TOC_RESTORE
+" addi 1, 1, "__stringify(STACK_FRAME_SIZE)"\n"
+ PPC_FTRACE_RESTORE_LR
+" addi 1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"
+ PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+ PPC_FTRACE_RET
+ PPC64_TOC
+" .size my_tramp, .-my_tramp\n"
+" .popsection\n"
+);
+
+#endif /* CONFIG_PPC */
+
+static struct ftrace_ops direct;
+
+/*
+ * Attach the my_tramp direct trampoline to wake_up_process().
+ *
+ * Returns 0 on success or a negative errno. The error from setting the
+ * filter is propagated as-is, and the filter is released again if the
+ * registration itself fails.
+ */
+static int __init ftrace_direct_init(void)
+{
+	int ret;
+
+	ret = ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0);
+	if (ret)
+		return ret;
+
+	ret = register_ftrace_direct(&direct, (unsigned long) my_tramp);
+	if (ret)
+		ftrace_free_filter(&direct);
+
+	return ret;
+}
+
+/* Module teardown: detach the my_tramp trampoline from @direct. */
+static void __exit ftrace_direct_exit(void)
+{
+ unregister_ftrace_direct(&direct, (unsigned long)my_tramp, true);
+}
+
+module_init(ftrace_direct_init);
+module_exit(ftrace_direct_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("Example use case of using register_ftrace_direct()");
+MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-ops.c b/samples/ftrace/ftrace-ops.c
new file mode 100644
index 000000000000..68d6685c80bd
--- /dev/null
+++ b/samples/ftrace/ftrace-ops.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/ftrace.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+
+#include <asm/barrier.h>
+
+/*
+ * Arbitrary large value chosen to be sufficiently large to minimize noise but
+ * sufficiently small to complete quickly.
+ */
+static unsigned int nr_function_calls = 100000;
+module_param(nr_function_calls, uint, 0);
+MODULE_PARM_DESC(nr_function_calls, "How many times to call the relevant tracee");
+
+/*
+ * The number of ops associated with a call site affects whether a tracer can
+ * be called directly or whether it's necessary to go via the list func, which
+ * can be significantly more expensive.
+ */
+static unsigned int nr_ops_relevant = 1;
+module_param(nr_ops_relevant, uint, 0);
+MODULE_PARM_DESC(nr_ops_relevant, "How many ftrace_ops to associate with the relevant tracee");
+
+/*
+ * On architectures where all call sites share the same trampoline, having
+ * tracers enabled for distinct functions can force the use of the list func
+ * and incur overhead for all call sites.
+ */
+static unsigned int nr_ops_irrelevant;
+module_param(nr_ops_irrelevant, uint, 0);
+MODULE_PARM_DESC(nr_ops_irrelevant, "How many ftrace_ops to associate with the irrelevant tracee");
+
+/*
+ * On architectures with DYNAMIC_FTRACE_WITH_REGS, saving the full pt_regs can
+ * be more expensive than only saving the minimal necessary regs.
+ */
+static bool save_regs;
+module_param(save_regs, bool, 0);
+MODULE_PARM_DESC(save_regs, "Register ops with FTRACE_OPS_FL_SAVE_REGS (save all registers in the trampoline)");
+
+/*
+ * When set, FTRACE_OPS_FL_RECURSION is added to the flags of every ops
+ * registered by this module.
+ */
+static bool assist_recursion;
+module_param(assist_recursion, bool, 0);
+MODULE_PARM_DESC(assist_recursion, "Register ops with FTRACE_OPS_FL_RECURSION");
+
+/*
+ * When set, FTRACE_OPS_FL_RCU is added to the flags of every ops registered
+ * by this module.
+ */
+static bool assist_rcu;
+module_param(assist_rcu, bool, 0);
+MODULE_PARM_DESC(assist_rcu, "Register ops with FTRACE_OPS_FL_RCU");
+
+/*
+ * By default, a trivial tracer is used which immediately returns to minimize
+ * overhead. Sometimes a consistency check using a more expensive tracer is
+ * desirable.
+ */
+static bool check_count;
+module_param(check_count, bool, 0);
+MODULE_PARM_DESC(check_count, "Check that tracers are called the expected number of times\n");
+
+/*
+ * Usually it's not interesting to leave the ops registered after the test
+ * runs, but sometimes it can be useful to leave them registered so that they
+ * can be inspected through the tracefs 'enabled_functions' file.
+ */
+static bool persist;
+module_param(persist, bool, 0);
+MODULE_PARM_DESC(persist, "Successfully load module and leave ftrace ops registered after test completes\n");
+
+/*
+ * Marked as noinline to ensure that an out-of-line traceable copy is
+ * generated by the compiler.
+ *
+ * The barrier() ensures the compiler won't elide calls by determining there
+ * are no side-effects.
+ */
+static noinline void tracee_relevant(void)
+{
+ barrier();
+}
+
+/*
+ * Marked as noinline to ensure that an out-of-line traceable copy is
+ * generated by the compiler.
+ *
+ * The barrier() ensures the compiler won't elide calls by determining there
+ * are no side-effects.
+ */
+static noinline void tracee_irrelevant(void)
+{
+ barrier();
+}
+
+/*
+ * An ftrace_ops bundled with a call counter. ops_func_count() gets back to
+ * this wrapper from the embedded ops via container_of() and increments count.
+ */
+struct sample_ops {
+ struct ftrace_ops ops;
+ unsigned int count;
+};
+
+/*
+ * Trivial tracer callback; the initial value of both tracer_relevant and
+ * tracer_irrelevant. All arguments are ignored.
+ */
+static void ops_func_nop(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs)
+{
+ /* do nothing */
+}
+
+/*
+ * Counting tracer callback, selected when check_count is set: increments the
+ * counter of the sample_ops that embeds @op. The increment is a plain ++.
+ */
+static void ops_func_count(unsigned long ip, unsigned long parent_ip,
+			   struct ftrace_ops *op,
+			   struct ftrace_regs *fregs)
+{
+	container_of(op, struct sample_ops, ops)->count++;
+}
+
+static struct sample_ops *ops_relevant;
+static struct sample_ops *ops_irrelevant;
+
+/*
+ * Allocate @nr zeroed sample_ops, point each at @func with @flags, restrict
+ * it to @tracee and register it.
+ *
+ * Returns the array, or NULL (after a one-time warning) when the allocation
+ * fails. Filter and registration failures only trigger a one-time warning
+ * each; the array is still returned.
+ */
+static struct sample_ops *ops_alloc_init(void *tracee, ftrace_func_t func,
+					 unsigned long flags, int nr)
+{
+	struct sample_ops *array = kcalloc(nr, sizeof(*array), GFP_KERNEL);
+	unsigned int idx;
+
+	if (WARN_ON_ONCE(!array))
+		return NULL;
+
+	for (idx = 0; idx < nr; idx++) {
+		struct ftrace_ops *fops = &array[idx].ops;
+
+		fops->func = func;
+		fops->flags = flags;
+		WARN_ON_ONCE(ftrace_set_filter_ip(fops, (unsigned long)tracee, 0, 0));
+		WARN_ON_ONCE(register_ftrace_function(fops));
+	}
+
+	return array;
+}
+
+/*
+ * Undo ops_alloc_init(): unregister each of the @nr entries of @ops, free its
+ * filter, then free the array. A NULL @ops is a no-op.
+ */
+static void ops_destroy(struct sample_ops *ops, int nr)
+{
+	unsigned int idx;
+
+	if (ops == NULL)
+		return;
+
+	for (idx = 0; idx < nr; idx++) {
+		struct ftrace_ops *fops = &ops[idx].ops;
+
+		WARN_ON_ONCE(unregister_ftrace_function(fops));
+		ftrace_free_filter(fops);
+	}
+
+	kfree(ops);
+}
+
+/*
+ * When check_count is set, verify that every one of the @nr entries in @ops
+ * counted exactly @expected_count calls, and warn about each one that did
+ * not. Does nothing if @ops is NULL or check_count is clear.
+ */
+static void ops_check(struct sample_ops *ops, int nr,
+		      unsigned int expected_count)
+{
+	if (!ops || !check_count)
+		return;
+
+	for (unsigned int i = 0; i < nr; i++) {
+		/* Look at entry i, not just the first element of the array. */
+		if (ops[i].count == expected_count)
+			continue;
+		pr_warn("Counter called %u times (expected %u)\n",
+			ops[i].count, expected_count);
+	}
+}
+
+static ftrace_func_t tracer_relevant = ops_func_nop;
+static ftrace_func_t tracer_irrelevant = ops_func_nop;
+
+/*
+ * Run the benchmark: register the requested ops, call tracee_relevant()
+ * nr_function_calls times, report the elapsed time, and optionally verify
+ * the call counters.
+ *
+ * Returns 0 only when "persist" is set; otherwise the ops are destroyed and
+ * -EINVAL is returned on purpose so that the module does not stay loaded.
+ */
+static int __init ftrace_ops_sample_init(void)
+{
+	unsigned long flags = 0;
+	ktime_t start, end;
+	u64 period;
+
+	if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && save_regs) {
+		pr_info("this kernel does not support saving registers\n");
+		save_regs = false;
+	} else if (save_regs) {
+		flags |= FTRACE_OPS_FL_SAVE_REGS;
+	}
+
+	if (assist_recursion)
+		flags |= FTRACE_OPS_FL_RECURSION;
+
+	if (assist_rcu)
+		flags |= FTRACE_OPS_FL_RCU;
+
+	if (check_count) {
+		tracer_relevant = ops_func_count;
+		tracer_irrelevant = ops_func_count;
+	}
+
+	pr_info("registering:\n"
+		" relevant ops: %u\n"
+		" tracee: %ps\n"
+		" tracer: %ps\n"
+		" irrelevant ops: %u\n"
+		" tracee: %ps\n"
+		" tracer: %ps\n"
+		" saving registers: %s\n"
+		" assist recursion: %s\n"
+		" assist RCU: %s\n",
+		nr_ops_relevant, tracee_relevant, tracer_relevant,
+		nr_ops_irrelevant, tracee_irrelevant, tracer_irrelevant,
+		save_regs ? "YES" : "NO",
+		assist_recursion ? "YES" : "NO",
+		assist_rcu ? "YES" : "NO");
+
+	ops_relevant = ops_alloc_init(tracee_relevant, tracer_relevant,
+				      flags, nr_ops_relevant);
+	ops_irrelevant = ops_alloc_init(tracee_irrelevant, tracer_irrelevant,
+					flags, nr_ops_irrelevant);
+
+	start = ktime_get();
+	for (unsigned int i = 0; i < nr_function_calls; i++)
+		tracee_relevant();
+	end = ktime_get();
+
+	ops_check(ops_relevant, nr_ops_relevant, nr_function_calls);
+	ops_check(ops_irrelevant, nr_ops_irrelevant, 0);
+
+	period = ktime_to_ns(ktime_sub(end, start));
+
+	/*
+	 * nr_function_calls is a module parameter and may be 0; report a
+	 * per-call time of 0 in that case rather than dividing by zero.
+	 */
+	pr_info("Attempted %u calls to %ps in %lluns (%lluns / call)\n",
+		nr_function_calls, tracee_relevant,
+		period,
+		nr_function_calls ? div_u64(period, nr_function_calls) : 0);
+
+	if (persist)
+		return 0;
+
+	ops_destroy(ops_relevant, nr_ops_relevant);
+	ops_destroy(ops_irrelevant, nr_ops_irrelevant);
+
+	/*
+	 * The benchmark completed successfully, but there's no reason to keep
+	 * the module around. Return an error so the user doesn't have to
+	 * manually unload the module.
+	 */
+	return -EINVAL;
+}
+module_init(ftrace_ops_sample_init);
+
+/*
+ * Module teardown: destroy both ops arrays. The init function returns 0 only
+ * when "persist" is set, which is the case in which the ops are still
+ * registered at this point.
+ */
+static void __exit ftrace_ops_sample_exit(void)
+{
+ ops_destroy(ops_relevant, nr_ops_relevant);
+ ops_destroy(ops_irrelevant, nr_ops_irrelevant);
+}
+module_exit(ftrace_ops_sample_exit);
+
+MODULE_AUTHOR("Mark Rutland");
+MODULE_DESCRIPTION("Example of using custom ftrace_ops");
+MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c
new file mode 100644
index 000000000000..4147616102f9
--- /dev/null
+++ b/samples/ftrace/sample-trace-array.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/trace.h>
+#include <linux/trace_events.h>
+#include <linux/timer.h>
+#include <linux/err.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+
+/*
+ * Any file that uses trace points, must include the header.
+ * But only one file, must include the header by defining
+ * CREATE_TRACE_POINTS first. This will make the C code that
+ * creates the handles for the trace points.
+ */
+#define CREATE_TRACE_POINTS
+#include "sample-trace-array.h"
+
+struct trace_array *tr;
+static void mytimer_handler(struct timer_list *unused);
+static struct task_struct *simple_tsk;
+
+/*
+ * Body of the trace_work work item, which mytimer_handler() schedules: turns
+ * the "sample_event" event of "sample-subsystem" off in the trace array tr.
+ */
+static void trace_work_fn(struct work_struct *work)
+{
+ /*
+ * Disable tracing for event "sample_event".
+ */
+ trace_array_set_clr_event(tr, "sample-subsystem", "sample_event",
+ false);
+}
+static DECLARE_WORK(trace_work, trace_work_fn);
+
+/*
+ * mytimer: Timer setup to disable tracing for event "sample_event". This
+ * timer is only for the purposes of the sample module to demonstrate access of
+ * Ftrace instances from within kernel.
+ */
+static DEFINE_TIMER(mytimer, mytimer_handler);
+
+/*
+ * Callback of mytimer: does not touch the trace array itself, it only
+ * schedules trace_work, whose handler disables the event.
+ */
+static void mytimer_handler(struct timer_list *unused)
+{
+ schedule_work(&trace_work);
+}
+
+/*
+ * One iteration of the sample thread: sleep interruptibly for up to HZ
+ * jiffies, then write @count to the trace array tr and fire the sample_event
+ * tracepoint with @count and the current jiffies.
+ */
+static void simple_thread_func(int count)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+
+ /*
+ * Printing count value using trace_array_printk() - trace_printk()
+ * equivalent for the instance buffers.
+ */
+ trace_array_printk(tr, _THIS_IP_, "trace_array_printk: count=%d\n",
+ count);
+ /*
+ * Tracepoint for event "sample_event". This will print the
+ * current value of count and current jiffies.
+ */
+ trace_sample_event(count, jiffies);
+}
+
+/*
+ * Kernel thread body: enable "sample_event" in the trace array tr, arm a
+ * timer that disables it again after 5000 ms, and emit one sample per
+ * iteration until asked to stop. Drops the thread's reference on tr before
+ * returning 0.
+ */
+static int simple_thread(void *arg)
+{
+	int count = 0;
+	unsigned long delay = msecs_to_jiffies(5000);
+
+	/*
+	 * Enable tracing for "sample_event".
+	 */
+	trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", true);
+
+	/*
+	 * Arm mytimer so that tracing is disabled after "delay" jiffies
+	 * (5000 ms). mod_timer() also activates a timer that is not pending,
+	 * so no separate add_timer() is needed; calling add_timer() first
+	 * would queue the timer with an expiry that was never set and could
+	 * let the handler run before the real expiry is programmed.
+	 */
+	mod_timer(&mytimer, jiffies + delay);
+
+	while (!kthread_should_stop())
+		simple_thread_func(count++);
+
+	/*
+	 * Wait for a handler that may still be running to finish before
+	 * cancelling the work, so that it cannot queue trace_work again
+	 * after cancel_work_sync() has returned.
+	 */
+	timer_delete_sync(&mytimer);
+	cancel_work_sync(&trace_work);
+
+	/*
+	 * trace_array_put() decrements the reference counter associated with
+	 * the trace array - "tr". We are done using the trace array, hence
+	 * decrement the reference counter so that it can be destroyed using
+	 * trace_array_destroy().
+	 */
+	trace_array_put(tr);
+
+	return 0;
+}
+
+/*
+ * Module init: look up or create the "sample-instance" trace array and start
+ * the sample thread.
+ *
+ * Returns 0 on success or a negative errno (rather than a bare -1, which
+ * user space would report as -EPERM).
+ */
+static int __init sample_trace_array_init(void)
+{
+	int ret;
+
+	/*
+	 * Return a pointer to the trace array with name "sample-instance" if it
+	 * exists, else create a new trace array.
+	 *
+	 * NOTE: This function increments the reference counter
+	 * associated with the trace array - "tr".
+	 */
+	tr = trace_array_get_by_name("sample-instance", "sched,timer,kprobes");
+
+	/*
+	 * Only NULL is reported on failure, so the precise cause is unknown
+	 * here; -ENOMEM is used as the closest match (presumably a failed
+	 * allocation while creating the instance).
+	 */
+	if (!tr)
+		return -ENOMEM;
+	/*
+	 * Set up the context specific per-cpu buffers if they haven't already
+	 * been allocated.
+	 */
+	trace_array_init_printk(tr);
+
+	simple_tsk = kthread_run(simple_thread, NULL, "sample-instance");
+	if (IS_ERR(simple_tsk)) {
+		ret = PTR_ERR(simple_tsk);
+		trace_array_put(tr);
+		trace_array_destroy(tr);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Module teardown: stop the sample thread (which drops its reference on tr
+ * before returning), then destroy the trace array.
+ */
+static void __exit sample_trace_array_exit(void)
+{
+ kthread_stop(simple_tsk);
+
+ /*
+ * We are unloading our module and no longer require the trace array.
+ * Remove/destroy "tr" using trace_array_destroy()
+ */
+ trace_array_destroy(tr);
+}
+
+module_init(sample_trace_array_init);
+module_exit(sample_trace_array_exit);
+
+MODULE_AUTHOR("Divya Indi");
+MODULE_DESCRIPTION("Sample module for kernel access to Ftrace instances");
+MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/sample-trace-array.h b/samples/ftrace/sample-trace-array.h
new file mode 100644
index 000000000000..6f8962428158
--- /dev/null
+++ b/samples/ftrace/sample-trace-array.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * If TRACE_SYSTEM is defined, that will be the directory created
+ * in the ftrace directory under /sys/kernel/tracing/events/<system>
+ *
+ * The define_trace.h below will also look for a file name of
+ * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
+ * In this case, it would look for sample-subsystem.h
+ *
+ * If the header name will be different than the system name
+ * (as in this case), then you can override the header name that
+ * define_trace.h will look up by defining TRACE_INCLUDE_FILE
+ *
+ * This file is called sample-trace-array.h but we want the system
+ * to be called "sample-subsystem". Therefore we must define the name of this
+ * file:
+ *
+ * #define TRACE_INCLUDE_FILE sample-trace-array
+ *
+ * As we do in the bottom of this file.
+ *
+ * Notice that TRACE_SYSTEM should be defined outside of #if
+ * protection, just like TRACE_INCLUDE_FILE.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sample-subsystem
+
+/*
+ * TRACE_SYSTEM is expected to be a C valid variable (alpha-numeric
+ * and underscore), although it may start with numbers. If for some
+ * reason it is not, you need to add the following lines:
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR sample_subsystem
+
+/*
+ * But the above is only needed if TRACE_SYSTEM is not alpha-numeric
+ * and underscored. By default, TRACE_SYSTEM_VAR will be equal to
+ * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if
+ * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with
+ * only alpha-numeric and underscores.
+ *
+ * The TRACE_SYSTEM_VAR is only used internally and not visible to
+ * user space.
+ */
+
+/*
+ * Notice that this file is not protected like a normal header.
+ * We also must allow for rereading of this file. The
+ *
+ * || defined(TRACE_HEADER_MULTI_READ)
+ *
+ * serves this purpose.
+ */
+#if !defined(_SAMPLE_TRACE_ARRAY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _SAMPLE_TRACE_ARRAY_H
+
+#include <linux/tracepoint.h>
+/*
+ * sample_event: records the integer "count" and the unsigned long "time"
+ * passed by the caller, and prints them as
+ * "count value=<count> at jiffies=<time>".
+ */
+TRACE_EVENT(sample_event,
+
+ TP_PROTO(int count, unsigned long time),
+
+ TP_ARGS(count, time),
+
+ TP_STRUCT__entry(
+ __field(int, count)
+ __field(unsigned long, time)
+ ),
+
+ TP_fast_assign(
+ __entry->count = count;
+ __entry->time = time;
+ ),
+
+ TP_printk("count value=%d at jiffies=%lu", __entry->count,
+ __entry->time)
+ );
+#endif
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE sample-trace-array
+#include <trace/define_trace.h>
diff --git a/samples/hid/.gitignore b/samples/hid/.gitignore
new file mode 100644
index 000000000000..3ea0fed3bbad
--- /dev/null
+++ b/samples/hid/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+hid_mouse
+hid_surface_dial
+*.out
+*.skel.h
+/vmlinux.h
+/bpftool/
+/libbpf/
diff --git a/samples/hid/Makefile b/samples/hid/Makefile
new file mode 100644
index 000000000000..db5a077c77fc
--- /dev/null
+++ b/samples/hid/Makefile
@@ -0,0 +1,250 @@
+# SPDX-License-Identifier: GPL-2.0
+
+HID_SAMPLES_PATH ?= $(abspath $(src))
+TOOLS_PATH := $(HID_SAMPLES_PATH)/../../tools
+
+pound := \#
+
+# List of programs to build
+tprogs-y += hid_mouse
+tprogs-y += hid_surface_dial
+
+# Libbpf dependencies
+LIBBPF_SRC = $(TOOLS_PATH)/lib/bpf
+LIBBPF_OUTPUT = $(abspath $(HID_SAMPLES_PATH))/libbpf
+LIBBPF_DESTDIR = $(LIBBPF_OUTPUT)
+LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
+LIBBPF = $(LIBBPF_OUTPUT)/libbpf.a
+
+EXTRA_BPF_HEADERS := hid_bpf_helpers.h
+
+hid_mouse-objs := hid_mouse.o
+hid_surface_dial-objs := hid_surface_dial.o
+
+# Tell kbuild to always build the programs
+always-y := $(tprogs-y)
+
+ifeq ($(ARCH), arm)
+# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux
+# headers when arm instruction set identification is requested.
+ARM_ARCH_SELECTOR := $(filter -D__LINUX_ARM_ARCH__%, $(KBUILD_CFLAGS))
+BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR)
+TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR)
+endif
+
+ifeq ($(ARCH), mips)
+TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__
+ifdef CONFIG_MACH_LOONGSON64
+BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-loongson64
+BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic
+endif
+endif
+
+COMMON_CFLAGS += -Wall -O2
+COMMON_CFLAGS += -Wmissing-prototypes
+COMMON_CFLAGS += -Wstrict-prototypes
+
+TPROGS_CFLAGS += $(COMMON_CFLAGS)
+TPROGS_CFLAGS += -I$(objtree)/usr/include
+TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE)
+TPROGS_CFLAGS += -I$(srctree)/tools/include
+
+ifdef SYSROOT
+COMMON_CFLAGS += --sysroot=$(SYSROOT)
+TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib
+endif
+
+TPROGS_LDLIBS += $(LIBBPF) -lelf -lz
+
+# Allows pointing LLC/CLANG to an LLVM backend with bpf support, redefine on cmdline:
+# make M=samples/hid LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang
+LLC ?= llc
+CLANG ?= clang
+OPT ?= opt
+LLVM_DIS ?= llvm-dis
+LLVM_OBJCOPY ?= llvm-objcopy
+LLVM_READELF ?= llvm-readelf
+BTF_PAHOLE ?= pahole
+
+# Detect that we're cross compiling and use the cross compiler
+ifdef CROSS_COMPILE
+CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif
+
+# Don't evaluate probes and warnings if we need to run make recursively
+ifneq ($(src),)
+HDR_PROBE := $(shell printf "$(pound)include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \
+ $(CC) $(TPROGS_CFLAGS) $(TPROGS_LDFLAGS) -x c - \
+ -o /dev/null 2>/dev/null && echo okay)
+
+ifeq ($(HDR_PROBE),)
+$(warning WARNING: Detected possible issues with include path.)
+$(warning WARNING: Please install kernel headers locally (make headers_install).)
+endif
+
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
+BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \
+ $(CLANG) --target=bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \
+ $(LLVM_READELF) -S ./llvm_btf_verify.o | grep BTF; \
+ /bin/rm -f ./llvm_btf_verify.o)
+
+BPF_EXTRA_CFLAGS += -fno-stack-protector
+ifneq ($(BTF_LLVM_PROBE),)
+ BPF_EXTRA_CFLAGS += -g
+else
+ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
+ BPF_EXTRA_CFLAGS += -g
+ LLC_FLAGS += -mattr=dwarfris
+ DWARF2BTF = y
+endif
+endif
+endif
+
+# Trick to allow make to be run from this directory
+all:
+ $(MAKE) -C ../../ M=$(CURDIR) HID_SAMPLES_PATH=$(CURDIR)
+
+clean:
+ $(MAKE) -C ../../ M=$(CURDIR) clean
+ @find $(CURDIR) -type f -name '*~' -delete
+ @$(RM) -r $(CURDIR)/libbpf $(CURDIR)/bpftool
+
+$(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
+# Fix up variables inherited from Kbuild that tools/ build system won't like
+ $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \
+ LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(HID_SAMPLES_PATH)/../../ \
+ O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \
+ $@ install_headers
+
+BPFTOOLDIR := $(TOOLS_PATH)/bpf/bpftool
+BPFTOOL_OUTPUT := $(abspath $(HID_SAMPLES_PATH))/bpftool
+BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool
+$(BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) | $(BPFTOOL_OUTPUT)
+ $(MAKE) -C $(BPFTOOLDIR) srctree=$(HID_SAMPLES_PATH)/../../ \
+ OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
+
+$(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT):
+ $(call msg,MKDIR,$@)
+ $(Q)mkdir -p $@
+
+FORCE:
+
+
+# Verify LLVM compiler tools are available and bpf target is supported by llc
+.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC)
+
+verify_cmds: $(CLANG) $(LLC)
+ @for TOOL in $^ ; do \
+ if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \
+ echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\
+ exit 1; \
+ else true; fi; \
+ done
+
+verify_target_bpf: verify_cmds
+ @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \
+ echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\
+ echo " NOTICE: LLVM version >= 3.7.1 required" ;\
+ exit 2; \
+ else true; fi
+
+$(HID_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
+$(src)/*.c: verify_target_bpf $(LIBBPF)
+
+libbpf_hdrs: $(LIBBPF)
+
+.PHONY: libbpf_hdrs
+
+$(obj)/hid_mouse.o: $(obj)/hid_mouse.skel.h
+$(obj)/hid_surface_dial.o: $(obj)/hid_surface_dial.skel.h
+
+-include $(HID_SAMPLES_PATH)/Makefile.target
+
+VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \
+ $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \
+ $(abspath $(objtree)/vmlinux)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+
+# Produce vmlinux.h for the BPF programs.
+# If VMLINUX_H is set, that file is simply copied. Otherwise the header is
+# generated by dumping the BTF of $(VMLINUX_BTF) as C with bpftool, and the
+# build stops with an error when no vmlinux could be found.
+$(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
+ifeq ($(VMLINUX_H),)
+ifeq ($(VMLINUX_BTF),)
+ $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)",\
+ build the kernel or set VMLINUX_BTF or VMLINUX_H variable)
+endif
+ $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
+else
+ $(Q)cp "$(VMLINUX_H)" $@
+endif
+
+clean-files += vmlinux.h
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '--target=bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) -v -E - </dev/null 2>&1 \
+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/')
+endef
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG))
+
+EXTRA_BPF_HEADERS_SRC := $(addprefix $(src)/,$(EXTRA_BPF_HEADERS))
+
+# Compile each *.bpf.c into a BPF object with clang --target=bpf.
+# The extra headers and vmlinux.h are prerequisites for dependency tracking
+# only: the filter below passes just the .bpf.c file to the compiler.
+$(obj)/%.bpf.o: $(src)/%.bpf.c $(EXTRA_BPF_HEADERS_SRC) $(obj)/vmlinux.h
+ @echo " CLANG-BPF " $@
+ $(Q)$(CLANG) -g -O2 --target=bpf -D__TARGET_ARCH_$(SRCARCH) \
+ -Wno-compare-distinct-pointer-types -I$(srctree)/include \
+ -I$(srctree)/samples/bpf -I$(srctree)/tools/include \
+ -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \
+ -c $(filter %.bpf.c,$^) -o $@
+
+LINKED_SKELS := hid_mouse.skel.h hid_surface_dial.skel.h
+clean-files += $(LINKED_SKELS)
+
+hid_mouse.skel.h-deps := hid_mouse.bpf.o
+hid_surface_dial.skel.h-deps := hid_surface_dial.bpf.o
+
+LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
+
+BPF_SRCS_LINKED := $(notdir $(wildcard $(src)/*.bpf.c))
+BPF_OBJS_LINKED := $(patsubst %.bpf.c,$(obj)/%.bpf.o, $(BPF_SRCS_LINKED))
+BPF_SKELS_LINKED := $(addprefix $(obj)/,$(LINKED_SKELS))
+
+# Generate <name>.skel.h in two bpftool steps:
+#  1. 'gen object' combines the objects listed in <name>.skel.h-deps into
+#     <name>.lbpf.o
+#  2. 'gen skeleton' writes the skeleton header for that object, named <name>
+$(BPF_SKELS_LINKED): $(BPF_OBJS_LINKED) $(BPFTOOL)
+ @echo " BPF GEN-OBJ " $(@:.skel.h=)
+ $(Q)$(BPFTOOL) gen object $(@:.skel.h=.lbpf.o) $(addprefix $(obj)/,$($(@F)-deps))
+ @echo " BPF GEN-SKEL" $(@:.skel.h=)
+ $(Q)$(BPFTOOL) gen skeleton $(@:.skel.h=.lbpf.o) name $(notdir $(@:.skel.h=)) > $@
+
+# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
+# But, there is no easy way to fix it, so just exclude it since it is
+# useless for BPF samples.
+# below we use long chain of commands, clang | opt | llvm-dis | llc,
+# to generate final object file. 'clang' compiles the source into IR
+# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin
+# processing (llvm12) and IR optimizations. 'llvm-dis' converts
+# 'opt' output to IR, and finally 'llc' generates bpf byte code.
+$(obj)/%.o: $(src)/%.c
+ @echo " CLANG-bpf " $@
+ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \
+ -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \
+ -I$(LIBBPF_INCLUDE) \
+ -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
+ -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \
+ -Wno-gnu-variable-sized-type-not-at-end \
+ -Wno-address-of-packed-member -Wno-tautological-compare \
+ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
+ -fno-asynchronous-unwind-tables \
+ -I$(srctree)/samples/hid/ \
+ -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \
+ $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \
+ $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
+ifeq ($(DWARF2BTF),y)
+ $(BTF_PAHOLE) -J $@
+endif
diff --git a/samples/hid/Makefile.target b/samples/hid/Makefile.target
new file mode 100644
index 000000000000..7621f55e2947
--- /dev/null
+++ b/samples/hid/Makefile.target
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0
+# ==========================================================================
+# Building binaries on the host system
+# Binaries are not used during the compilation of the kernel and are intended
+# to be built for the target board (which can of course be the host). Added so
+# that binaries can be built to run on a system other than the host.
+#
+# Sample syntax
+# tprogs-y := xsk_example
+# Will compile xsk_example.c and create an executable named xsk_example
+#
+# tprogs-y := xdpsock
+# xdpsock-objs := xdpsock_1.o xdpsock_2.o
+# Will compile xdpsock_1.c and xdpsock_2.c, and then link the executable
+# xdpsock, based on xdpsock_1.o and xdpsock_2.o
+#
+# Derived from scripts/Makefile.host
+#
+__tprogs := $(sort $(tprogs-y))
+
+# C code
+# Executables compiled from a single .c file
+tprog-csingle := $(foreach m,$(__tprogs), \
+ $(if $($(m)-objs),,$(m)))
+
+# C executables linked based on several .o files
+tprog-cmulti := $(foreach m,$(__tprogs),\
+ $(if $($(m)-objs),$(m)))
+
+# Object (.o) files compiled from .c files
+tprog-cobjs := $(sort $(foreach m,$(__tprogs),$($(m)-objs)))
+
+tprog-csingle := $(addprefix $(obj)/,$(tprog-csingle))
+tprog-cmulti := $(addprefix $(obj)/,$(tprog-cmulti))
+tprog-cobjs := $(addprefix $(obj)/,$(tprog-cobjs))
+
+#####
+# Handle options to gcc. Support building with separate output directory
+
+_tprogc_flags = $(TPROGS_CFLAGS) \
+ $(TPROGCFLAGS_$(basetarget).o)
+
+# $(objtree)/$(obj) for including generated headers from checkin source files
+ifeq ($(KBUILD_EXTMOD),)
+ifdef building_out_of_srctree
+_tprogc_flags += -I $(objtree)/$(obj)
+endif
+endif
+
+tprogc_flags = -Wp,-MD,$(depfile) $(_tprogc_flags)
+
+# Create executable from a single .c file
+# tprog-csingle -> Executable
+quiet_cmd_tprog-csingle = CC $@
+ cmd_tprog-csingle = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ $< \
+ $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F))
+$(tprog-csingle): $(obj)/%: $(src)/%.c FORCE
+ $(call if_changed_dep,tprog-csingle)
+
+# Link an executable based on list of .o files, all plain c
+# tprog-cmulti -> executable
+quiet_cmd_tprog-cmulti = LD $@
+ cmd_tprog-cmulti = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ \
+ $(addprefix $(obj)/,$($(@F)-objs)) \
+ $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F))
+$(tprog-cmulti): $(tprog-cobjs) FORCE
+ $(call if_changed,tprog-cmulti)
+$(call multi_depend, $(tprog-cmulti), , -objs)
+
+# Create .o file from a single .c file
+# tprog-cobjs -> .o
+quiet_cmd_tprog-cobjs = CC $@
+ cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $<
+$(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE
+ $(call if_changed_dep,tprog-cobjs)
diff --git a/samples/hid/hid_bpf_helpers.h b/samples/hid/hid_bpf_helpers.h
new file mode 100644
index 000000000000..4fff31dbe0e7
--- /dev/null
+++ b/samples/hid/hid_bpf_helpers.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2022 Benjamin Tissoires
+ */
+
+#ifndef __HID_BPF_HELPERS_H
+#define __HID_BPF_HELPERS_H
+
+/*
+ * Following are kfuncs exported by HID for HID-BPF.
+ *
+ * The types used below (struct hid_bpf_ctx, enum hid_report_type, ...) are
+ * not defined in this header; the sample programs include "vmlinux.h" before
+ * including it.
+ */
+
+/*
+ * Get a pointer to __sz bytes of the context's data starting at offset.
+ * The sample programs treat a NULL return as "no access" and bail out.
+ */
+extern __u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx,
+ unsigned int offset,
+ const size_t __sz) __ksym;
+/* NOTE(review): not called by any of the sample programs in this directory */
+extern int hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, u32 flags) __ksym;
+/*
+ * Obtain a context for the device with the given HID id; the samples check
+ * the result for NULL and hand it back with hid_bpf_release_context().
+ */
+extern struct hid_bpf_ctx *hid_bpf_allocate_context(unsigned int hid_id) __ksym;
+extern void hid_bpf_release_context(struct hid_bpf_ctx *ctx) __ksym;
+/*
+ * Issue a report request (the samples use HID_REQ_GET_REPORT and
+ * HID_REQ_SET_REPORT) with data/buf__sz as the report buffer.
+ * NOTE(review): the meaning of the return value is not visible here; the
+ * samples only log it and pass it back to userspace.
+ */
+extern int hid_bpf_hw_request(struct hid_bpf_ctx *ctx,
+ __u8 *data,
+ size_t buf__sz,
+ enum hid_report_type type,
+ enum hid_class_request reqtype) __ksym;
+
+#endif /* __HID_BPF_HELPERS_H */
diff --git a/samples/hid/hid_mouse.bpf.c b/samples/hid/hid_mouse.bpf.c
new file mode 100644
index 000000000000..f7f722dcf56d
--- /dev/null
+++ b/samples/hid/hid_mouse.bpf.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "hid_bpf_helpers.h"
+
+/*
+ * Negate the signed 16-bit little-endian value held in bytes 3-4 of the
+ * incoming report. The first nine bytes of the report are traced before and
+ * after the change.
+ *
+ * Always returns 0, including when the report data cannot be accessed.
+ */
+static int hid_y_event(struct hid_bpf_ctx *hctx)
+{
+	__u8 *report = hid_bpf_get_data(hctx, 0 /* offset */, 9 /* size */);
+	s16 value;
+
+	if (!report)
+		return 0; /* EPERM check */
+
+	bpf_printk("event: size: %d", hctx->size);
+	bpf_printk("incoming event: %02x %02x %02x", report[0], report[1], report[2]);
+	bpf_printk(" %02x %02x %02x", report[3], report[4], report[5]);
+	bpf_printk(" %02x %02x %02x", report[6], report[7], report[8]);
+
+	/* assemble the value, flip its sign, and store it back byte by byte */
+	value = report[3] | (report[4] << 8);
+	value = -value;
+	report[3] = value & 0xFF;
+	report[4] = (value >> 8) & 0xFF;
+
+	bpf_printk("modified event: %02x %02x %02x", report[0], report[1], report[2]);
+	bpf_printk(" %02x %02x %02x", report[3], report[4], report[5]);
+	bpf_printk(" %02x %02x %02x", report[6], report[7], report[8]);
+
+	return 0;
+}
+
+/*
+ * Negate the signed 16-bit little-endian value held in bytes 1-2 of the
+ * incoming report.
+ *
+ * Always returns 0, including when the report data cannot be accessed.
+ */
+static int hid_x_event(struct hid_bpf_ctx *hctx)
+{
+	__u8 *report = hid_bpf_get_data(hctx, 0 /* offset */, 9 /* size */);
+	s16 value;
+
+	if (!report)
+		return 0; /* EPERM check */
+
+	/* assemble the value, flip its sign, and store it back byte by byte */
+	value = report[1] | (report[2] << 8);
+	value = -value;
+	report[1] = value & 0xFF;
+	report[2] = (value >> 8) & 0xFF;
+
+	return 0;
+}
+
+/*
+ * Device event hook: run the Y handler first, then the X handler.
+ * A non-zero result from the Y handler is returned as is and skips the
+ * X handler.
+ */
+SEC("struct_ops/hid_device_event")
+int BPF_PROG(hid_event, struct hid_bpf_ctx *hctx, enum hid_report_type type)
+{
+	int err;
+
+	err = hid_y_event(hctx);
+	if (err)
+		return err;
+
+	err = hid_x_event(hctx);
+	return err;
+}
+
+
+/*
+ * Report descriptor fixup hook: trace the first nine descriptor bytes, then
+ * exchange the X and Y usages. Always returns 0.
+ */
+SEC("struct_ops/hid_rdesc_fixup")
+int BPF_PROG(hid_rdesc_fixup, struct hid_bpf_ctx *hctx)
+{
+	__u8 *rdesc = hid_bpf_get_data(hctx, 0 /* offset */, 4096 /* size */);
+
+	if (!rdesc)
+		return 0; /* EPERM check */
+
+	bpf_printk("rdesc: %02x %02x %02x", rdesc[0], rdesc[1], rdesc[2]);
+	bpf_printk(" %02x %02x %02x", rdesc[3], rdesc[4], rdesc[5]);
+	bpf_printk(" %02x %02x %02x ...", rdesc[6], rdesc[7], rdesc[8]);
+
+	/*
+	 * Relevant part of the original report descriptor (byte offset of
+	 * each item in the last column):
+	 *
+	 * 0x05, 0x01, // Usage Page (Generic Desktop) 30
+	 * 0x16, 0x01, 0x80, // Logical Minimum (-32767) 32
+	 * 0x26, 0xff, 0x7f, // Logical Maximum (32767) 35
+	 * 0x09, 0x30, // Usage (X) 38
+	 * 0x09, 0x31, // Usage (Y) 40
+	 *
+	 * The usage values themselves therefore live at byte 39 (X) and
+	 * byte 41 (Y); writing Y into the former and X into the latter
+	 * swaps the two axes.
+	 */
+	rdesc[39] = 0x31;
+	rdesc[41] = 0x30;
+
+	return 0;
+}
+
+/*
+ * struct_ops map tying the two programs above together. The userspace
+ * loader (hid_mouse.c) fills in hid_id before loading and then attaches
+ * this map.
+ */
+SEC(".struct_ops.link")
+struct hid_bpf_ops mouse_invert = {
+ .hid_rdesc_fixup = (void *)hid_rdesc_fixup,
+ .hid_device_event = (void *)hid_event,
+};
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/hid/hid_mouse.c b/samples/hid/hid_mouse.c
new file mode 100644
index 000000000000..4b80d4e4c154
--- /dev/null
+++ b/samples/hid/hid_mouse.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Benjamin Tissoires
+ *
+ * This is a pure HID-BPF example, and should be considered as such:
+ * on the Etekcity Scroll 6E, the X and Y axes will be swapped and
+ * inverted. On any other device... Not sure what this will do.
+ *
+ * This C main file is generic though. To adapt the code and test, users
+ * must amend only the .bpf.c file, which this program will load any
+ * eBPF program it finds.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+#include <linux/bpf.h>
+#include <linux/errno.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "hid_mouse.skel.h"
+
+/*
+ * Cleared from the signal handler to make main() leave its wait loop.
+ * volatile sig_atomic_t is the type that may safely be written from a
+ * signal handler and read from the interrupted code.
+ */
+static volatile sig_atomic_t running = true;
+
+/*
+ * SIGINT/SIGTERM handler: only request termination.
+ *
+ * exit() is deliberately not called here: it is not async-signal-safe, and
+ * leaving through it would skip the hid_mouse__destroy() call that main()
+ * performs once the wait loop ends.
+ */
+static void int_exit(int sig)
+{
+	(void)sig;
+	running = false;
+}
+
+/*
+ * Print the command synopsis followed by a short description to stderr.
+ * The synopsis line is prefixed with this function's name via __func__.
+ */
+static void usage(const char *prog)
+{
+	static const char description[] =
+		"This program will upload and attach a HID-BPF program to the given device.\n"
+		"On the Etekcity Scroll 6E, the X and Y axis will be inverted, but on any other\n"
+		"device, chances are high that the device will not be working anymore\n\n"
+		"consider this as a demo and adapt the eBPF program to your needs\n"
+		"Hit Ctrl-C to unbind the program and reset the device\n";
+
+	fprintf(stderr, "%s: %s /sys/bus/hid/devices/0BUS:0VID:0PID:00ID\n\n",
+		__func__, prog);
+	fputs(description, stderr);
+}
+
+/*
+ * Extract the numeric HID id from a sysfs device path.
+ *
+ * @path: sysfs directory of the device; its last component is expected to
+ *        look like "0003:0001:0A37.0001", the id being the hexadecimal
+ *        number after the final '.'.
+ *
+ * The path is validated by opening "<path>/uevent" first.
+ *
+ * Returns the id (>= 0) on success or a negative errno value on failure.
+ * errno is set on failure as well, because callers report the error
+ * with %m.
+ */
+static int get_hid_id(const char *path)
+{
+	const char *str_id, *dir;
+	char uevent[1024];
+	char *end;
+	long id;
+	int fd, len;
+
+	len = snprintf(uevent, sizeof(uevent), "%s/uevent", path);
+	if (len < 0 || (size_t)len >= sizeof(uevent)) {
+		errno = ENAMETOOLONG;
+		return -ENAMETOOLONG;
+	}
+
+	fd = open(uevent, O_RDONLY | O_NONBLOCK);
+	if (fd < 0)
+		return -ENOENT;
+
+	close(fd);
+
+	dir = basename((char *)path);
+
+	/*
+	 * Locate the id by its '.' separator rather than by skipping
+	 * sizeof("0003:0001:0A37.") bytes: sizeof counts the terminating
+	 * NUL, so that offset is one too far, drops the first digit of the
+	 * id and points past the end of shorter names.
+	 */
+	str_id = strrchr(dir, '.');
+	if (!str_id)
+		goto invalid;
+	str_id++;
+
+	errno = 0;
+	id = strtol(str_id, &end, 16);
+	/* no digits, overflow, or a value that does not fit the int result */
+	if (end == str_id || errno || id < 0 || id != (int)id)
+		goto invalid;
+
+	return (int)id;
+
+invalid:
+	errno = EINVAL;
+	return -EINVAL;
+}
+
+/*
+ * Load the HID-BPF programs of the hid_mouse skeleton, bind them to the
+ * device given as sysfs path on the command line, and keep them attached
+ * until the wait loop ends.
+ *
+ * Returns 0 on a clean shutdown and non-zero on any error. Once the
+ * skeleton has been opened, every exit path releases the link (if any)
+ * and the skeleton.
+ */
+int main(int argc, char **argv)
+{
+	struct hid_mouse *skel;
+	struct bpf_link *link = NULL;
+	const char *optstr = "";
+	const char *sysfs_path;
+	int opt, hid_id, err;
+	int ret = 1;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+		switch (opt) {
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	/* optind < argc here, so argv[optind] is a real, non-NULL argument */
+	sysfs_path = argv[optind];
+
+	skel = hid_mouse__open();
+	if (!skel) {
+		fprintf(stderr, "%s %s:%d\n", __func__, __FILE__, __LINE__);
+		return -1;
+	}
+
+	hid_id = get_hid_id(sysfs_path);
+	if (hid_id < 0) {
+		/* report before cleaning up so that %m still sees errno */
+		fprintf(stderr, "can not open HID device: %m\n");
+		goto out;
+	}
+	skel->struct_ops.mouse_invert->hid_id = hid_id;
+
+	err = hid_mouse__load(skel);
+	if (err < 0) {
+		fprintf(stderr, "can not load HID-BPF program: %m\n");
+		goto out;
+	}
+
+	link = bpf_map__attach_struct_ops(skel->maps.mouse_invert);
+	if (!link) {
+		fprintf(stderr, "can not attach HID-BPF program: %m\n");
+		goto out;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	while (running)
+		sleep(1);
+
+	ret = 0;
+out:
+	if (link)
+		bpf_link__destroy(link);
+	hid_mouse__destroy(skel);
+
+	return ret;
+}
diff --git a/samples/hid/hid_surface_dial.bpf.c b/samples/hid/hid_surface_dial.bpf.c
new file mode 100644
index 000000000000..527d584812ab
--- /dev/null
+++ b/samples/hid/hid_surface_dial.bpf.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Benjamin Tissoires
+ */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "hid_bpf_helpers.h"
+
+#define HID_UP_BUTTON 0x0009
+#define HID_GD_WHEEL 0x0038
+
+/*
+ * Device event hook: clear bit 1 of byte 1 (touch) and zero bytes 4-7
+ * (X and Y) of every incoming report. Always returns 0.
+ */
+SEC("struct_ops/hid_device_event")
+int BPF_PROG(hid_event, struct hid_bpf_ctx *hctx)
+{
+	__u8 *report = hid_bpf_get_data(hctx, 0 /* offset */, 9 /* size */);
+
+	if (!report)
+		return 0; /* EPERM check */
+
+	/* Touch */
+	report[1] &= 0xfd;
+
+	/* X occupies bytes 4-5 */
+	report[4] = 0;
+	report[5] = 0;
+
+	/* Y occupies bytes 6-7 */
+	report[6] = 0;
+	report[7] = 0;
+
+	return 0;
+}
+
+/*
+ * 72 == 360 / 5 -> 1 report every 5 degrees
+ *
+ * Both values are written into the report descriptor by hid_rdesc_fixup()
+ * and are overwritten from userspace (hid_surface_dial.c) through the
+ * skeleton's data section.
+ */
+int resolution = 72;
+int physical = 5;
+
+/*
+ * Context of the "syscall" program below. The userspace loader declares a
+ * struct with the same layout and passes it as ctx_in.
+ */
+struct haptic_syscall_args {
+ unsigned int hid; /* HID id of the device to configure */
+ int retval; /* result of the final SET_REPORT request */
+};
+
+/* Feature report buffer shared by the GET_REPORT and SET_REPORT requests */
+static __u8 haptic_data[8];
+
+/*
+ * "syscall" program run on demand from userspace: read feature report 1
+ * from the device, adjust byte 4 (haptic auto trigger) and write the
+ * report back.
+ *
+ * Returns -1 when no context could be allocated for args->hid, 0 otherwise.
+ * The result of the SET_REPORT request is passed back in args->retval.
+ */
+SEC("syscall")
+int set_haptic(struct haptic_syscall_args *args)
+{
+ struct hid_bpf_ctx *ctx;
+ const size_t size = sizeof(haptic_data);
+ u16 *res;
+ int ret;
+
+ /* NOTE(review): size is sizeof(haptic_data), so this can never be true as written */
+ if (size > sizeof(haptic_data))
+ return -7; /* -E2BIG */
+
+ ctx = hid_bpf_allocate_context(args->hid);
+ if (!ctx)
+ return -1; /* EPERM check */
+
+ haptic_data[0] = 1; /* report ID */
+
+ ret = hid_bpf_hw_request(ctx, haptic_data, size, HID_FEATURE_REPORT, HID_REQ_GET_REPORT);
+
+ /* NOTE(review): despite its wording, this logs the result of the GET_REPORT request above */
+ bpf_printk("probed/remove event ret value: %d", ret);
+ bpf_printk("buf: %02x %02x %02x",
+ haptic_data[0],
+ haptic_data[1],
+ haptic_data[2]);
+ bpf_printk(" %02x %02x %02x",
+ haptic_data[3],
+ haptic_data[4],
+ haptic_data[5]);
+ bpf_printk(" %02x %02x",
+ haptic_data[6],
+ haptic_data[7]);
+
+ /* whenever resolution multiplier is not 3600, we have the fixed report descriptor */
+ /*
+  * NOTE(review): bytes 1-2 are read as one 16-bit value through a pointer
+  * cast at an odd offset, in the byte order of the target - confirm this
+  * is acceptable on all targets.
+  */
+ res = (u16 *)&haptic_data[1];
+ if (*res != 3600) {
+// haptic_data[1] = 72; /* resolution multiplier */
+// haptic_data[2] = 0; /* resolution multiplier */
+// haptic_data[3] = 0; /* Repeat Count */
+ haptic_data[4] = 3; /* haptic Auto Trigger */
+// haptic_data[5] = 5; /* Waveform Cutoff Time */
+// haptic_data[6] = 80; /* Retrigger Period */
+// haptic_data[7] = 0; /* Retrigger Period */
+ } else {
+ haptic_data[4] = 0;
+ }
+
+ ret = hid_bpf_hw_request(ctx, haptic_data, size, HID_FEATURE_REPORT, HID_REQ_SET_REPORT);
+
+ bpf_printk("set haptic ret value: %d -> %d", ret, haptic_data[4]);
+
+ /* hand the SET_REPORT result back to the caller through the context */
+ args->retval = ret;
+
+ hid_bpf_release_context(ctx);
+
+ return 0;
+}
+
+/* Convert REL_DIAL into REL_WHEEL */
+/*
+ * Report descriptor fixup hook: patch fixed offsets of the descriptor so
+ * that touch becomes a button, the dial becomes a wheel, the resolution
+ * multiplier follows the 'physical' and 'resolution' globals, and X/Y are
+ * reported as relative. Always returns 0.
+ */
+SEC("struct_ops/hid_rdesc_fixup")
+int BPF_PROG(hid_rdesc_fixup, struct hid_bpf_ctx *hctx)
+{
+	__u8 *rdesc = hid_bpf_get_data(hctx, 0 /* offset */, 4096 /* size */);
+	__u16 *phys_field, *res_field;
+
+	if (!rdesc)
+		return 0; /* EPERM check */
+
+	/* Convert TOUCH into a button */
+	rdesc[31] = HID_UP_BUTTON;
+	rdesc[33] = 2;
+
+	/* Convert REL_DIAL into REL_WHEEL */
+	rdesc[45] = HID_GD_WHEEL;
+
+	/* Change Resolution Multiplier: two 16-bit fields at offsets 61 and 66 */
+	phys_field = (__u16 *)&rdesc[61];
+	*phys_field = physical;
+	res_field = (__u16 *)&rdesc[66];
+	*res_field = resolution;
+
+	/* Convert X,Y from Abs to Rel */
+	rdesc[88] = 0x06;
+	rdesc[98] = 0x06;
+
+	return 0;
+}
+
+/*
+ * struct_ops map tying the descriptor fixup and the event handler together.
+ * The userspace loader (hid_surface_dial.c) fills in hid_id before loading
+ * and then attaches this map.
+ */
+SEC(".struct_ops.link")
+struct hid_bpf_ops surface_dial = {
+ .hid_rdesc_fixup = (void *)hid_rdesc_fixup,
+ .hid_device_event = (void *)hid_event,
+};
+
+char _license[] SEC("license") = "GPL";
+/* NOTE(review): hid_mouse.bpf.c has no "version" section - confirm this one is still needed */
+u32 _version SEC("version") = 1;
diff --git a/samples/hid/hid_surface_dial.c b/samples/hid/hid_surface_dial.c
new file mode 100644
index 000000000000..9dd363845a85
--- /dev/null
+++ b/samples/hid/hid_surface_dial.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Benjamin Tissoires
+ *
+ * This program will morph the Microsoft Surface Dial into a mouse,
+ * and depending on the chosen resolution enable or not the haptic feedback:
+ * - a resolution (-r) of 3600 will report 3600 "ticks" in one full rotation
+ * without haptic feedback
+ * - any other resolution will report N "ticks" in a full rotation with haptic
+ * feedback
+ *
+ * A good default for low resolution haptic scrolling is 72 (1 "tick" every 5
+ * degrees), and set to 3600 for smooth scrolling.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+#include <linux/bpf.h>
+#include <linux/errno.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "hid_surface_dial.skel.h"
+
+/*
+ * Cleared from the signal handler to make main() leave its wait loop.
+ * volatile sig_atomic_t is the type that may safely be written from a
+ * signal handler and read from the interrupted code.
+ */
+static volatile sig_atomic_t running = true;
+
+/* Context passed to the "syscall" BPF program; layout must match the BPF side */
+struct haptic_syscall_args {
+	unsigned int hid;
+	int retval;
+};
+
+/*
+ * SIGINT/SIGTERM handler: only request termination.
+ *
+ * exit() is deliberately not called here: it is not async-signal-safe, and
+ * leaving through it would skip the hid_surface_dial__destroy() call that
+ * main() performs once the wait loop ends.
+ */
+static void int_exit(int sig)
+{
+	(void)sig;
+	running = false;
+}
+
+/*
+ * Print the command synopsis and option list, followed by a description of
+ * what the program does, to stderr. The synopsis line is prefixed with this
+ * function's name via __func__.
+ */
+static void usage(const char *prog)
+{
+	static const char description[] =
+		"This program will morph the Microsoft Surface Dial into a mouse,\n"
+		"and depending on the chosen resolution enable or not the haptic feedback:\n"
+		"- a resolution (-r) of 3600 will report 3600 'ticks' in one full rotation\n"
+		" without haptic feedback\n"
+		"- any other resolution will report N 'ticks' in a full rotation with haptic\n"
+		" feedback\n"
+		"\n"
+		"A good default for low resolution haptic scrolling is 72 (1 'tick' every 5\n"
+		"degrees), and set to 3600 for smooth scrolling.\n";
+
+	fprintf(stderr,
+		"%s: %s [OPTIONS] /sys/bus/hid/devices/0BUS:0VID:0PID:00ID\n\n"
+		" OPTIONS:\n"
+		" -r N\t set the given resolution to the device (number of ticks per 360°)\n\n",
+		__func__, prog);
+	fputs(description, stderr);
+}
+
+/*
+ * Extract the numeric HID id from a sysfs device path.
+ *
+ * @path: sysfs directory of the device; its last component is expected to
+ *        look like "0003:0001:0A37.0001", the id being the hexadecimal
+ *        number after the final '.'.
+ *
+ * The path is validated by opening "<path>/uevent" first.
+ *
+ * Returns the id (>= 0) on success or a negative errno value on failure.
+ * errno is set on failure as well, because callers report the error
+ * with %m.
+ */
+static int get_hid_id(const char *path)
+{
+	const char *str_id, *dir;
+	char uevent[1024];
+	char *end;
+	long id;
+	int fd, len;
+
+	len = snprintf(uevent, sizeof(uevent), "%s/uevent", path);
+	if (len < 0 || (size_t)len >= sizeof(uevent)) {
+		errno = ENAMETOOLONG;
+		return -ENAMETOOLONG;
+	}
+
+	fd = open(uevent, O_RDONLY | O_NONBLOCK);
+	if (fd < 0)
+		return -ENOENT;
+
+	close(fd);
+
+	dir = basename((char *)path);
+
+	/*
+	 * Locate the id by its '.' separator rather than by skipping
+	 * sizeof("0003:0001:0A37.") bytes: sizeof counts the terminating
+	 * NUL, so that offset is one too far, drops the first digit of the
+	 * id and points past the end of shorter names.
+	 */
+	str_id = strrchr(dir, '.');
+	if (!str_id)
+		goto invalid;
+	str_id++;
+
+	errno = 0;
+	id = strtol(str_id, &end, 16);
+	/* no digits, overflow, or a value that does not fit the int result */
+	if (end == str_id || errno || id < 0 || id != (int)id)
+		goto invalid;
+
+	return (int)id;
+
+invalid:
+	errno = EINVAL;
+	return -EINVAL;
+}
+
+/*
+ * Run the "set_haptic" syscall BPF program once for the given device.
+ *
+ * @skel:   loaded skeleton providing the program
+ * @hid_id: HID id handed to the program through its context
+ *
+ * Returns 0 on success and 1 on failure. Failure covers the test run
+ * itself, a non-zero return value of the program, and a negative request
+ * result stored by the program in args.retval.
+ */
+static int set_haptic(struct hid_surface_dial *skel, int hid_id)
+{
+	struct haptic_syscall_args args = {
+		.hid = hid_id,
+		.retval = -1,
+	};
+	int haptic_fd, err;
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattr,
+		.ctx_in = &args,
+		.ctx_size_in = sizeof(args),
+	);
+
+	haptic_fd = bpf_program__fd(skel->progs.set_haptic);
+	if (haptic_fd < 0) {
+		fprintf(stderr, "can't locate haptic prog: %m\n");
+		return 1;
+	}
+
+	err = bpf_prog_test_run_opts(haptic_fd, &tattr);
+	if (err) {
+		fprintf(stderr, "can't set haptic configuration to hid device %d: %m (err: %d)\n",
+			hid_id, err);
+		return 1;
+	}
+
+	/*
+	 * The run succeeding only means the program was executed. The
+	 * program returns non-zero when it could not get a context for the
+	 * device, and stores the result of its SET_REPORT request in
+	 * args.retval.
+	 */
+	if (tattr.retval) {
+		fprintf(stderr, "haptic prog failed for hid device %d (retval: %d)\n",
+			hid_id, (int)tattr.retval);
+		return 1;
+	}
+
+	if (args.retval < 0) {
+		fprintf(stderr, "can't set haptic configuration to hid device %d (err: %d)\n",
+			hid_id, args.retval);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Load the HID-BPF programs of the hid_surface_dial skeleton, bind them to
+ * the device given as sysfs path on the command line, apply the requested
+ * resolution (-r, default 72) and keep everything attached until the wait
+ * loop ends.
+ *
+ * Returns 0 on a clean shutdown and non-zero on any error. Once the
+ * skeleton has been opened, every exit path releases the link (if any)
+ * and the skeleton.
+ */
+int main(int argc, char **argv)
+{
+	struct hid_surface_dial *skel;
+	const char *optstr = "r:";
+	struct bpf_link *link = NULL;
+	const char *sysfs_path;
+	int err, opt, hid_id, resolution = 72;
+	int ret = 1;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+		switch (opt) {
+		case 'r':
+		{
+			char *endp = NULL;
+			long l = -1;
+
+			if (optarg) {
+				errno = 0;
+				l = strtol(optarg, &endp, 10);
+				/* reject empty input, trailing junk and overflow */
+				if (errno || endp == optarg || *endp)
+					l = -1;
+			}
+
+			/*
+			 * The BPF program stores the resolution in a 16-bit
+			 * field of the report descriptor, so larger values
+			 * would be silently truncated.
+			 */
+			if (l < 0 || l > 0xffff) {
+				fprintf(stderr,
+					"invalid r option %s - expecting a number\n",
+					optarg ? optarg : "");
+				exit(EXIT_FAILURE);
+			}
+
+			resolution = (int) l;
+			break;
+		}
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	/* optind < argc here, so argv[optind] is a real, non-NULL argument */
+	sysfs_path = argv[optind];
+
+	skel = hid_surface_dial__open();
+	if (!skel) {
+		fprintf(stderr, "%s %s:%d\n", __func__, __FILE__, __LINE__);
+		return -1;
+	}
+
+	hid_id = get_hid_id(sysfs_path);
+	if (hid_id < 0) {
+		/* report before cleaning up so that %m still sees errno */
+		fprintf(stderr, "can not open HID device: %m\n");
+		goto out;
+	}
+
+	skel->struct_ops.surface_dial->hid_id = hid_id;
+
+	err = hid_surface_dial__load(skel);
+	if (err < 0) {
+		fprintf(stderr, "can not load HID-BPF program: %m\n");
+		goto out;
+	}
+
+	/* consumed by the report descriptor fixup, which runs on attach */
+	skel->data->resolution = resolution;
+	skel->data->physical = (int)(resolution / 72);
+
+	link = bpf_map__attach_struct_ops(skel->maps.surface_dial);
+	if (!link) {
+		fprintf(stderr, "can not attach HID-BPF program: %m\n");
+		goto out;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	set_haptic(skel, hid_id);
+
+	while (running)
+		sleep(1);
+
+	ret = 0;
+out:
+	if (link)
+		bpf_link__destroy(link);
+	hid_surface_dial__destroy(skel);
+
+	return ret;
+}
diff --git a/samples/hidraw/.gitignore b/samples/hidraw/.gitignore
new file mode 100644
index 000000000000..5233ab63262e
--- /dev/null
+++ b/samples/hidraw/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/hid-example
diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile
index 382eeae77bd6..594d989e5486 100644
--- a/samples/hidraw/Makefile
+++ b/samples/hidraw/Makefile
@@ -1,10 +1,4 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += hid-example
-# List of programs to build
-hostprogs-y := hid-example
-
-# Tell kbuild to always build the programs
-always := $(hostprogs-y)
-
-HOSTCFLAGS_hid-example.o += -I$(objtree)/usr/include
+userccflags += -I usr/include
diff --git a/samples/hidraw/hid-example.c b/samples/hidraw/hid-example.c
index 512a7e50bcae..0f73ace3c6c3 100644
--- a/samples/hidraw/hid-example.c
+++ b/samples/hidraw/hid-example.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Hidraw Userspace Example
*
@@ -46,10 +47,14 @@ int main(int argc, char **argv)
char buf[256];
struct hidraw_report_descriptor rpt_desc;
struct hidraw_devinfo info;
+ char *device = "/dev/hidraw0";
+
+ if (argc > 1)
+ device = argv[1];
/* Open the Device with non-blocking reads. In real life,
don't use a hard coded path; use libudev instead. */
- fd = open("/dev/hidraw0", O_RDWR|O_NONBLOCK);
+ fd = open(device, O_RDWR|O_NONBLOCK);
if (fd < 0) {
perror("Unable to open device");
@@ -114,7 +119,7 @@ int main(int argc, char **argv)
if (res < 0)
perror("HIDIOCSFEATURE");
else
- printf("ioctl HIDIOCGFEATURE returned: %d\n", res);
+ printf("ioctl HIDIOCSFEATURE returned: %d\n", res);
/* Get Feature */
buf[0] = 0x9; /* Report Number */
@@ -123,7 +128,7 @@ int main(int argc, char **argv)
perror("HIDIOCGFEATURE");
} else {
printf("ioctl HIDIOCGFEATURE returned: %d\n", res);
- printf("Report data (not containing the report number):\n\t");
+ printf("Report data:\n\t");
for (i = 0; i < res; i++)
printf("%hhx ", buf[i]);
puts("\n");
diff --git a/samples/hung_task/Makefile b/samples/hung_task/Makefile
new file mode 100644
index 000000000000..86036f1a204d
--- /dev/null
+++ b/samples/hung_task/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_SAMPLE_HUNG_TASK) += hung_task_tests.o
diff --git a/samples/hung_task/hung_task_tests.c b/samples/hung_task/hung_task_tests.c
new file mode 100644
index 000000000000..0360ec916890
--- /dev/null
+++ b/samples/hung_task/hung_task_tests.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * hung_task_tests.c - Sample code for testing hung tasks with mutex,
+ * semaphore, etc.
+ *
+ * Usage: Load this module and read `<debugfs>/hung_task/mutex`,
+ * `<debugfs>/hung_task/semaphore`, `<debugfs>/hung_task/rw_semaphore_read`,
+ * `<debugfs>/hung_task/rw_semaphore_write`, etc., with 2 or more processes.
+ *
+ * This is for testing kernel hung_task error messages with various locking
+ * mechanisms (e.g., mutex, semaphore, rw_semaphore_read, rw_semaphore_write, etc.).
+ * Note that this may freeze your system or cause a panic. Use only for testing purposes.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/rwsem.h>
+
+#define HUNG_TASK_DIR "hung_task" /* debugfs directory created by this module */
+#define HUNG_TASK_MUTEX_FILE "mutex" /* file read through read_dummy_mutex() */
+#define HUNG_TASK_SEM_FILE "semaphore" /* file read through read_dummy_semaphore() */
+#define HUNG_TASK_RWSEM_READ_FILE "rw_semaphore_read" /* file read through read_dummy_rwsem_read() */
+#define HUNG_TASK_RWSEM_WRITE_FILE "rw_semaphore_write" /* file read through read_dummy_rwsem_write() */
+#define SLEEP_SECOND 256 /* seconds a reader sleeps with the lock held; multiplied by 1000 for msleep_interruptible() */
+
+static const char dummy_string[] = "This is a dummy string."; /* data handed back by every test file */
+static DEFINE_MUTEX(dummy_mutex); /* lock taken by the "mutex" file */
+static DEFINE_SEMAPHORE(dummy_sem, 1); /* lock taken by the "semaphore" file; defined with a count of 1 */
+static DECLARE_RWSEM(dummy_rwsem); /* shared by the rw_semaphore_read and rw_semaphore_write files */
+static struct dentry *hung_task_dir; /* set in hung_task_tests_init(), removed in hung_task_tests_exit() */
+
+/*
+ * Mutex-based read function: read handler of the "mutex" debugfs file.
+ *
+ * Returns 0 without touching the lock when *ppos is already at or past the
+ * end of dummy_string. Otherwise it takes dummy_mutex, sleeps for up to
+ * SLEEP_SECOND seconds with the mutex held, and returns whatever
+ * simple_read_from_buffer() returns for copying dummy_string to user_buf.
+ */
+static ssize_t read_dummy_mutex(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ /* Check if data is already read: an offset at or past the end means nothing is left */
+ if (*ppos >= sizeof(dummy_string))
+ return 0;
+
+ /*
+  * Second task waits on mutex, entering uninterruptible sleep.
+  * There is no explicit unlock: guard() releases the mutex when this
+  * function returns, so it is still held during the copy below.
+  */
+ guard(mutex)(&dummy_mutex);
+
+ /* First task sleeps here, interruptible, for up to SLEEP_SECOND seconds */
+ msleep_interruptible(SLEEP_SECOND * 1000);
+
+ return simple_read_from_buffer(user_buf, count, ppos, dummy_string,
+ sizeof(dummy_string));
+}
+
+/*
+ * Semaphore-based read function: read handler of the "semaphore" debugfs
+ * file.
+ *
+ * Returns 0 without touching the lock when *ppos is already at or past the
+ * end of dummy_string. Otherwise it acquires dummy_sem, sleeps for up to
+ * SLEEP_SECOND seconds while holding it, releases it, and only then copies
+ * dummy_string to user_buf via simple_read_from_buffer().
+ */
+static ssize_t read_dummy_semaphore(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ /* Check if data is already read: an offset at or past the end means nothing is left */
+ if (*ppos >= sizeof(dummy_string))
+ return 0;
+
+ /* Second task waits on semaphore, entering uninterruptible sleep */
+ down(&dummy_sem);
+
+ /* First task sleeps here, interruptible, for up to SLEEP_SECOND seconds */
+ msleep_interruptible(SLEEP_SECOND * 1000);
+
+ up(&dummy_sem); /* released before the copy to user space, unlike the guard()-based mutex variant */
+
+ return simple_read_from_buffer(user_buf, count, ppos, dummy_string,
+ sizeof(dummy_string));
+}
+
+/*
+ * Read-write semaphore read function: read handler of the
+ * "rw_semaphore_read" debugfs file.
+ *
+ * Returns 0 without touching the lock when *ppos is already at or past the
+ * end of dummy_string. Otherwise it takes dummy_rwsem for reading, sleeps for
+ * up to SLEEP_SECOND seconds while holding it, drops it, and then copies
+ * dummy_string to user_buf via simple_read_from_buffer().
+ */
+static ssize_t read_dummy_rwsem_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ /* Check if data is already read: an offset at or past the end means nothing is left */
+ if (*ppos >= sizeof(dummy_string))
+ return 0;
+
+ /* Acquires read lock, allowing concurrent readers but blocks if write lock is held */
+ down_read(&dummy_rwsem);
+
+ /* Sleeps here, potentially triggering hung task detection if lock is held too long */
+ msleep_interruptible(SLEEP_SECOND * 1000);
+
+ up_read(&dummy_rwsem); /* read side released before the copy to user space */
+
+ return simple_read_from_buffer(user_buf, count, ppos, dummy_string,
+ sizeof(dummy_string));
+}
+
+/*
+ * Read-write semaphore write function: read handler of the
+ * "rw_semaphore_write" debugfs file. It is still a .read callback; "write"
+ * refers to the side of dummy_rwsem that it takes.
+ *
+ * Returns 0 without touching the lock when *ppos is already at or past the
+ * end of dummy_string. Otherwise it takes dummy_rwsem for writing, sleeps for
+ * up to SLEEP_SECOND seconds while holding it, drops it, and then copies
+ * dummy_string to user_buf via simple_read_from_buffer().
+ */
+static ssize_t read_dummy_rwsem_write(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ /* Check if data is already read: an offset at or past the end means nothing is left */
+ if (*ppos >= sizeof(dummy_string))
+ return 0;
+
+ /* Acquires exclusive write lock, blocking all other readers and writers */
+ down_write(&dummy_rwsem);
+
+ /* Sleeps here, potentially triggering hung task detection if lock is held too long */
+ msleep_interruptible(SLEEP_SECOND * 1000);
+
+ up_write(&dummy_rwsem); /* write side released before the copy to user space */
+
+ return simple_read_from_buffer(user_buf, count, ppos, dummy_string,
+ sizeof(dummy_string));
+}
+
+/* File operations for mutex: only .read is set, routed to read_dummy_mutex() */
+static const struct file_operations hung_task_mutex_fops = {
+ .read = read_dummy_mutex,
+};
+
+/* File operations for semaphore: only .read is set, routed to read_dummy_semaphore() */
+static const struct file_operations hung_task_sem_fops = {
+ .read = read_dummy_semaphore,
+};
+
+/* File operations for rw_semaphore read: only .read is set, routed to read_dummy_rwsem_read() */
+static const struct file_operations hung_task_rwsem_read_fops = {
+ .read = read_dummy_rwsem_read,
+};
+
+/*
+ * File operations for rw_semaphore write: the file is still accessed with
+ * read(); the handler takes the write side of the rw_semaphore.
+ */
+static const struct file_operations hung_task_rwsem_write_fops = {
+ .read = read_dummy_rwsem_write,
+};
+
+static int __init hung_task_tests_init(void)
+{
+ hung_task_dir = debugfs_create_dir(HUNG_TASK_DIR, NULL); /* NULL parent: no parent directory is given */
+ if (IS_ERR(hung_task_dir))
+ return PTR_ERR(hung_task_dir); /* module init fails with the directory-creation error */
+
+ /*
+  * Create debugfs files for the mutex, semaphore, rw_semaphore_read and
+  * rw_semaphore_write tests. Every file is created with mode 0400 and
+  * NULL private data; the return values of debugfs_create_file() are
+  * not checked, so init reports success once the directory exists.
+  */
+ debugfs_create_file(HUNG_TASK_MUTEX_FILE, 0400, hung_task_dir, NULL,
+ &hung_task_mutex_fops);
+ debugfs_create_file(HUNG_TASK_SEM_FILE, 0400, hung_task_dir, NULL,
+ &hung_task_sem_fops);
+ debugfs_create_file(HUNG_TASK_RWSEM_READ_FILE, 0400, hung_task_dir, NULL,
+ &hung_task_rwsem_read_fops);
+ debugfs_create_file(HUNG_TASK_RWSEM_WRITE_FILE, 0400, hung_task_dir, NULL,
+ &hung_task_rwsem_write_fops);
+
+ return 0;
+}
+
+static void __exit hung_task_tests_exit(void)
+{
+ debugfs_remove_recursive(hung_task_dir); /* removes the directory made in init together with the test files under it */
+}
+
+module_init(hung_task_tests_init);
+module_exit(hung_task_tests_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Masami Hiramatsu <mhiramat@kernel.org>");
+MODULE_AUTHOR("Zi Li <amaindex@outlook.com>");
+MODULE_DESCRIPTION("Simple sleep under lock files for testing hung task");
diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile
index 0f5c31c2fc47..ef4b6fdd7d9c 100644
--- a/samples/hw_breakpoint/Makefile
+++ b/samples/hw_breakpoint/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index ef7f32291852..fbb03b66dcbd 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* usage: insmod data_breakpoint.ko ksym=<ksym_name>
*
* This file is a kernel module that places a breakpoint over ksym_name kernel
@@ -34,9 +21,9 @@
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
-struct perf_event * __percpu *sample_hbp;
+static struct perf_event * __percpu *sample_hbp;
-static char ksym_name[KSYM_NAME_LEN] = "pid_max";
+static char ksym_name[KSYM_NAME_LEN] = "jiffies";
module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
" write operations on the kernel symbol");
@@ -54,15 +41,19 @@ static int __init hw_break_module_init(void)
{
int ret;
struct perf_event_attr attr;
+ void *addr = __symbol_get(ksym_name);
+
+ if (!addr)
+ return -ENXIO;
hw_breakpoint_init(&attr);
- attr.bp_addr = kallsyms_lookup_name(ksym_name);
+ attr.bp_addr = (unsigned long)addr;
attr.bp_len = HW_BREAKPOINT_LEN_4;
- attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ attr.bp_type = HW_BREAKPOINT_W;
sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
- if (IS_ERR((void __force *)sample_hbp)) {
- ret = PTR_ERR((void __force *)sample_hbp);
+ if (IS_ERR_PCPU(sample_hbp)) {
+ ret = PTR_ERR_PCPU(sample_hbp);
goto fail;
}
@@ -79,6 +70,9 @@ fail:
static void __exit hw_break_module_exit(void)
{
unregister_wide_hw_breakpoint(sample_hbp);
+#ifdef CONFIG_MODULE_UNLOAD
+ __symbol_put(ksym_name);
+#endif
printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
}
diff --git a/samples/kdb/Makefile b/samples/kdb/Makefile
index fbedf39d9356..947cb852279c 100644
--- a/samples/kdb/Makefile
+++ b/samples/kdb/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_SAMPLE_KDB) += kdb_hello.o
diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c
index c1c2fa0f62c2..82736e5a5e32 100644
--- a/samples/kdb/kdb_hello.c
+++ b/samples/kdb/kdb_hello.c
@@ -28,28 +28,26 @@ static int kdb_hello_cmd(int argc, const char **argv)
return 0;
}
+static kdbtab_t hello_cmd = {
+ .name = "hello", /* the name of the shell command */
+ .func = kdb_hello_cmd, /* the function that processes the command */
+ .usage = "[string]", /* description of the usage of any arguments */
+ .help = "Say Hello World or Hello [string]", /* descriptive text shown when you run help */
+};
static int __init kdb_hello_cmd_init(void)
{
/*
* Registration of a dynamically added kdb command is done with
- * kdb_register() with the arguments being:
- * 1: The name of the shell command
- * 2: The function that processes the command
- * 3: Description of the usage of any arguments
- * 4: Descriptive text when you run help
- * 5: Number of characters to complete the command
- * 0 == type the whole command
- * 1 == match both "g" and "go" for example
+ * kdb_register().
*/
- kdb_register("hello", kdb_hello_cmd, "[string]",
- "Say Hello World or Hello [string]", 0);
+ kdb_register(&hello_cmd);
return 0;
}
static void __exit kdb_hello_cmd_exit(void)
{
- kdb_unregister("hello");
+ kdb_unregister(&hello_cmd);
}
module_init(kdb_hello_cmd_init);
diff --git a/samples/kfifo/Makefile b/samples/kfifo/Makefile
index bcc9484a15b2..0af5250ad944 100644
--- a/samples/kfifo/Makefile
+++ b/samples/kfifo/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_SAMPLE_KFIFO) += bytestream-example.o dma-example.o inttype-example.o record-example.o
diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c
index cfe40addda76..4ae29a12cc8a 100644
--- a/samples/kfifo/bytestream-example.c
+++ b/samples/kfifo/bytestream-example.c
@@ -1,10 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Sample kfifo byte stream implementation
*
* Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
- *
- * Released under the GPL version 2 only.
- *
*/
#include <linux/init.h>
@@ -24,10 +22,10 @@
#define PROC_FIFO "bytestream-fifo"
/* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
/* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
/*
* define DYNAMIC in this example for a dynamically allocated fifo.
@@ -64,7 +62,7 @@ static int __init testfunc(void)
/* put values into the fifo */
for (i = 0; i != 10; i++)
- kfifo_put(&test, &i);
+ kfifo_put(&test, i);
/* show the number of used elements */
printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
@@ -85,7 +83,7 @@ static int __init testfunc(void)
kfifo_skip(&test);
/* put values into the fifo until is full */
- for (i = 20; kfifo_put(&test, &i); i++)
+ for (i = 20; kfifo_put(&test, i); i++)
;
printk(KERN_INFO "queue len: %u\n", kfifo_len(&test));
@@ -118,14 +116,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&write_lock))
+ if (mutex_lock_interruptible(&write_access))
return -ERESTARTSYS;
ret = kfifo_from_user(&test, buf, count, &copied);
- mutex_unlock(&write_lock);
+ mutex_unlock(&write_access);
+ if (ret)
+ return ret;
- return ret ? ret : copied;
+ return copied;
}
static ssize_t fifo_read(struct file *file, char __user *buf,
@@ -134,21 +134,22 @@ static ssize_t fifo_read(struct file *file, char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&read_lock))
+ if (mutex_lock_interruptible(&read_access))
return -ERESTARTSYS;
ret = kfifo_to_user(&test, buf, count, &copied);
- mutex_unlock(&read_lock);
+ mutex_unlock(&read_access);
+ if (ret)
+ return ret;
- return ret ? ret : copied;
+ return copied;
}
-static const struct file_operations fifo_fops = {
- .owner = THIS_MODULE,
- .read = fifo_read,
- .write = fifo_write,
- .llseek = noop_llseek,
+static const struct proc_ops fifo_proc_ops = {
+ .proc_read = fifo_read,
+ .proc_write = fifo_write,
+ .proc_lseek = noop_llseek,
};
static int __init example_init(void)
@@ -171,7 +172,7 @@ static int __init example_init(void)
return -EIO;
}
- if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) {
+ if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) {
#ifdef DYNAMIC
kfifo_free(&test);
#endif
@@ -190,5 +191,6 @@ static void __exit example_exit(void)
module_init(example_init);
module_exit(example_exit);
+MODULE_DESCRIPTION("Sample kfifo byte stream implementation");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c
index 06473791c08a..8076ac410161 100644
--- a/samples/kfifo/dma-example.c
+++ b/samples/kfifo/dma-example.c
@@ -1,15 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Sample fifo dma implementation
*
* Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
- *
- * Released under the GPL version 2 only.
- *
*/
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/kfifo.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
/*
* This module shows how to handle fifo dma operations.
@@ -39,7 +39,7 @@ static int __init example_init(void)
kfifo_in(&fifo, "test", 4);
for (i = 0; i != 9; i++)
- kfifo_put(&fifo, &i);
+ kfifo_put(&fifo, i);
/* kick away first byte */
kfifo_skip(&fifo);
@@ -75,8 +75,8 @@ static int __init example_init(void)
for (i = 0; i < nents; i++) {
printk(KERN_INFO
"sg[%d] -> "
- "page_link 0x%.8lx offset 0x%.8x length 0x%.8x\n",
- i, sg[i].page_link, sg[i].offset, sg[i].length);
+ "page %p offset 0x%.8x length 0x%.8x\n",
+ i, sg_page(&sg[i]), sg[i].offset, sg[i].length);
if (sg_is_last(&sg[i]))
break;
@@ -104,8 +104,8 @@ static int __init example_init(void)
for (i = 0; i < nents; i++) {
printk(KERN_INFO
"sg[%d] -> "
- "page_link 0x%.8lx offset 0x%.8x length 0x%.8x\n",
- i, sg[i].page_link, sg[i].offset, sg[i].length);
+ "page %p offset 0x%.8x length 0x%.8x\n",
+ i, sg_page(&sg[i]), sg[i].offset, sg[i].length);
if (sg_is_last(&sg[i]))
break;
@@ -139,5 +139,6 @@ static void __exit example_exit(void)
module_init(example_init);
module_exit(example_exit);
+MODULE_DESCRIPTION("Sample fifo dma implementation");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c
index 6f8e79e76c9e..e4f93317c5d0 100644
--- a/samples/kfifo/inttype-example.c
+++ b/samples/kfifo/inttype-example.c
@@ -1,10 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Sample kfifo int type implementation
*
* Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
- *
- * Released under the GPL version 2 only.
- *
*/
#include <linux/init.h>
@@ -24,10 +22,10 @@
#define PROC_FIFO "int-fifo"
/* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
/* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
/*
* define DYNAMIC in this example for a dynamically allocated fifo.
@@ -61,7 +59,7 @@ static int __init testfunc(void)
/* put values into the fifo */
for (i = 0; i != 10; i++)
- kfifo_put(&test, &i);
+ kfifo_put(&test, i);
/* show the number of used elements */
printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
@@ -78,7 +76,7 @@ static int __init testfunc(void)
kfifo_skip(&test);
/* put values into the fifo until is full */
- for (i = 20; kfifo_put(&test, &i); i++)
+ for (i = 20; kfifo_put(&test, i); i++)
;
printk(KERN_INFO "queue len: %u\n", kfifo_len(&test));
@@ -111,14 +109,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&write_lock))
+ if (mutex_lock_interruptible(&write_access))
return -ERESTARTSYS;
ret = kfifo_from_user(&test, buf, count, &copied);
- mutex_unlock(&write_lock);
+ mutex_unlock(&write_access);
+ if (ret)
+ return ret;
- return ret ? ret : copied;
+ return copied;
}
static ssize_t fifo_read(struct file *file, char __user *buf,
@@ -127,21 +127,22 @@ static ssize_t fifo_read(struct file *file, char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&read_lock))
+ if (mutex_lock_interruptible(&read_access))
return -ERESTARTSYS;
ret = kfifo_to_user(&test, buf, count, &copied);
- mutex_unlock(&read_lock);
+ mutex_unlock(&read_access);
+ if (ret)
+ return ret;
- return ret ? ret : copied;
+ return copied;
}
-static const struct file_operations fifo_fops = {
- .owner = THIS_MODULE,
- .read = fifo_read,
- .write = fifo_write,
- .llseek = noop_llseek,
+static const struct proc_ops fifo_proc_ops = {
+ .proc_read = fifo_read,
+ .proc_write = fifo_write,
+ .proc_lseek = noop_llseek,
};
static int __init example_init(void)
@@ -162,7 +163,7 @@ static int __init example_init(void)
return -EIO;
}
- if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) {
+ if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) {
#ifdef DYNAMIC
kfifo_free(&test);
#endif
@@ -181,5 +182,6 @@ static void __exit example_exit(void)
module_init(example_init);
module_exit(example_exit);
+MODULE_DESCRIPTION("Sample kfifo int type implementation");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c
index 2d7529eeb294..e4d1a2d7983c 100644
--- a/samples/kfifo/record-example.c
+++ b/samples/kfifo/record-example.c
@@ -1,10 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Sample dynamic sized record fifo implementation
*
* Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
- *
- * Released under the GPL version 2 only.
- *
*/
#include <linux/init.h>
@@ -24,10 +22,10 @@
#define PROC_FIFO "record-fifo"
/* lock for procfs read access */
-static DEFINE_MUTEX(read_lock);
+static DEFINE_MUTEX(read_access);
/* lock for procfs write access */
-static DEFINE_MUTEX(write_lock);
+static DEFINE_MUTEX(write_access);
/*
* define DYNAMIC in this example for a dynamically allocated fifo.
@@ -125,14 +123,16 @@ static ssize_t fifo_write(struct file *file, const char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&write_lock))
+ if (mutex_lock_interruptible(&write_access))
return -ERESTARTSYS;
ret = kfifo_from_user(&test, buf, count, &copied);
- mutex_unlock(&write_lock);
+ mutex_unlock(&write_access);
+ if (ret)
+ return ret;
- return ret ? ret : copied;
+ return copied;
}
static ssize_t fifo_read(struct file *file, char __user *buf,
@@ -141,21 +141,22 @@ static ssize_t fifo_read(struct file *file, char __user *buf,
int ret;
unsigned int copied;
- if (mutex_lock_interruptible(&read_lock))
+ if (mutex_lock_interruptible(&read_access))
return -ERESTARTSYS;
ret = kfifo_to_user(&test, buf, count, &copied);
- mutex_unlock(&read_lock);
+ mutex_unlock(&read_access);
+ if (ret)
+ return ret;
- return ret ? ret : copied;
+ return copied;
}
-static const struct file_operations fifo_fops = {
- .owner = THIS_MODULE,
- .read = fifo_read,
- .write = fifo_write,
- .llseek = noop_llseek,
+static const struct proc_ops fifo_proc_ops = {
+ .proc_read = fifo_read,
+ .proc_write = fifo_write,
+ .proc_lseek = noop_llseek,
};
static int __init example_init(void)
@@ -178,7 +179,7 @@ static int __init example_init(void)
return -EIO;
}
- if (proc_create(PROC_FIFO, 0, NULL, &fifo_fops) == NULL) {
+ if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) {
#ifdef DYNAMIC
kfifo_free(&test);
#endif
@@ -197,5 +198,6 @@ static void __exit example_exit(void)
module_init(example_init);
module_exit(example_exit);
+MODULE_DESCRIPTION("Sample dynamic sized record fifo implementation");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kmemleak/Makefile b/samples/kmemleak/Makefile
new file mode 100644
index 000000000000..8a999ab43b6d
--- /dev/null
+++ b/samples/kmemleak/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_SAMPLE_KMEMLEAK) += kmemleak-test.o
diff --git a/samples/kmemleak/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c
new file mode 100644
index 000000000000..8609812a37eb
--- /dev/null
+++ b/samples/kmemleak/kmemleak-test.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * samples/kmemleak/kmemleak-test.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ */
+
+#define pr_fmt(fmt) "kmemleak: " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/fdtable.h>
+
+#include <linux/kmemleak.h>
+
+struct test_node {
+ long header[25]; /* never accessed in this file; presumably padding in front of the list head -- TODO confirm */
+ struct list_head list; /* links the node into test_list */
+ long footer[25]; /* never accessed in this file; presumably padding behind the list head -- TODO confirm */
+};
+
+static LIST_HEAD(test_list); /* filled in kmemleak_test_init(), emptied (without freeing) in kmemleak_test_exit() */
+static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* one slot per CPU; each gets a kmalloc(129) pointer in init */
+
+/*
+ * Some very simple testing. This function needs to be extended for
+ * proper testing.
+ *
+ * Module init: performs a series of allocations on purpose without ever
+ * freeing them. Most returned pointers are only printed (raw, via %px) and
+ * not stored anywhere; ten test_node objects are linked into test_list and
+ * one object per possible CPU is stored in kmemleak_test_pointer.
+ *
+ * Returns 0 on success, or -ENOMEM as soon as a test_node allocation fails
+ * (earlier allocations are left in place). Failures of the other allocations
+ * are not checked; a NULL result is simply printed.
+ */
+static int kmemleak_test_init(void)
+{
+ struct test_node *elem;
+ int i;
+
+ pr_info("Kmemleak testing\n");
+
+ /* make some orphan objects: each pointer is passed to pr_info() and then dropped */
+ pr_info("kmalloc(32) = 0x%px\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmalloc(32) = 0x%px\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmalloc(1024) = 0x%px\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmalloc(1024) = 0x%px\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmalloc(2048) = 0x%px\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmalloc(2048) = 0x%px\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmalloc(4096) = 0x%px\n", kmalloc(4096, GFP_KERNEL));
+ pr_info("kmalloc(4096) = 0x%px\n", kmalloc(4096, GFP_KERNEL));
+#ifndef CONFIG_MODULES
+ pr_info("kmem_cache_alloc(files_cachep) = 0x%px\n",
+ kmem_cache_alloc(files_cachep, GFP_KERNEL));
+ pr_info("kmem_cache_alloc(files_cachep) = 0x%px\n",
+ kmem_cache_alloc(files_cachep, GFP_KERNEL));
+#endif /* !CONFIG_MODULES -- NOTE(review): presumably files_cachep is not usable from modules; confirm */
+ pr_info("vmalloc(64) = 0x%px\n", vmalloc(64));
+ pr_info("vmalloc(64) = 0x%px\n", vmalloc(64));
+ pr_info("vmalloc(64) = 0x%px\n", vmalloc(64));
+ pr_info("vmalloc(64) = 0x%px\n", vmalloc(64));
+ pr_info("vmalloc(64) = 0x%px\n", vmalloc(64));
+
+ /*
+  * Add elements to a list. They should only appear as orphan
+  * after the module is removed.
+  * The pointer is printed before the NULL check, so a failed
+  * allocation is logged as well before init bails out.
+  */
+ for (i = 0; i < 10; i++) {
+ elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+ pr_info("kzalloc(sizeof(*elem)) = 0x%px\n", elem);
+ if (!elem)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&elem->list);
+ list_add_tail(&elem->list, &test_list);
+ }
+
+ for_each_possible_cpu(i) { /* one 129-byte object per possible CPU, kept only in the per-CPU variable */
+ per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
+ pr_info("kmalloc(129) = 0x%px\n",
+ per_cpu(kmemleak_test_pointer, i));
+ }
+
+ pr_info("__alloc_percpu(64, 4) = 0x%px\n", __alloc_percpu(64, 4)); /* per-CPU allocation whose pointer is only printed */
+
+ return 0;
+}
+module_init(kmemleak_test_init);
+
+static void __exit kmemleak_test_exit(void)
+{
+ struct test_node *elem, *tmp;
+
+ /*
+  * Remove the list elements without actually freeing the
+  * memory.
+  * Only list_del() is called: the nodes are unlinked from test_list
+  * and never passed to kfree(). The _safe iterator is needed because
+  * the current entry is unlinked while walking the list.
+  */
+ list_for_each_entry_safe(elem, tmp, &test_list, list)
+ list_del(&elem->list);
+}
+module_exit(kmemleak_test_exit);
+
+MODULE_DESCRIPTION("Sample module to leak memory for kmemleak testing");
+MODULE_LICENSE("GPL");
diff --git a/samples/kobject/Makefile b/samples/kobject/Makefile
index 4a194203c982..bb5d2199742b 100644
--- a/samples/kobject/Makefile
+++ b/samples/kobject/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_SAMPLE_KOBJECT) += kobject-example.o kset-example.o
diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c
index 86ea0c3ad975..36d87ca0bee2 100644
--- a/samples/kobject/kobject-example.c
+++ b/samples/kobject/kobject-example.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Sample kobject implementation
*
* Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (C) 2007 Novell Inc.
- *
- * Released under the GPL version 2 only.
- *
*/
#include <linux/kobject.h>
#include <linux/string.h>
@@ -15,7 +13,7 @@
/*
* This module shows how to create a simple subdirectory in sysfs called
- * /sys/kernel/kobject-example In that directory, 3 files are created:
+ * /sys/kernel/kobject_example In that directory, 3 files are created:
* "foo", "baz", and "bar". If an integer is written to these files, it can be
* later read out of it.
*/
@@ -30,18 +28,24 @@ static int bar;
static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", foo);
+ return sysfs_emit(buf, "%d\n", foo);
}
static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
- sscanf(buf, "%du", &foo);
+ int ret;
+
+ ret = kstrtoint(buf, 10, &foo);
+ if (ret < 0)
+ return ret;
+
return count;
}
+/* Sysfs attributes cannot be world-writable. */
static struct kobj_attribute foo_attribute =
- __ATTR(foo, 0666, foo_show, foo_store);
+ __ATTR(foo, 0664, foo_show, foo_store);
/*
* More complex function where we determine which variable is being accessed by
@@ -56,15 +60,18 @@ static ssize_t b_show(struct kobject *kobj, struct kobj_attribute *attr,
var = baz;
else
var = bar;
- return sprintf(buf, "%d\n", var);
+ return sysfs_emit(buf, "%d\n", var);
}
static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
- int var;
+ int var, ret;
+
+ ret = kstrtoint(buf, 10, &var);
+ if (ret < 0)
+ return ret;
- sscanf(buf, "%du", &var);
if (strcmp(attr->attr.name, "baz") == 0)
baz = var;
else
@@ -73,9 +80,9 @@ static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
}
static struct kobj_attribute baz_attribute =
- __ATTR(baz, 0666, b_show, b_store);
+ __ATTR(baz, 0664, b_show, b_store);
static struct kobj_attribute bar_attribute =
- __ATTR(bar, 0666, b_show, b_store);
+ __ATTR(bar, 0664, b_show, b_store);
/*
@@ -95,7 +102,7 @@ static struct attribute *attrs[] = {
* created for the attributes with the directory being the name of the
* attribute group.
*/
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
.attrs = attrs,
};
@@ -133,5 +140,6 @@ static void __exit example_exit(void)
module_init(example_init);
module_exit(example_exit);
-MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Sample kobject implementation");
+MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>");
diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c
index d0c687fd9802..d0103904e5dd 100644
--- a/samples/kobject/kset-example.c
+++ b/samples/kobject/kset-example.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Sample kset and ktype implementation
*
* Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (C) 2007 Novell Inc.
- *
- * Released under the GPL version 2 only.
- *
*/
#include <linux/kobject.h>
#include <linux/string.h>
@@ -16,8 +14,8 @@
/*
* This module shows how to create a kset in sysfs called
- * /sys/kernel/kset-example
- * Then tree kobjects are created and assigned to this kset, "foo", "baz",
+ * /sys/kernel/kset_example
+ * Then three kobjects are created and assigned to this kset, "foo", "baz",
* and "bar". In those kobjects, attributes of the same name are also
* created and if an integer is written to these files, it can be later
* read out of it.
@@ -39,10 +37,11 @@ struct foo_obj {
/* a custom attribute that works just for a struct foo_obj. */
struct foo_attribute {
struct attribute attr;
- ssize_t (*show)(struct foo_obj *foo, struct foo_attribute *attr, char *buf);
- ssize_t (*store)(struct foo_obj *foo, struct foo_attribute *attr, const char *buf, size_t count);
+ ssize_t (*show)(struct foo_obj *foo, const struct foo_attribute *attr, char *buf);
+ ssize_t (*store)(struct foo_obj *foo, const struct foo_attribute *attr,
+ const char *buf, size_t count);
};
-#define to_foo_attr(x) container_of(x, struct foo_attribute, attr)
+#define to_foo_attr(x) container_of_const(x, struct foo_attribute, attr)
/*
* The default show function that must be passed to sysfs. This will be
@@ -55,7 +54,7 @@ static ssize_t foo_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
- struct foo_attribute *attribute;
+ const struct foo_attribute *attribute;
struct foo_obj *foo;
attribute = to_foo_attr(attr);
@@ -75,7 +74,7 @@ static ssize_t foo_attr_store(struct kobject *kobj,
struct attribute *attr,
const char *buf, size_t len)
{
- struct foo_attribute *attribute;
+ const struct foo_attribute *attribute;
struct foo_obj *foo;
attribute = to_foo_attr(attr);
@@ -111,27 +110,33 @@ static void foo_release(struct kobject *kobj)
/*
* The "foo" file where the .foo variable is read from and written to.
*/
-static ssize_t foo_show(struct foo_obj *foo_obj, struct foo_attribute *attr,
+static ssize_t foo_show(struct foo_obj *foo_obj, const struct foo_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", foo_obj->foo);
+ return sysfs_emit(buf, "%d\n", foo_obj->foo);
}
-static ssize_t foo_store(struct foo_obj *foo_obj, struct foo_attribute *attr,
+static ssize_t foo_store(struct foo_obj *foo_obj, const struct foo_attribute *attr,
const char *buf, size_t count)
{
- sscanf(buf, "%du", &foo_obj->foo);
+ int ret;
+
+ ret = kstrtoint(buf, 10, &foo_obj->foo);
+ if (ret < 0)
+ return ret;
+
return count;
}
-static struct foo_attribute foo_attribute =
- __ATTR(foo, 0666, foo_show, foo_store);
+/* Sysfs attributes cannot be world-writable. */
+static const struct foo_attribute foo_attribute =
+ __ATTR(foo, 0664, foo_show, foo_store);
/*
* More complex function where we determine which variable is being accessed by
* looking at the attribute for the "baz" and "bar" files.
*/
-static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr,
+static ssize_t b_show(struct foo_obj *foo_obj, const struct foo_attribute *attr,
char *buf)
{
int var;
@@ -140,15 +145,18 @@ static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr,
var = foo_obj->baz;
else
var = foo_obj->bar;
- return sprintf(buf, "%d\n", var);
+ return sysfs_emit(buf, "%d\n", var);
}
-static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr,
+static ssize_t b_store(struct foo_obj *foo_obj, const struct foo_attribute *attr,
const char *buf, size_t count)
{
- int var;
+ int var, ret;
+
+ ret = kstrtoint(buf, 10, &var);
+ if (ret < 0)
+ return ret;
- sscanf(buf, "%du", &var);
if (strcmp(attr->attr.name, "baz") == 0)
foo_obj->baz = var;
else
@@ -156,31 +164,47 @@ static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr,
return count;
}
-static struct foo_attribute baz_attribute =
- __ATTR(baz, 0666, b_show, b_store);
-static struct foo_attribute bar_attribute =
- __ATTR(bar, 0666, b_show, b_store);
+static const struct foo_attribute baz_attribute =
+ __ATTR(baz, 0664, b_show, b_store);
+static const struct foo_attribute bar_attribute =
+ __ATTR(bar, 0664, b_show, b_store);
/*
* Create a group of attributes so that we can create and destroy them all
* at once.
*/
-static struct attribute *foo_default_attrs[] = {
+static const struct attribute *const foo_default_attrs[] = {
&foo_attribute.attr,
&baz_attribute.attr,
&bar_attribute.attr,
NULL, /* need to NULL terminate the list of attributes */
};
+static umode_t foo_default_attrs_is_visible(struct kobject *kobj,
+ const struct attribute *attr,
+ int n)
+{
+ /*
+  * Hide attributes with the same name as the kobject.
+  * Visibility callback of foo_default_group: returns 0 when the
+  * attribute name equals kobject_name(kobj), otherwise the attribute's
+  * own mode unchanged. The index argument n is not used.
+  */
+ if (strcmp(kobject_name(kobj), attr->name) == 0)
+ return 0;
+ return attr->mode;
+}
+
+static const struct attribute_group foo_default_group = {
+ .attrs_const = foo_default_attrs, /* the NULL-terminated foo, baz and bar attribute list */
+ .is_visible_const = foo_default_attrs_is_visible, /* per-attribute filter: hides the attribute named like the kobject */
+};
+__ATTRIBUTE_GROUPS(foo_default); /* presumably emits foo_default_groups, which foo_ktype references -- confirm against sysfs.h */
+
/*
* Our own ktype for our kobjects. Here we specify our sysfs ops, the
* release function, and the set of default attributes we want created
* whenever a kobject of this type is registered with the kernel.
*/
-static struct kobj_type foo_ktype = {
+static const struct kobj_type foo_ktype = {
.sysfs_ops = &foo_sysfs_ops,
.release = foo_release,
- .default_attrs = foo_default_attrs,
+ .default_groups = foo_default_groups,
};
static struct kset *example_kset;
@@ -262,6 +286,7 @@ baz_error:
bar_error:
destroy_foo_obj(foo_obj);
foo_error:
+ kset_unregister(example_kset);
return -EINVAL;
}
@@ -275,5 +300,6 @@ static void __exit example_exit(void)
module_init(example_init);
module_exit(example_exit);
-MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Sample kset and ktype implementation");
+MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>");
diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile
index 68739bc4fc6a..e774592718d6 100644
--- a/samples/kprobes/Makefile
+++ b/samples/kprobes/Makefile
@@ -1,5 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
# builds the kprobes example kernel modules;
# then to use one (as root): insmod <module_name.ko>
-obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o jprobe_example.o
+obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o
obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o
diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c
deleted file mode 100644
index b7541355b92b..000000000000
--- a/samples/kprobes/jprobe_example.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Here's a sample kernel module showing the use of jprobes to dump
- * the arguments of do_fork().
- *
- * For more information on theory of operation of jprobes, see
- * Documentation/kprobes.txt
- *
- * Build and insert the kernel module as done in the kprobe example.
- * You will see the trace data in /var/log/messages and on the
- * console whenever do_fork() is invoked to create a new process.
- * (Some messages may be suppressed if syslogd is configured to
- * eliminate duplicate messages.)
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-
-/*
- * Jumper probe for do_fork.
- * Mirror principle enables access to arguments of the probed routine
- * from the probe handler.
- */
-
-/* Proxy routine having the same arguments as actual do_fork() routine */
-static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
- struct pt_regs *regs, unsigned long stack_size,
- int __user *parent_tidptr, int __user *child_tidptr)
-{
- printk(KERN_INFO "jprobe: clone_flags = 0x%lx, stack_size = 0x%lx,"
- " regs = 0x%p\n",
- clone_flags, stack_size, regs);
-
- /* Always end with a call to jprobe_return(). */
- jprobe_return();
- return 0;
-}
-
-static struct jprobe my_jprobe = {
- .entry = jdo_fork,
- .kp = {
- .symbol_name = "do_fork",
- },
-};
-
-static int __init jprobe_init(void)
-{
- int ret;
-
- ret = register_jprobe(&my_jprobe);
- if (ret < 0) {
- printk(KERN_INFO "register_jprobe failed, returned %d\n", ret);
- return -1;
- }
- printk(KERN_INFO "Planted jprobe at %p, handler addr %p\n",
- my_jprobe.kp.addr, my_jprobe.entry);
- return 0;
-}
-
-static void __exit jprobe_exit(void)
-{
- unregister_jprobe(&my_jprobe);
- printk(KERN_INFO "jprobe at %p unregistered\n", my_jprobe.kp.addr);
-}
-
-module_init(jprobe_init)
-module_exit(jprobe_exit)
-MODULE_LICENSE("GPL");
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c
index ebf5e0c368ea..53ec6c8b8c40 100644
--- a/samples/kprobes/kprobe_example.c
+++ b/samples/kprobes/kprobe_example.c
@@ -1,41 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * NOTE: This example is works on x86 and powerpc.
* Here's a sample kernel module showing the use of kprobes to dump a
- * stack trace and selected registers when do_fork() is called.
+ * stack trace and selected registers when kernel_clone() is called.
*
* For more information on theory of operation of kprobes, see
- * Documentation/kprobes.txt
+ * Documentation/trace/kprobes.rst
*
* You will see the trace data in /var/log/messages and on the console
- * whenever do_fork() is invoked to create a new process.
+ * whenever kernel_clone() is invoked to create a new process.
*/
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
+static char symbol[KSYM_NAME_LEN] = "kernel_clone";
+module_param_string(symbol, symbol, KSYM_NAME_LEN, 0644);
+
/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
- .symbol_name = "do_fork",
+ .symbol_name = symbol,
};
/* kprobe pre_handler: called just before the probed instruction is executed */
-static int handler_pre(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86
- printk(KERN_INFO "pre_handler: p->addr = 0x%p, ip = %lx,"
- " flags = 0x%lx\n",
- p->addr, regs->ip, regs->flags);
+ pr_info("<%s> p->addr = 0x%p, ip = %lx, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->ip, regs->flags);
#endif
#ifdef CONFIG_PPC
- printk(KERN_INFO "pre_handler: p->addr = 0x%p, nip = 0x%lx,"
- " msr = 0x%lx\n",
- p->addr, regs->nip, regs->msr);
+ pr_info("<%s> p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n",
+ p->symbol_name, p->addr, regs->nip, regs->msr);
#endif
#ifdef CONFIG_MIPS
- printk(KERN_INFO "pre_handler: p->addr = 0x%p, epc = 0x%lx,"
- " status = 0x%lx\n",
- p->addr, regs->cp0_epc, regs->cp0_status);
+ pr_info("<%s> p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n",
+ p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status);
+#endif
+#ifdef CONFIG_ARM64
+ pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, pstate = 0x%lx\n",
+ p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
+#endif
+#ifdef CONFIG_ARM
+ pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, cpsr = 0x%lx\n",
+ p->symbol_name, p->addr, (long)regs->ARM_pc, (long)regs->ARM_cpsr);
+#endif
+#ifdef CONFIG_RISCV
+ pr_info("<%s> p->addr = 0x%p, pc = 0x%lx, status = 0x%lx\n",
+ p->symbol_name, p->addr, regs->epc, regs->status);
+#endif
+#ifdef CONFIG_S390
+ pr_info("<%s> p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->psw.addr, regs->flags);
+#endif
+#ifdef CONFIG_LOONGARCH
+ pr_info("<%s> p->addr = 0x%p, era = 0x%lx, estat = 0x%lx\n",
+ p->symbol_name, p->addr, regs->csr_era, regs->csr_estat);
#endif
/* A dump_stack() here will give a stack backtrace */
@@ -43,34 +65,41 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs)
}
/* kprobe post_handler: called after the probed instruction is executed */
-static void handler_post(struct kprobe *p, struct pt_regs *regs,
+static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
#ifdef CONFIG_X86
- printk(KERN_INFO "post_handler: p->addr = 0x%p, flags = 0x%lx\n",
- p->addr, regs->flags);
+ pr_info("<%s> p->addr = 0x%p, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->flags);
#endif
#ifdef CONFIG_PPC
- printk(KERN_INFO "post_handler: p->addr = 0x%p, msr = 0x%lx\n",
- p->addr, regs->msr);
+ pr_info("<%s> p->addr = 0x%p, msr = 0x%lx\n",
+ p->symbol_name, p->addr, regs->msr);
#endif
#ifdef CONFIG_MIPS
- printk(KERN_INFO "post_handler: p->addr = 0x%p, status = 0x%lx\n",
- p->addr, regs->cp0_status);
+ pr_info("<%s> p->addr = 0x%p, status = 0x%lx\n",
+ p->symbol_name, p->addr, regs->cp0_status);
+#endif
+#ifdef CONFIG_ARM64
+ pr_info("<%s> p->addr = 0x%p, pstate = 0x%lx\n",
+ p->symbol_name, p->addr, (long)regs->pstate);
+#endif
+#ifdef CONFIG_ARM
+ pr_info("<%s> p->addr = 0x%p, cpsr = 0x%lx\n",
+ p->symbol_name, p->addr, (long)regs->ARM_cpsr);
+#endif
+#ifdef CONFIG_RISCV
+ pr_info("<%s> p->addr = 0x%p, status = 0x%lx\n",
+ p->symbol_name, p->addr, regs->status);
+#endif
+#ifdef CONFIG_S390
+ pr_info("<%s> p->addr, 0x%p, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->flags);
+#endif
+#ifdef CONFIG_LOONGARCH
+ pr_info("<%s> p->addr = 0x%p, estat = 0x%lx\n",
+ p->symbol_name, p->addr, regs->csr_estat);
#endif
-}
-
-/*
- * fault_handler: this is called if an exception is generated for any
- * instruction within the pre- or post-handler, or when Kprobes
- * single-steps the probed instruction.
- */
-static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
-{
- printk(KERN_INFO "fault_handler: p->addr = 0x%p, trap #%dn",
- p->addr, trapnr);
- /* Return 0 because we don't handle the fault. */
- return 0;
}
static int __init kprobe_init(void)
@@ -78,23 +107,23 @@ static int __init kprobe_init(void)
int ret;
kp.pre_handler = handler_pre;
kp.post_handler = handler_post;
- kp.fault_handler = handler_fault;
ret = register_kprobe(&kp);
if (ret < 0) {
- printk(KERN_INFO "register_kprobe failed, returned %d\n", ret);
+ pr_err("register_kprobe failed, returned %d\n", ret);
return ret;
}
- printk(KERN_INFO "Planted kprobe at %p\n", kp.addr);
+ pr_info("Planted kprobe at %p\n", kp.addr);
return 0;
}
static void __exit kprobe_exit(void)
{
unregister_kprobe(&kp);
- printk(KERN_INFO "kprobe at %p unregistered\n", kp.addr);
+ pr_info("kprobe at %p unregistered\n", kp.addr);
}
module_init(kprobe_init)
module_exit(kprobe_exit)
+MODULE_DESCRIPTION("sample kernel module showing the use of kprobes");
MODULE_LICENSE("GPL");
diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c
index 1041b6731598..65d6dcafd742 100644
--- a/samples/kprobes/kretprobe_example.c
+++ b/samples/kprobes/kretprobe_example.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kretprobe_example.c
*
@@ -7,10 +8,10 @@
*
* usage: insmod kretprobe_example.ko func=<func_name>
*
- * If no func_name is specified, do_fork is instrumented
+ * If no func_name is specified, kernel_clone is instrumented
*
* For more information on theory of operation of kretprobes, see
- * Documentation/kprobes.txt
+ * Documentation/trace/kprobes.rst
*
* Build and insert the kernel module as done in the kprobe example.
* You will see the trace data in /var/log/messages and on the console
@@ -22,11 +23,10 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>
-#include <linux/limits.h>
#include <linux/sched.h>
-static char func_name[NAME_MAX] = "do_fork";
-module_param_string(func, func_name, NAME_MAX, S_IRUGO);
+static char func_name[KSYM_NAME_LEN] = "kernel_clone";
+module_param_string(func, func_name, KSYM_NAME_LEN, 0644);
MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the"
" function's execution time");
@@ -35,7 +35,7 @@ struct my_data {
ktime_t entry_stamp;
};
-/* Here we use the entry_hanlder to timestamp function entry */
+/* Here we use the entry_handler to timestamp function entry */
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct my_data *data;
@@ -47,6 +47,7 @@ static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
data->entry_stamp = ktime_get();
return 0;
}
+NOKPROBE_SYMBOL(entry_handler);
/*
* Return-probe handler: Log the return value and duration. Duration may turn
@@ -55,17 +56,18 @@ static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
*/
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
- int retval = regs_return_value(regs);
+ unsigned long retval = regs_return_value(regs);
struct my_data *data = (struct my_data *)ri->data;
s64 delta;
ktime_t now;
now = ktime_get();
delta = ktime_to_ns(ktime_sub(now, data->entry_stamp));
- printk(KERN_INFO "%s returned %d and took %lld ns to execute\n",
+ pr_info("%s returned %lu and took %lld ns to execute\n",
func_name, retval, (long long)delta);
return 0;
}
+NOKPROBE_SYMBOL(ret_handler);
static struct kretprobe my_kretprobe = {
.handler = ret_handler,
@@ -82,11 +84,10 @@ static int __init kretprobe_init(void)
my_kretprobe.kp.symbol_name = func_name;
ret = register_kretprobe(&my_kretprobe);
if (ret < 0) {
- printk(KERN_INFO "register_kretprobe failed, returned %d\n",
- ret);
- return -1;
+ pr_err("register_kretprobe failed, returned %d\n", ret);
+ return ret;
}
- printk(KERN_INFO "Planted return probe at %s: %p\n",
+ pr_info("Planted return probe at %s: %p\n",
my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);
return 0;
}
@@ -94,14 +95,14 @@ static int __init kretprobe_init(void)
static void __exit kretprobe_exit(void)
{
unregister_kretprobe(&my_kretprobe);
- printk(KERN_INFO "kretprobe at %p unregistered\n",
- my_kretprobe.kp.addr);
+ pr_info("kretprobe at %p unregistered\n", my_kretprobe.kp.addr);
/* nmissed > 0 suggests that maxactive was set too low. */
- printk(KERN_INFO "Missed probing %d instances of %s\n",
+ pr_info("Missed probing %d instances of %s\n",
my_kretprobe.nmissed, my_kretprobe.kp.symbol_name);
}
module_init(kretprobe_init)
module_exit(kretprobe_exit)
+MODULE_DESCRIPTION("sample kernel module showing the use of return probes");
MODULE_LICENSE("GPL");
diff --git a/samples/landlock/.gitignore b/samples/landlock/.gitignore
new file mode 100644
index 000000000000..f43668b2d318
--- /dev/null
+++ b/samples/landlock/.gitignore
@@ -0,0 +1 @@
+/sandboxer
diff --git a/samples/landlock/Makefile b/samples/landlock/Makefile
new file mode 100644
index 000000000000..5d601e51c2eb
--- /dev/null
+++ b/samples/landlock/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+userprogs-always-y := sandboxer
+
+userccflags += -I usr/include
+
+.PHONY: all clean
+
+all:
+ $(MAKE) -C ../.. samples/landlock/
+
+clean:
+ $(MAKE) -C ../.. M=samples/landlock/ clean
diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c
new file mode 100644
index 000000000000..e7af02f98208
--- /dev/null
+++ b/samples/landlock/sandboxer.c
@@ -0,0 +1,539 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * Simple Landlock sandbox manager able to execute a process restricted by
+ * user-defined file system and network access control policies.
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2020 ANSSI
+ */
+
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/landlock.h>
+#include <linux/socket.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#if defined(__GLIBC__)
+#include <linux/prctl.h>
+#endif
+
+#ifndef landlock_create_ruleset
+static inline int
+landlock_create_ruleset(const struct landlock_ruleset_attr *const attr,
+ const size_t size, const __u32 flags)
+{
+ return syscall(__NR_landlock_create_ruleset, attr, size, flags);
+}
+#endif
+
+#ifndef landlock_add_rule
+static inline int landlock_add_rule(const int ruleset_fd,
+ const enum landlock_rule_type rule_type,
+ const void *const rule_attr,
+ const __u32 flags)
+{
+ return syscall(__NR_landlock_add_rule, ruleset_fd, rule_type, rule_attr,
+ flags);
+}
+#endif
+
+#ifndef landlock_restrict_self
+static inline int landlock_restrict_self(const int ruleset_fd,
+ const __u32 flags)
+{
+ return syscall(__NR_landlock_restrict_self, ruleset_fd, flags);
+}
+#endif
+
+#define ENV_FS_RO_NAME "LL_FS_RO"
+#define ENV_FS_RW_NAME "LL_FS_RW"
+#define ENV_TCP_BIND_NAME "LL_TCP_BIND"
+#define ENV_TCP_CONNECT_NAME "LL_TCP_CONNECT"
+#define ENV_SCOPED_NAME "LL_SCOPED"
+#define ENV_FORCE_LOG_NAME "LL_FORCE_LOG"
+#define ENV_DELIMITER ":"
+
+static int str2num(const char *numstr, __u64 *num_dst)
+{
+ char *endptr = NULL;
+ int err = 0;
+ __u64 num;
+
+ errno = 0;
+ num = strtoull(numstr, &endptr, 10);
+ if (errno != 0)
+ err = errno;
+ /* Was the string empty, or not entirely parsed successfully? */
+ else if ((*numstr == '\0') || (*endptr != '\0'))
+ err = EINVAL;
+ else
+ *num_dst = num;
+
+ return err;
+}
+
+static int parse_path(char *env_path, const char ***const path_list)
+{
+ int i, num_paths = 0;
+
+ if (env_path) {
+ num_paths++;
+ for (i = 0; env_path[i]; i++) {
+ if (env_path[i] == ENV_DELIMITER[0])
+ num_paths++;
+ }
+ }
+ *path_list = malloc(num_paths * sizeof(**path_list));
+ if (!*path_list)
+ return -1;
+
+ for (i = 0; i < num_paths; i++)
+ (*path_list)[i] = strsep(&env_path, ENV_DELIMITER);
+
+ return num_paths;
+}
+
+/* clang-format off */
+
+#define ACCESS_FILE ( \
+ LANDLOCK_ACCESS_FS_EXECUTE | \
+ LANDLOCK_ACCESS_FS_WRITE_FILE | \
+ LANDLOCK_ACCESS_FS_READ_FILE | \
+ LANDLOCK_ACCESS_FS_TRUNCATE | \
+ LANDLOCK_ACCESS_FS_IOCTL_DEV)
+
+/* clang-format on */
+
+static int populate_ruleset_fs(const char *const env_var, const int ruleset_fd,
+ const __u64 allowed_access)
+{
+ int num_paths, i, ret = 1;
+ char *env_path_name;
+ const char **path_list = NULL;
+ struct landlock_path_beneath_attr path_beneath = {
+ .parent_fd = -1,
+ };
+
+ env_path_name = getenv(env_var);
+ if (!env_path_name) {
+ /* Prevents users from forgetting a setting. */
+ fprintf(stderr, "Missing environment variable %s\n", env_var);
+ return 1;
+ }
+ env_path_name = strdup(env_path_name);
+ unsetenv(env_var);
+ num_paths = parse_path(env_path_name, &path_list);
+ if (num_paths < 0) {
+ fprintf(stderr, "Failed to allocate memory\n");
+ goto out_free_name;
+ }
+ if (num_paths == 1 && path_list[0][0] == '\0') {
+ /*
+ * An empty value adds no rule, so not every restriction has to
+ * be used (e.g. use LL_FS_RO without LL_FS_RW).
+ */
+ ret = 0;
+ goto out_free_name;
+ }
+
+ for (i = 0; i < num_paths; i++) {
+ struct stat statbuf;
+
+ path_beneath.parent_fd = open(path_list[i], O_PATH | O_CLOEXEC);
+ if (path_beneath.parent_fd < 0) {
+ fprintf(stderr, "Failed to open \"%s\": %s\n",
+ path_list[i], strerror(errno));
+ continue;
+ }
+ if (fstat(path_beneath.parent_fd, &statbuf)) {
+ fprintf(stderr, "Failed to stat \"%s\": %s\n",
+ path_list[i], strerror(errno));
+ close(path_beneath.parent_fd);
+ goto out_free_name;
+ }
+ path_beneath.allowed_access = allowed_access;
+ if (!S_ISDIR(statbuf.st_mode))
+ path_beneath.allowed_access &= ACCESS_FILE;
+ if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+ &path_beneath, 0)) {
+ fprintf(stderr,
+ "Failed to update the ruleset with \"%s\": %s\n",
+ path_list[i], strerror(errno));
+ close(path_beneath.parent_fd);
+ goto out_free_name;
+ }
+ close(path_beneath.parent_fd);
+ }
+ ret = 0;
+
+out_free_name:
+ free(path_list);
+ free(env_path_name);
+ return ret;
+}
+
+static int populate_ruleset_net(const char *const env_var, const int ruleset_fd,
+ const __u64 allowed_access)
+{
+ int ret = 1;
+ char *env_port_name, *env_port_name_next, *strport;
+ struct landlock_net_port_attr net_port = {
+ .allowed_access = allowed_access,
+ };
+
+ env_port_name = getenv(env_var);
+ if (!env_port_name)
+ return 0;
+ env_port_name = strdup(env_port_name);
+ unsetenv(env_var);
+
+ env_port_name_next = env_port_name;
+ while ((strport = strsep(&env_port_name_next, ENV_DELIMITER))) {
+ __u64 port;
+
+ if (strcmp(strport, "") == 0)
+ continue;
+
+ if (str2num(strport, &port)) {
+ fprintf(stderr, "Failed to parse port at \"%s\"\n",
+ strport);
+ goto out_free_name;
+ }
+ net_port.port = port;
+ if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT,
+ &net_port, 0)) {
+ fprintf(stderr,
+ "Failed to update the ruleset with port \"%llu\": %s\n",
+ net_port.port, strerror(errno));
+ goto out_free_name;
+ }
+ }
+ ret = 0;
+
+out_free_name:
+ free(env_port_name);
+ return ret;
+}
+
+/* Returns true on error, false otherwise. */
+static bool check_ruleset_scope(const char *const env_var,
+ struct landlock_ruleset_attr *ruleset_attr)
+{
+ char *env_type_scope, *env_type_scope_next, *ipc_scoping_name;
+ bool error = false;
+ bool abstract_scoping = false;
+ bool signal_scoping = false;
+
+ /* Scoping is not supported by Landlock ABI */
+ if (!(ruleset_attr->scoped &
+ (LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | LANDLOCK_SCOPE_SIGNAL)))
+ goto out_unset;
+
+ env_type_scope = getenv(env_var);
+ /* Scoping is not requested by the user */
+ if (!env_type_scope || strcmp("", env_type_scope) == 0)
+ goto out_unset;
+
+ env_type_scope = strdup(env_type_scope);
+ env_type_scope_next = env_type_scope;
+ while ((ipc_scoping_name =
+ strsep(&env_type_scope_next, ENV_DELIMITER))) {
+ if (strcmp("a", ipc_scoping_name) == 0 && !abstract_scoping) {
+ abstract_scoping = true;
+ } else if (strcmp("s", ipc_scoping_name) == 0 &&
+ !signal_scoping) {
+ signal_scoping = true;
+ } else {
+ fprintf(stderr, "Unknown or duplicate scope \"%s\"\n",
+ ipc_scoping_name);
+ error = true;
+ goto out_free_name;
+ }
+ }
+
+out_free_name:
+ free(env_type_scope);
+
+out_unset:
+ if (!abstract_scoping)
+ ruleset_attr->scoped &= ~LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET;
+ if (!signal_scoping)
+ ruleset_attr->scoped &= ~LANDLOCK_SCOPE_SIGNAL;
+
+ unsetenv(env_var);
+ return error;
+}
+
+/* clang-format off */
+
+#define ACCESS_FS_ROUGHLY_READ ( \
+ LANDLOCK_ACCESS_FS_EXECUTE | \
+ LANDLOCK_ACCESS_FS_READ_FILE | \
+ LANDLOCK_ACCESS_FS_READ_DIR)
+
+#define ACCESS_FS_ROUGHLY_WRITE ( \
+ LANDLOCK_ACCESS_FS_WRITE_FILE | \
+ LANDLOCK_ACCESS_FS_REMOVE_DIR | \
+ LANDLOCK_ACCESS_FS_REMOVE_FILE | \
+ LANDLOCK_ACCESS_FS_MAKE_CHAR | \
+ LANDLOCK_ACCESS_FS_MAKE_DIR | \
+ LANDLOCK_ACCESS_FS_MAKE_REG | \
+ LANDLOCK_ACCESS_FS_MAKE_SOCK | \
+ LANDLOCK_ACCESS_FS_MAKE_FIFO | \
+ LANDLOCK_ACCESS_FS_MAKE_BLOCK | \
+ LANDLOCK_ACCESS_FS_MAKE_SYM | \
+ LANDLOCK_ACCESS_FS_REFER | \
+ LANDLOCK_ACCESS_FS_TRUNCATE | \
+ LANDLOCK_ACCESS_FS_IOCTL_DEV)
+
+/* clang-format on */
+
+#define LANDLOCK_ABI_LAST 7
+
+#define XSTR(s) #s
+#define STR(s) XSTR(s)
+
+/* clang-format off */
+
+static const char help[] =
+ "usage: " ENV_FS_RO_NAME "=\"...\" " ENV_FS_RW_NAME "=\"...\" "
+ "[other environment variables] %1$s <cmd> [args]...\n"
+ "\n"
+ "Execute the given command in a restricted environment.\n"
+ "Multi-valued settings (lists of ports, paths, scopes) are colon-delimited.\n"
+ "\n"
+ "Mandatory settings:\n"
+ "* " ENV_FS_RO_NAME ": paths allowed to be used in a read-only way\n"
+ "* " ENV_FS_RW_NAME ": paths allowed to be used in a read-write way\n"
+ "\n"
+ "Optional settings (when not set, their associated access check "
+ "is always allowed, which is different from an empty string which "
+ "means an empty list):\n"
+ "* " ENV_TCP_BIND_NAME ": ports allowed to bind (server)\n"
+ "* " ENV_TCP_CONNECT_NAME ": ports allowed to connect (client)\n"
+ "* " ENV_SCOPED_NAME ": actions denied on the outside of the landlock domain\n"
+ " - \"a\" to restrict opening abstract unix sockets\n"
+ " - \"s\" to restrict sending signals\n"
+ "\n"
+ "A sandboxer should not log denied access requests to avoid spamming logs, "
+ "but to test audit we can set " ENV_FORCE_LOG_NAME "=1\n"
+ "\n"
+ "Example:\n"
+ ENV_FS_RO_NAME "=\"${PATH}:/lib:/usr:/proc:/etc:/dev/urandom\" "
+ ENV_FS_RW_NAME "=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" "
+ ENV_TCP_BIND_NAME "=\"9418\" "
+ ENV_TCP_CONNECT_NAME "=\"80:443\" "
+ ENV_SCOPED_NAME "=\"a:s\" "
+ "%1$s bash -i\n"
+ "\n"
+ "This sandboxer can use Landlock features up to ABI version "
+ STR(LANDLOCK_ABI_LAST) ".\n";
+
+/* clang-format on */
+
+int main(const int argc, char *const argv[], char *const *const envp)
+{
+ const char *cmd_path;
+ char *const *cmd_argv;
+ int ruleset_fd, abi;
+ char *env_port_name, *env_force_log;
+ __u64 access_fs_ro = ACCESS_FS_ROUGHLY_READ,
+ access_fs_rw = ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_WRITE;
+
+ struct landlock_ruleset_attr ruleset_attr = {
+ .handled_access_fs = access_fs_rw,
+ .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP |
+ LANDLOCK_ACCESS_NET_CONNECT_TCP,
+ .scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET |
+ LANDLOCK_SCOPE_SIGNAL,
+ };
+ int supported_restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON;
+ int set_restrict_flags = 0;
+
+ if (argc < 2) {
+ fprintf(stderr, help, argv[0]);
+ return 1;
+ }
+
+ abi = landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION);
+ if (abi < 0) {
+ const int err = errno;
+
+ perror("Failed to check Landlock compatibility");
+ switch (err) {
+ case ENOSYS:
+ fprintf(stderr,
+ "Hint: Landlock is not supported by the current kernel. "
+ "To support it, build the kernel with "
+ "CONFIG_SECURITY_LANDLOCK=y and prepend "
+ "\"landlock,\" to the content of CONFIG_LSM.\n");
+ break;
+ case EOPNOTSUPP:
+ fprintf(stderr,
+ "Hint: Landlock is currently disabled. "
+ "It can be enabled in the kernel configuration by "
+ "prepending \"landlock,\" to the content of CONFIG_LSM, "
+ "or at boot time by setting the same content to the "
+ "\"lsm\" kernel parameter.\n");
+ break;
+ }
+ return 1;
+ }
+
+ /* Best-effort security. */
+ switch (abi) {
+ case 1:
+ /*
+ * Removes LANDLOCK_ACCESS_FS_REFER for ABI < 2
+ *
+ * Note: The "refer" operations (file renaming and linking
+ * across different directories) are always forbidden when using
+ * Landlock with ABI 1.
+ *
+ * If only ABI 1 is available, this sandboxer knowingly forbids
+ * refer operations.
+ *
+ * If a program *needs* to do refer operations after enabling
+ * Landlock, it cannot use Landlock at ABI level 1. To be
+ * compatible with different kernel versions, such programs
+ * should then fall back to not restricting themselves at all if
+ * the running kernel only supports ABI 1.
+ */
+ ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_REFER;
+ __attribute__((fallthrough));
+ case 2:
+ /* Removes LANDLOCK_ACCESS_FS_TRUNCATE for ABI < 3 */
+ ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_TRUNCATE;
+ __attribute__((fallthrough));
+ case 3:
+ /* Removes network support for ABI < 4 */
+ ruleset_attr.handled_access_net &=
+ ~(LANDLOCK_ACCESS_NET_BIND_TCP |
+ LANDLOCK_ACCESS_NET_CONNECT_TCP);
+ __attribute__((fallthrough));
+ case 4:
+ /* Removes LANDLOCK_ACCESS_FS_IOCTL_DEV for ABI < 5 */
+ ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_IOCTL_DEV;
+
+ __attribute__((fallthrough));
+ case 5:
+ /* Removes LANDLOCK_SCOPE_* for ABI < 6 */
+ ruleset_attr.scoped &= ~(LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET |
+ LANDLOCK_SCOPE_SIGNAL);
+ __attribute__((fallthrough));
+ case 6:
+ /* Removes LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON for ABI < 7 */
+ supported_restrict_flags &=
+ ~LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON;
+
+ /* Must be printed for any ABI < LANDLOCK_ABI_LAST. */
+ fprintf(stderr,
+ "Hint: You should update the running kernel "
+ "to leverage Landlock features "
+ "provided by ABI version %d (instead of %d).\n",
+ LANDLOCK_ABI_LAST, abi);
+ __attribute__((fallthrough));
+ case LANDLOCK_ABI_LAST:
+ break;
+ default:
+ fprintf(stderr,
+ "Hint: You should update this sandboxer "
+ "to leverage Landlock features "
+ "provided by ABI version %d (instead of %d).\n",
+ abi, LANDLOCK_ABI_LAST);
+ }
+ access_fs_ro &= ruleset_attr.handled_access_fs;
+ access_fs_rw &= ruleset_attr.handled_access_fs;
+
+ /* Removes bind access attribute if not requested by the user. */
+ env_port_name = getenv(ENV_TCP_BIND_NAME);
+ if (!env_port_name) {
+ ruleset_attr.handled_access_net &=
+ ~LANDLOCK_ACCESS_NET_BIND_TCP;
+ }
+ /* Removes connect access attribute if not requested by the user. */
+ env_port_name = getenv(ENV_TCP_CONNECT_NAME);
+ if (!env_port_name) {
+ ruleset_attr.handled_access_net &=
+ ~LANDLOCK_ACCESS_NET_CONNECT_TCP;
+ }
+
+ if (check_ruleset_scope(ENV_SCOPED_NAME, &ruleset_attr))
+ return 1;
+
+ /* Enables optional logs. */
+ env_force_log = getenv(ENV_FORCE_LOG_NAME);
+ if (env_force_log) {
+ if (strcmp(env_force_log, "1") != 0) {
+ fprintf(stderr, "Unknown value for " ENV_FORCE_LOG_NAME
+ " (only \"1\" is handled)\n");
+ return 1;
+ }
+ if (!(supported_restrict_flags &
+ LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON)) {
+ fprintf(stderr,
+ "Audit logs not supported by current kernel\n");
+ return 1;
+ }
+ set_restrict_flags |= LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON;
+ unsetenv(ENV_FORCE_LOG_NAME);
+ }
+
+ ruleset_fd =
+ landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
+ if (ruleset_fd < 0) {
+ perror("Failed to create a ruleset");
+ return 1;
+ }
+
+ if (populate_ruleset_fs(ENV_FS_RO_NAME, ruleset_fd, access_fs_ro)) {
+ goto err_close_ruleset;
+ }
+ if (populate_ruleset_fs(ENV_FS_RW_NAME, ruleset_fd, access_fs_rw)) {
+ goto err_close_ruleset;
+ }
+
+ if (populate_ruleset_net(ENV_TCP_BIND_NAME, ruleset_fd,
+ LANDLOCK_ACCESS_NET_BIND_TCP)) {
+ goto err_close_ruleset;
+ }
+ if (populate_ruleset_net(ENV_TCP_CONNECT_NAME, ruleset_fd,
+ LANDLOCK_ACCESS_NET_CONNECT_TCP)) {
+ goto err_close_ruleset;
+ }
+
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("Failed to restrict privileges");
+ goto err_close_ruleset;
+ }
+ if (landlock_restrict_self(ruleset_fd, set_restrict_flags)) {
+ perror("Failed to enforce ruleset");
+ goto err_close_ruleset;
+ }
+ close(ruleset_fd);
+
+ cmd_path = argv[1];
+ cmd_argv = argv + 1;
+ fprintf(stderr, "Executing the sandboxed command...\n");
+ execvpe(cmd_path, cmd_argv, envp);
+ fprintf(stderr, "Failed to execute \"%s\": %s\n", cmd_path,
+ strerror(errno));
+ fprintf(stderr, "Hint: access to the binary, the interpreter or "
+ "shared libraries may be denied.\n");
+ return 1;
+
+err_close_ruleset:
+ close(ruleset_fd);
+ return 1;
+}
diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile
new file mode 100644
index 000000000000..9f853eeb6140
--- /dev/null
+++ b/samples/livepatch/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o
diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c
new file mode 100644
index 000000000000..fadc2a85cb35
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-busymod.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ */
+
+/*
+ * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-callbacks-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+
+static int sleep_secs;
+module_param(sleep_secs, int, 0644);
+MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)");
+
+static void busymod_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(work, busymod_work_func);
+
+static void busymod_work_func(struct work_struct *work)
+{
+ pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs);
+ msleep(sleep_secs * 1000);
+ pr_info("%s exit\n", __func__);
+}
+
+static int livepatch_callbacks_mod_init(void)
+{
+ pr_info("%s\n", __func__);
+ schedule_delayed_work(&work, 0);
+ return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+ cancel_delayed_work_sync(&work);
+ pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks, support module");
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c
new file mode 100644
index 000000000000..9e69d9caed25
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-demo.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ */
+
+/*
+ * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Demonstration of registering livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - load the simple module
+ *
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ *
+ * Step 2 - load the demonstration livepatch (with callbacks)
+ *
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *
+ * Step 3 - cleanup
+ *
+ * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ * rmmod livepatch_callbacks_demo
+ * rmmod livepatch_callbacks_mod
+ *
+ * Watch dmesg output to see livepatch enablement, callback execution
+ * and patching operations for both vmlinux and module targets.
+ *
+ * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and
+ * livepatch-callbacks-demo.ko to observe what happens when a
+ * target module is loaded after a livepatch with callbacks.
+ *
+ * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch
+ * callback return status. Try setting up a non-zero status
+ * such as -19 (-ENODEV):
+ *
+ * # Load demo livepatch, vmlinux is patched
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ * # Setup next pre-patch callback to return -ENODEV
+ * echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
+ *
+ * # Module loader refuses to load the target module
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ * insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
+ *
+ * NOTE: There is a second target module,
+ * livepatch-callbacks-busymod.ko, available for experimenting
+ * with livepatch (un)patch callbacks. This module contains
+ * a 'sleep_secs' parameter that parks the module on one of the
+ * functions that the livepatch demo module wants to patch.
+ * Modifying this value and tweaking the order of module loads can
+ * effectively demonstrate stalled patch transitions:
+ *
+ * # Load a target module, let it park on 'busymod_work_func' for
+ * # thirty seconds
+ * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
+ *
+ * # Meanwhile load the livepatch
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ * # ... then load and unload another target module while the
+ * # transition is in progress
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ * rmmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ * # Finally cleanup
+ * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ * rmmod samples/livepatch/livepatch-callbacks-demo.ko
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+
+/*
+ * Value returned by pre_patch_callback().  Exposed as a module parameter
+ * with mode 0644.
+ */
+static int pre_patch_ret;
+module_param(pre_patch_ret, int, 0644);
+MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)");
+
+/* Log strings indexed by the MODULE_STATE_* value of a target module. */
+static const char *const module_state[] = {
+ [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state",
+ [MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Full formed, running module_init",
+ [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away",
+ [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up",
+};
+
+/*
+ * Log which callback ran and for which object: the module name and its
+ * current state for a module target, or "vmlinux" when obj->mod is NULL.
+ */
+static void callback_info(const char *callback, struct klp_object *obj)
+{
+	struct module *target = obj->mod;
+
+	if (!target) {
+		pr_info("%s: vmlinux\n", callback);
+		return;
+	}
+
+	pr_info("%s: %s -> %s\n", callback, target->name,
+		module_state[target->state]);
+}
+
+/*
+ * Executed before an object is patched (ie, patch enablement).  Logs the
+ * call and returns the 'pre_patch_ret' module parameter as its status.
+ */
+static int pre_patch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+ return pre_patch_ret;
+}
+
+/* Executed after an object has been patched (ie, patch enablement) */
+static void post_patch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+/* Executed before an object is unpatched (ie, patch disablement) */
+static void pre_unpatch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+/* Executed after an object has been unpatched (ie, patch disablement) */
+static void post_unpatch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+/*
+ * Replacement registered for "busymod_work_func" in busymod_funcs: only
+ * logs its own name.
+ */
+static void patched_work_func(struct work_struct *work)
+{
+ pr_info("%s\n", __func__);
+}
+
+/* Empty function list for objects that only carry callbacks. */
+static struct klp_func no_funcs[] = {
+ { }
+};
+
+/* Functions replaced in the busymod target; the empty entry ends the list. */
+static struct klp_func busymod_funcs[] = {
+ {
+ .old_name = "busymod_work_func",
+ .new_func = patched_work_func,
+ }, { }
+};
+
+/*
+ * Patched objects: vmlinux and the two demo target modules.  All three
+ * register the same four callbacks; only the busymod object also replaces
+ * a function.  The empty entry ends the list.
+ */
+static struct klp_object objs[] = {
+ {
+ .name = NULL, /* vmlinux */
+ .funcs = no_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, {
+ .name = "livepatch_callbacks_mod",
+ .funcs = no_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, {
+ .name = "livepatch_callbacks_busymod",
+ .funcs = busymod_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, { }
+};
+
+/* The livepatch itself: owned by this module and covering 'objs'. */
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+/* Module init: enable the patch and return klp_enable_patch()'s status. */
+static int livepatch_callbacks_demo_init(void)
+{
+ return klp_enable_patch(&patch);
+}
+
+/* Module exit: intentionally empty, nothing is torn down here. */
+static void livepatch_callbacks_demo_exit(void)
+{
+}
+
+module_init(livepatch_callbacks_demo_init);
+module_exit(livepatch_callbacks_demo_exit);
+MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks");
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c
new file mode 100644
index 000000000000..d1851b471ad9
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-mod.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ */
+
+/*
+ * livepatch-callbacks-mod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-callbacks-demo.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+/* Module init: only logs that it was called. */
+static int livepatch_callbacks_mod_init(void)
+{
+ pr_info("%s\n", __func__);
+ return 0;
+}
+
+/* Module exit: only logs that it was called. */
+static void livepatch_callbacks_mod_exit(void)
+{
+ pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_DESCRIPTION("Live patching demo for (un)patching callbacks, support module");
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c
new file mode 100644
index 000000000000..5263a2f31c48
--- /dev/null
+++ b/samples/livepatch/livepatch-sample.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * livepatch-sample.c - Kernel Live Patching Sample Module
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+
+/*
+ * This (dumb) live patch overrides the function that prints the
+ * kernel boot cmdline when /proc/cmdline is read.
+ *
+ * Example:
+ *
+ * $ cat /proc/cmdline
+ * <your cmdline>
+ *
+ * $ insmod livepatch-sample.ko
+ * $ cat /proc/cmdline
+ * this has been live patched
+ *
+ * $ echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled
+ * $ cat /proc/cmdline
+ * <your cmdline>
+ */
+
+#include <linux/seq_file.h>
+/*
+ * Replacement for cmdline_proc_show(): writes a fixed string to the seq_file
+ * instead of the boot command line.  Always returns 0.
+ */
+static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%s\n", "this has been live patched");
+ return 0;
+}
+
+/* Functions to replace; the empty entry ends the list. */
+static struct klp_func funcs[] = {
+ {
+ .old_name = "cmdline_proc_show",
+ .new_func = livepatch_cmdline_proc_show,
+ }, { }
+};
+
+/* Objects to patch; the empty entry ends the list. */
+static struct klp_object objs[] = {
+ {
+ /* name being NULL means vmlinux */
+ .funcs = funcs,
+ }, { }
+};
+
+/* The livepatch itself: owned by this module and covering 'objs'. */
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+/* Module init: enable the patch and return klp_enable_patch()'s status. */
+static int livepatch_init(void)
+{
+ return klp_enable_patch(&patch);
+}
+
+/* Module exit: intentionally empty, nothing is torn down here. */
+static void livepatch_exit(void)
+{
+}
+
+module_init(livepatch_init);
+module_exit(livepatch_exit);
+MODULE_DESCRIPTION("Kernel Live Patching Sample Module");
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c
new file mode 100644
index 000000000000..cbf68ca40097
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-fix1.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ */
+
+/*
+ * livepatch-shadow-fix1.c - Shadow variables, livepatch demo
+ *
+ * Purpose
+ * -------
+ *
+ * Fixes the memory leak introduced in livepatch-shadow-mod through the
+ * use of a shadow variable. This fix demonstrates the "extending" of
+ * short-lived data structures by patching its allocation and release
+ * functions.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-shadow-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+#include <linux/slab.h>
+
+/* Shadow variable IDs */
+#define SV_LEAK 1
+
+/* Allocate new dummies every second */
+#define ALLOC_PERIOD 1
+/* Check for expired dummies after a few new ones have been allocated */
+#define CLEANUP_PERIOD (3 * ALLOC_PERIOD)
+/* Dummies expire after a few cleanup instances */
+#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD)
+
+/* Same layout as struct dummy in livepatch-shadow-mod.c */
+struct dummy {
+ struct list_head list;
+ unsigned long jiffies_expire; /* jiffies value used as the expiry time */
+};
+
+/*
+ * Shadow variable constructor: copies the pointer that ctor_data points at
+ * into the shadow storage.
+ *
+ * A constructor pairs most naturally with klp_shadow_get_or_alloc().  In
+ * this example it would be just as safe to store the pointer through the
+ * value returned by klp_shadow_alloc(); the constructor is used to show the
+ * more complicated form of the API.
+ *
+ * Returns 0 on success, -EINVAL when no ctor_data was supplied.
+ */
+static int shadow_leak_ctor(void *obj, void *shadow_data, void *ctor_data)
+{
+	int **src = ctor_data;
+	int **dst = shadow_data;
+
+	if (src == NULL)
+		return -EINVAL;
+
+	*dst = *src;
+	return 0;
+}
+
+/*
+ * Patched replacement for dummy_alloc(): allocates the dummy and the extra
+ * "leak" buffer, and records the leak pointer in a SV_LEAK shadow variable
+ * attached to the dummy.
+ *
+ * Returns the new dummy, or NULL if any allocation fails (everything
+ * allocated so far is freed first).
+ */
+static struct dummy *livepatch_fix1_dummy_alloc(void)
+{
+	struct dummy *d;
+	int **shadow_leak;
+	int *leak;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (d == NULL)
+		return NULL;
+
+	d->jiffies_expire = jiffies + secs_to_jiffies(EXPIRE_PERIOD);
+
+	/*
+	 * Patch: save the extra memory location into a SV_LEAK shadow
+	 * variable.  A patched dummy_free routine can later fetch this
+	 * pointer to handle resource release.
+	 */
+	leak = kzalloc(sizeof(*leak), GFP_KERNEL);
+	if (leak == NULL) {
+		kfree(d);
+		return NULL;
+	}
+
+	/* The shadow variable stores the pointer itself, hence sizeof(leak) */
+	shadow_leak = klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL,
+				       shadow_leak_ctor, &leak);
+	if (shadow_leak == NULL) {
+		pr_err("%s: failed to allocate shadow variable for the leaking pointer: dummy @ %p, leak @ %p\n",
+		       __func__, d, leak);
+		kfree(leak);
+		kfree(d);
+		return NULL;
+	}
+
+	pr_info("%s: dummy @ %p, expires @ %lx\n",
+		__func__, d, d->jiffies_expire);
+
+	return d;
+}
+
+/*
+ * Shadow variable destructor for SV_LEAK: logs and frees the leak buffer
+ * whose pointer is stored in the shadow data.
+ */
+static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data)
+{
+ void *d = obj;
+ int **shadow_leak = shadow_data;
+
+ pr_info("%s: dummy @ %p, prevented leak @ %p\n",
+ __func__, d, *shadow_leak);
+ kfree(*shadow_leak);
+}
+
+/*
+ * Patched replacement for dummy_free(): releases the SV_LEAK shadow
+ * variable (and, through its destructor, the leak buffer) before freeing
+ * the dummy itself.
+ */
+static void livepatch_fix1_dummy_free(struct dummy *d)
+{
+	/*
+	 * Patch: fetch the saved SV_LEAK shadow variable, detach and
+	 * free it.  Note: handle cases where this shadow variable does
+	 * not exist (ie, dummy structures allocated before this livepatch
+	 * was loaded.)
+	 */
+	if (klp_shadow_get(d, SV_LEAK) == NULL)
+		pr_info("%s: dummy @ %p leaked!\n", __func__, d);
+	else
+		klp_shadow_free(d, SV_LEAK, livepatch_fix1_dummy_leak_dtor);
+
+	kfree(d);
+}
+
+/* Functions replaced in livepatch_shadow_mod; the empty entry ends the list. */
+static struct klp_func funcs[] = {
+ {
+ .old_name = "dummy_alloc",
+ .new_func = livepatch_fix1_dummy_alloc,
+ },
+ {
+ .old_name = "dummy_free",
+ .new_func = livepatch_fix1_dummy_free,
+ }, { }
+};
+
+/* Objects to patch: only the buggy demo module. */
+static struct klp_object objs[] = {
+ {
+ .name = "livepatch_shadow_mod",
+ .funcs = funcs,
+ }, { }
+};
+
+/* The livepatch itself: owned by this module and covering 'objs'. */
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+/* Module init: enable the patch and return klp_enable_patch()'s status. */
+static int livepatch_shadow_fix1_init(void)
+{
+ return klp_enable_patch(&patch);
+}
+
+/* Module exit: release every remaining SV_LEAK shadow variable. */
+static void livepatch_shadow_fix1_exit(void)
+{
+ /* Cleanup any existing SV_LEAK shadow variables */
+ klp_shadow_free_all(SV_LEAK, livepatch_fix1_dummy_leak_dtor);
+}
+
+module_init(livepatch_shadow_fix1_init);
+module_exit(livepatch_shadow_fix1_exit);
+MODULE_DESCRIPTION("Live patching demo for shadow variables");
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c
new file mode 100644
index 000000000000..b99122cb221f
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-fix2.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ */
+
+/*
+ * livepatch-shadow-fix2.c - Shadow variables, livepatch demo
+ *
+ * Purpose
+ * -------
+ *
+ * Adds functionality to livepatch-shadow-mod's in-flight data
+ * structures through a shadow variable. The livepatch patches a
+ * routine that periodically inspects data structures, incrementing a
+ * per-data-structure counter, creating the counter if needed.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-shadow-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+#include <linux/slab.h>
+
+/* Shadow variable IDs */
+#define SV_LEAK 1
+#define SV_COUNTER 2
+
+/* Same layout as struct dummy in livepatch-shadow-mod.c */
+struct dummy {
+ struct list_head list;
+ unsigned long jiffies_expire; /* jiffies value used as the expiry time */
+};
+
+/*
+ * Patched replacement for dummy_check(): bumps a per-dummy SV_COUNTER
+ * shadow variable on each call, then reports whether the dummy has
+ * expired.  Note the 'jiffies' parameter hides the global of the same name.
+ */
+static bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies)
+{
+ int *shadow_count;
+
+ /*
+ * Patch: handle in-flight dummy structures, if they do not
+ * already have a SV_COUNTER shadow variable, then attach a
+ * new one.
+ */
+ shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER,
+ sizeof(*shadow_count), GFP_NOWAIT,
+ NULL, NULL);
+ /* The counter is skipped, not treated as an error, if unavailable */
+ if (shadow_count)
+ *shadow_count += 1;
+
+ return time_after(jiffies, d->jiffies_expire);
+}
+
+/*
+ * Shadow variable destructor for SV_LEAK: logs and frees the leak buffer
+ * whose pointer is stored in the shadow data.
+ */
+static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data)
+{
+ void *d = obj;
+ int **shadow_leak = shadow_data;
+
+ pr_info("%s: dummy @ %p, prevented leak @ %p\n",
+ __func__, d, *shadow_leak);
+ kfree(*shadow_leak);
+}
+
+/*
+ * Patched replacement for dummy_free(): releases the SV_LEAK shadow
+ * variable if present, logs and releases the SV_COUNTER shadow variable if
+ * present, then frees the dummy.
+ */
+static void livepatch_fix2_dummy_free(struct dummy *d)
+{
+ int **shadow_leak;
+ int *shadow_count;
+
+ /* Patch: copy the memory leak patch from the fix1 module. */
+ shadow_leak = klp_shadow_get(d, SV_LEAK);
+ if (shadow_leak)
+ klp_shadow_free(d, SV_LEAK, livepatch_fix2_dummy_leak_dtor);
+ else
+ pr_info("%s: dummy @ %p leaked!\n", __func__, d);
+
+ /*
+ * Patch: fetch the SV_COUNTER shadow variable and display
+ * the final count. Detach the shadow variable.
+ */
+ shadow_count = klp_shadow_get(d, SV_COUNTER);
+ if (shadow_count) {
+ pr_info("%s: dummy @ %p, check counter = %d\n",
+ __func__, d, *shadow_count);
+ /* No destructor: the counter owns no further memory */
+ klp_shadow_free(d, SV_COUNTER, NULL);
+ }
+
+ kfree(d);
+}
+
+/* Functions replaced in livepatch_shadow_mod; the empty entry ends the list. */
+static struct klp_func funcs[] = {
+ {
+ .old_name = "dummy_check",
+ .new_func = livepatch_fix2_dummy_check,
+ },
+ {
+ .old_name = "dummy_free",
+ .new_func = livepatch_fix2_dummy_free,
+ }, { }
+};
+
+/* Objects to patch: only the buggy demo module. */
+static struct klp_object objs[] = {
+ {
+ .name = "livepatch_shadow_mod",
+ .funcs = funcs,
+ }, { }
+};
+
+/* The livepatch itself: owned by this module and covering 'objs'. */
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+/* Module init: enable the patch and return klp_enable_patch()'s status. */
+static int livepatch_shadow_fix2_init(void)
+{
+ return klp_enable_patch(&patch);
+}
+
+/*
+ * Module exit: release every remaining SV_COUNTER shadow variable.  SV_LEAK
+ * shadow variables are not touched here.
+ */
+static void livepatch_shadow_fix2_exit(void)
+{
+ /* Cleanup any existing SV_COUNTER shadow variables */
+ klp_shadow_free_all(SV_COUNTER, NULL);
+}
+
+module_init(livepatch_shadow_fix2_init);
+module_exit(livepatch_shadow_fix2_exit);
+MODULE_DESCRIPTION("Live patching demo for shadow variables");
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c
new file mode 100644
index 000000000000..5d83ad5a8118
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-mod.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ */
+
+/*
+ * livepatch-shadow-mod.c - Shadow variables, buggy module demo
+ *
+ * Purpose
+ * -------
+ *
+ * As a demonstration of livepatch shadow variable API, this module
+ * introduces memory leak behavior that livepatch modules
+ * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and
+ * enhance.
+ *
+ * WARNING - even though the livepatch-shadow-fix modules patch the
+ * memory leak, please load these modules at your own risk -- some
+ * amount of memory may be leaked before the bug is patched.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - Load the buggy demonstration module:
+ *
+ * insmod samples/livepatch/livepatch-shadow-mod.ko
+ *
+ * Watch dmesg output for a few moments to see new dummy being allocated
+ * and a periodic cleanup check. (Note: a small amount of memory is
+ * being leaked.)
+ *
+ *
+ * Step 2 - Load livepatch fix1:
+ *
+ * insmod samples/livepatch/livepatch-shadow-fix1.ko
+ *
+ * Continue watching dmesg and note that now livepatch_fix1_dummy_free()
+ * and livepatch_fix1_dummy_alloc() are logging messages about leaked
+ * memory and eventually leaks prevented.
+ *
+ *
+ * Step 3 - Load livepatch fix2 (on top of fix1):
+ *
+ * insmod samples/livepatch/livepatch-shadow-fix2.ko
+ *
+ * This module extends functionality through shadow variables, as a new
+ * "check" counter is added to the dummy structure. Periodic dmesg
+ * messages will log these as dummies are cleaned up.
+ *
+ *
+ * Step 4 - Cleanup
+ *
+ * Unwind the demonstration by disabling the livepatch fix modules, then
+ * removing them and the demo module:
+ *
+ * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled
+ * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled
+ * rmmod livepatch-shadow-fix2
+ * rmmod livepatch-shadow-fix1
+ * rmmod livepatch-shadow-mod
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/workqueue.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joe Lawrence <joe.lawrence@redhat.com>");
+MODULE_DESCRIPTION("Buggy module for shadow variable demo");
+
+/* All three periods below are in seconds (passed to secs_to_jiffies()). */
+/* Allocate new dummies every second */
+#define ALLOC_PERIOD 1
+/* Check for expired dummies after a few new ones have been allocated */
+#define CLEANUP_PERIOD (3 * ALLOC_PERIOD)
+/* Dummies expire after a few cleanup instances */
+#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD)
+
+/*
+ * Keep a list of all the dummies so we can clean up any residual ones
+ * on module exit
+ */
+static LIST_HEAD(dummy_list);
+/* Taken by the work handlers around every dummy_list update or walk */
+static DEFINE_MUTEX(dummy_list_mutex);
+
+struct dummy {
+ struct list_head list; /* link in dummy_list */
+ unsigned long jiffies_expire; /* jiffies value used as the expiry time */
+};
+
+/*
+ * Allocate a dummy that expires EXPIRE_PERIOD seconds from now.  Also
+ * allocates a "leak" buffer whose pointer is deliberately dropped on
+ * success; this is the bug the demo livepatches fix.
+ *
+ * Returns the new dummy, or NULL if either allocation fails.
+ */
+static __used noinline struct dummy *dummy_alloc(void)
+{
+ struct dummy *d;
+ int *leak;
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return NULL;
+
+ d->jiffies_expire = jiffies + secs_to_jiffies(EXPIRE_PERIOD);
+
+ /* Oops, forgot to save leak! */
+ leak = kzalloc(sizeof(*leak), GFP_KERNEL);
+ if (!leak) {
+ kfree(d);
+ return NULL;
+ }
+
+ pr_info("%s: dummy @ %p, expires @ %lx\n",
+ __func__, d, d->jiffies_expire);
+
+ return d;
+}
+
+/*
+ * Log and free a dummy.  The "leak" buffer allocated by dummy_alloc() is
+ * not freed here.
+ */
+static __used noinline void dummy_free(struct dummy *d)
+{
+ pr_info("%s: dummy @ %p, expired = %lx\n",
+ __func__, d, d->jiffies_expire);
+
+ kfree(d);
+}
+
+/*
+ * Return true when the supplied time is after the dummy's expiry time.
+ * Note the 'jiffies' parameter hides the global of the same name.
+ */
+static __used noinline bool dummy_check(struct dummy *d,
+ unsigned long jiffies)
+{
+ return time_after(jiffies, d->jiffies_expire);
+}
+
+/*
+ * alloc_work_func: allocates new dummy structures, allocates additional
+ * memory, aptly named "leak", but doesn't keep
+ * permanent record of it.
+ */
+
+/* Forward declaration: DECLARE_DELAYED_WORK() needs the handler's name. */
+static void alloc_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func);
+
+/*
+ * Work handler: allocate one dummy, add it to dummy_list under the mutex
+ * and re-queue itself ALLOC_PERIOD seconds later.
+ */
+static void alloc_work_func(struct work_struct *work)
+{
+ struct dummy *d;
+
+ d = dummy_alloc();
+ /* NOTE: on failure the work is not re-queued, so allocation stops */
+ if (!d)
+ return;
+
+ mutex_lock(&dummy_list_mutex);
+ list_add(&d->list, &dummy_list);
+ mutex_unlock(&dummy_list_mutex);
+
+ schedule_delayed_work(&alloc_dwork, secs_to_jiffies(ALLOC_PERIOD));
+}
+
+/*
+ * cleanup_work_func: frees dummy structures. Without knowledge of
+ * "leak", it leaks the additional memory that
+ * alloc_work_func created.
+ */
+
+/* Forward declaration: DECLARE_DELAYED_WORK() needs the handler's name. */
+static void cleanup_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func);
+
+/*
+ * Work handler: walk dummy_list under the mutex, unlink and free every
+ * dummy that dummy_check() reports as expired, then re-queue itself
+ * CLEANUP_PERIOD seconds later.
+ */
+static void cleanup_work_func(struct work_struct *work)
+{
+ struct dummy *d, *tmp;
+ unsigned long j;
+
+ /* Sample jiffies once so every dummy is checked against the same time */
+ j = jiffies;
+ pr_info("%s: jiffies = %lx\n", __func__, j);
+
+ mutex_lock(&dummy_list_mutex);
+ /* _safe variant: entries are deleted while iterating */
+ list_for_each_entry_safe(d, tmp, &dummy_list, list) {
+
+ /* Kick out and free any expired dummies */
+ if (dummy_check(d, j)) {
+ list_del(&d->list);
+ dummy_free(d);
+ }
+ }
+ mutex_unlock(&dummy_list_mutex);
+
+ schedule_delayed_work(&cleanup_dwork, secs_to_jiffies(CLEANUP_PERIOD));
+}
+
+/* Module init: queue the first run of the alloc and cleanup work items. */
+static int livepatch_shadow_mod_init(void)
+{
+ schedule_delayed_work(&alloc_dwork, secs_to_jiffies(ALLOC_PERIOD));
+ schedule_delayed_work(&cleanup_dwork, secs_to_jiffies(CLEANUP_PERIOD));
+
+ return 0;
+}
+
+/*
+ * Module exit: cancel both work items, then free every dummy still on the
+ * list.  The list is walked without dummy_list_mutex; both work items have
+ * already been cancelled at that point.
+ */
+static void livepatch_shadow_mod_exit(void)
+{
+ struct dummy *d, *tmp;
+
+ /* Wait for any dummies at work */
+ cancel_delayed_work_sync(&alloc_dwork);
+ cancel_delayed_work_sync(&cleanup_dwork);
+
+ /* Cleanup residual dummies */
+ list_for_each_entry_safe(d, tmp, &dummy_list, list) {
+ list_del(&d->list);
+ dummy_free(d);
+ }
+}
+
+module_init(livepatch_shadow_mod_init);
+module_exit(livepatch_shadow_mod_exit);
diff --git a/samples/mei/.gitignore b/samples/mei/.gitignore
new file mode 100644
index 000000000000..fe894bcb6a62
--- /dev/null
+++ b/samples/mei/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/mei-amt-version
diff --git a/samples/mei/Makefile b/samples/mei/Makefile
new file mode 100644
index 000000000000..c54b8a0ab04e
--- /dev/null
+++ b/samples/mei/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2012-2019, Intel Corporation. All rights reserved.
+userprogs-always-y += mei-amt-version
+
+userccflags += -I usr/include
diff --git a/samples/mei/mei-amt-version.c b/samples/mei/mei-amt-version.c
new file mode 100644
index 000000000000..1d7254bcb44c
--- /dev/null
+++ b/samples/mei/mei-amt-version.c
@@ -0,0 +1,488 @@
+/******************************************************************************
+ * Intel Management Engine Interface (Intel MEI) Linux driver
+ * Intel MEI Interface Header
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
+ * USA
+ *
+ * The full GNU General Public License is included in this distribution
+ * in the file called LICENSE.GPL.
+ *
+ * Contact Information:
+ * Intel Corporation.
+ * linux-mei@linux.intel.com
+ * http://www.intel.com
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2003 - 2012 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/mei.h>
+
+/*****************************************************************************
+ * Intel Management Engine Interface
+ *****************************************************************************/
+
+/*
+ * mei_msg - print a diagnostic to stderr, only when (_me)->verbose is set.
+ * The _me argument is parenthesized so any pointer expression is accepted.
+ */
+#define mei_msg(_me, fmt, ARGS...) do {         \
+	if ((_me)->verbose)                     \
+		fprintf(stderr, fmt, ##ARGS);   \
+} while (0)
+
+/*
+ * mei_err - print an "Error: "-prefixed message to stderr unconditionally.
+ * The _me argument is not used by the expansion.
+ */
+#define mei_err(_me, fmt, ARGS...) do {         \
+	fprintf(stderr, "Error: " fmt, ##ARGS); \
+} while (0)
+
+/* State of one connection to an ME client through the MEI character device. */
+struct mei {
+ uuid_le guid; /* client UUID, copied in by mei_init() */
+ bool initialized; /* set once the device is open; cleared by mei_deinit() */
+ bool verbose; /* enables mei_msg() output */
+ unsigned int buf_size; /* max_msg_length reported by the connect ioctl */
+ unsigned char prot_ver; /* protocol_version reported by the connect ioctl */
+ int fd; /* device file descriptor, -1 when closed */
+};
+
+/*
+ * Close the device descriptor if one is open and reset the connection
+ * state so the structure reads as "not connected".
+ */
+static void mei_deinit(struct mei *cl)
+{
+	const int fd = cl->fd;
+
+	cl->initialized = false;
+	cl->prot_ver = 0;
+	cl->buf_size = 0;
+	cl->fd = -1;
+
+	if (fd != -1)
+		close(fd);
+}
+
+/*
+ * Open /dev/mei0 and connect to the ME client identified by 'guid'.
+ *
+ * @me:                   connection state to fill in
+ * @guid:                 UUID of the client to connect to
+ * @req_protocol_version: required protocol version; 0 disables the check
+ * @verbose:              enable mei_msg() diagnostics
+ *
+ * Returns true on success.  On any failure the state is reset through
+ * mei_deinit() and false is returned.
+ */
+static bool mei_init(struct mei *me, const uuid_le *guid,
+ unsigned char req_protocol_version, bool verbose)
+{
+ int result;
+ struct mei_client *cl;
+ struct mei_connect_client_data data;
+
+ me->verbose = verbose;
+
+ me->fd = open("/dev/mei0", O_RDWR);
+ if (me->fd == -1) {
+ mei_err(me, "Cannot establish a handle to the Intel MEI driver\n");
+ goto err;
+ }
+ memcpy(&me->guid, guid, sizeof(*guid));
+ memset(&data, 0, sizeof(data));
+ /* Set before the connect ioctl; the error path clears it again */
+ me->initialized = true;
+
+ memcpy(&data.in_client_uuid, &me->guid, sizeof(me->guid));
+ result = ioctl(me->fd, IOCTL_MEI_CONNECT_CLIENT, &data);
+ if (result) {
+ mei_err(me, "IOCTL_MEI_CONNECT_CLIENT receive message. err=%d\n", result);
+ goto err;
+ }
+ /* The ioctl's output is read from the same 'data' object */
+ cl = &data.out_client_properties;
+ mei_msg(me, "max_message_length %d\n", cl->max_msg_length);
+ mei_msg(me, "protocol_version %d\n", cl->protocol_version);
+
+ if ((req_protocol_version > 0) &&
+ (cl->protocol_version != req_protocol_version)) {
+ mei_err(me, "Intel MEI protocol version not supported\n");
+ goto err;
+ }
+
+ me->buf_size = cl->max_msg_length;
+ me->prot_ver = cl->protocol_version;
+
+ return true;
+err:
+ mei_deinit(me);
+ return false;
+}
+
+/*
+ * Wait up to 'timeout' milliseconds for the device to become readable and
+ * read one message into 'buffer'.
+ *
+ * @me:      open connection
+ * @buffer:  destination buffer
+ * @len:     size of the destination buffer in bytes
+ * @timeout: maximum wait in milliseconds
+ *
+ * Returns the number of bytes read, -1 on timeout or read() failure, or
+ * -errno when select() fails.  The connection is torn down with
+ * mei_deinit() on every negative return.
+ */
+static ssize_t mei_recv_msg(struct mei *me, unsigned char *buffer,
+			ssize_t len, unsigned long timeout)
+{
+	struct timeval tv;
+	fd_set set;
+	ssize_t rc;
+	int err;
+
+	/* Split milliseconds into whole seconds plus microseconds */
+	tv.tv_sec = timeout / 1000;
+	tv.tv_usec = (timeout % 1000) * 1000;
+
+	mei_msg(me, "call read length = %zd\n", len);
+
+	FD_ZERO(&set);
+	FD_SET(me->fd, &set);
+	rc = select(me->fd + 1, &set, NULL, NULL, &tv);
+	if (rc > 0 && FD_ISSET(me->fd, &set)) {
+		mei_msg(me, "have reply\n");
+	} else if (rc == 0) {
+		rc = -1;
+		mei_err(me, "read failed on timeout\n");
+		goto out;
+	} else { /* rc < 0 */
+		/*
+		 * Save errno before printing, and return it negated so the
+		 * failure is not mistaken for a byte count and the cleanup
+		 * below runs.
+		 */
+		err = errno;
+		rc = -err;
+		mei_err(me, "read failed on select with status %zd %s\n",
+			rc, strerror(err));
+		goto out;
+	}
+
+	rc = read(me->fd, buffer, len);
+	if (rc < 0) {
+		mei_err(me, "read failed with status %zd %s\n",
+			rc, strerror(errno));
+		goto out;
+	}
+
+	mei_msg(me, "read succeeded with result %zd\n", rc);
+
+out:
+	if (rc < 0)
+		mei_deinit(me);
+
+	return rc;
+}
+
+/*
+ * Write one message to the device.
+ *
+ * @me:      open connection
+ * @buffer:  message bytes
+ * @len:     number of bytes to write
+ * @timeout: not used by this function
+ *
+ * Returns the number of bytes written, or -errno on failure, in which case
+ * the connection is torn down with mei_deinit().
+ */
+static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer,
+ ssize_t len, unsigned long timeout)
+{
+ ssize_t written;
+ ssize_t rc;
+
+ mei_msg(me, "call write length = %zd\n", len);
+
+ written = write(me->fd, buffer, len);
+ if (written < 0) {
+ /* Capture errno before any further library call can change it */
+ rc = -errno;
+ mei_err(me, "write failed with status %zd %s\n",
+ written, strerror(errno));
+ goto out;
+ }
+ mei_msg(me, "write success\n");
+
+ rc = written;
+out:
+ if (rc < 0)
+ mei_deinit(me);
+
+ return rc;
+}
+
+/***************************************************************************
+ * Intel Advanced Management Technology ME Client
+ ***************************************************************************/
+
+/* Host interface version written into requests and checked on responses */
+#define AMT_MAJOR_VERSION 1
+#define AMT_MINOR_VERSION 1
+
+/* Status codes carried in, or synthesized for, a host interface response */
+#define AMT_STATUS_SUCCESS 0x0
+#define AMT_STATUS_INTERNAL_ERROR 0x1
+#define AMT_STATUS_NOT_READY 0x2
+#define AMT_STATUS_INVALID_AMT_MODE 0x3
+#define AMT_STATUS_INVALID_MESSAGE_LENGTH 0x4
+
+#define AMT_STATUS_HOST_IF_EMPTY_RESPONSE 0x4000
+#define AMT_STATUS_SDK_RESOURCES 0x1004
+
+
+/* Fixed array sizes used by the structures below */
+#define AMT_BIOS_VERSION_LEN 65
+#define AMT_VERSIONS_NUMBER 50
+#define AMT_UNICODE_STRING_LEN 20
+
+/* Length-prefixed string stored in a fixed-size buffer */
+struct amt_unicode_string {
+ uint16_t length;
+ char string[AMT_UNICODE_STRING_LEN];
+} __attribute__((packed));
+
+/* One version entry: a description string and its version string */
+struct amt_version_type {
+ struct amt_unicode_string description;
+ struct amt_unicode_string version;
+} __attribute__((packed));
+
+struct amt_version {
+ uint8_t major;
+ uint8_t minor;
+} __attribute__((packed));
+
+/* Code-versions payload: BIOS version, entry count, then the entries */
+struct amt_code_versions {
+ uint8_t bios[AMT_BIOS_VERSION_LEN];
+ uint32_t count; /* number of entries used in versions[] */
+ struct amt_version_type versions[AMT_VERSIONS_NUMBER];
+} __attribute__((packed));
+
+/***************************************************************************
+ * Intel Advanced Management Technology Host Interface
+ ***************************************************************************/
+
+/* Header that starts every host interface request and response */
+struct amt_host_if_msg_header {
+ struct amt_version version;
+ uint16_t _reserved; /* responses are rejected unless this is 0 */
+ uint32_t command;
+ uint32_t length; /* bytes that follow this header */
+} __attribute__((packed));
+
+/* Response layout: header, status word, then command-specific data */
+struct amt_host_if_resp_header {
+ struct amt_host_if_msg_header header;
+ uint32_t status;
+ unsigned char data[];
+} __attribute__((packed));
+
+/* UUID passed to mei_init() by amt_host_if_init() */
+const uuid_le MEI_IAMTHIF = UUID_LE(0x12f80028, 0xb4b7, 0x4b2d, \
+ 0xac, 0xa8, 0x46, 0xe0, 0xff, 0x65, 0x81, 0x4c);
+
+#define AMT_HOST_IF_CODE_VERSIONS_REQUEST 0x0400001A
+#define AMT_HOST_IF_CODE_VERSIONS_RESPONSE 0x0480001A
+
+/* Ready-made code-versions request: header only, no payload */
+const struct amt_host_if_msg_header CODE_VERSION_REQ = {
+ .version = {AMT_MAJOR_VERSION, AMT_MINOR_VERSION},
+ ._reserved = 0,
+ .command = AMT_HOST_IF_CODE_VERSIONS_REQUEST,
+ .length = 0
+};
+
+
+/* AMT host interface session layered on one MEI connection */
+struct amt_host_if {
+ struct mei mei_cl;
+ unsigned long send_timeout; /* set by amt_host_if_init(), default 20000 */
+ bool initialized; /* result of mei_init() */
+};
+
+
+/*
+ * Set up an AMT host interface session: record the send timeout (20000 is
+ * used when 0 is passed) and connect to the MEI_IAMTHIF client without a
+ * protocol version requirement.
+ *
+ * Returns true when the underlying MEI connection was established.
+ */
+static bool amt_host_if_init(struct amt_host_if *acmd,
+		      unsigned long send_timeout, bool verbose)
+{
+	if (send_timeout == 0)
+		send_timeout = 20000;
+	acmd->send_timeout = send_timeout;
+
+	acmd->initialized = mei_init(&acmd->mei_cl, &MEI_IAMTHIF, 0, verbose);
+
+	return acmd->initialized;
+}
+
+/* Tear down the MEI connection and mark the session uninitialized. */
+static void amt_host_if_deinit(struct amt_host_if *acmd)
+{
+ mei_deinit(&acmd->mei_cl);
+ acmd->initialized = false;
+}
+
+/*
+ * Sanity-check the payload of a code-versions response before it is copied
+ * into a struct amt_code_versions and printed.
+ *
+ * @resp: response to check; resp->data is read as a
+ *        struct amt_code_versions.
+ *
+ * Checks performed:
+ *  - header.length is large enough to hold status + bios + count;
+ *  - count fits in versions[] and matches the number of entries implied by
+ *    header.length;
+ *  - every description length is at most AMT_UNICODE_STRING_LEN;
+ *  - every version length lies inside the string buffer and the string is
+ *    NUL-terminated exactly at that length.
+ *
+ * Returns AMT_STATUS_SUCCESS when all checks pass and
+ * AMT_STATUS_INTERNAL_ERROR otherwise.
+ */
+static uint32_t amt_verify_code_versions(const struct amt_host_if_resp_header *resp)
+{
+	const struct amt_code_versions *code_ver;
+	size_t fixed_len;
+	size_t entries_len;
+	uint32_t len;
+	uint32_t i;
+
+	code_ver = (const struct amt_code_versions *)resp->data;
+
+	/*
+	 * header.length covers the status word plus the payload. Reject
+	 * lengths too small for the fixed part instead of letting the
+	 * unsigned subtraction wrap around.
+	 */
+	fixed_len = sizeof(uint32_t) +
+		    sizeof(code_ver->bios) +
+		    sizeof(code_ver->count);
+	if (resp->header.length < fixed_len)
+		return AMT_STATUS_INTERNAL_ERROR;
+	entries_len = resp->header.length - fixed_len;
+
+	/*
+	 * count comes from the device: it must not exceed the versions[]
+	 * array, because the caller copies only sizeof(struct
+	 * amt_code_versions) bytes and then walks count entries.
+	 */
+	if (code_ver->count > AMT_VERSIONS_NUMBER ||
+	    code_ver->count != entries_len / sizeof(struct amt_version_type))
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	for (i = 0; i < code_ver->count; i++) {
+		const struct amt_version_type *entry = &code_ver->versions[i];
+
+		len = entry->description.length;
+		if (len > AMT_UNICODE_STRING_LEN)
+			return AMT_STATUS_INTERNAL_ERROR;
+
+		/*
+		 * Bound len before indexing with it. Once string[len] is
+		 * known to be '\0' inside the buffer, strlen() cannot run
+		 * past the array either.
+		 */
+		len = entry->version.length;
+		if (len >= AMT_UNICODE_STRING_LEN ||
+		    entry->version.string[len] != '\0' ||
+		    len != strlen(entry->version.string))
+			return AMT_STATUS_INTERNAL_ERROR;
+	}
+
+	return AMT_STATUS_SUCCESS;
+}
+
+/*
+ * Validate the fixed header of a host interface response.
+ *
+ * @command:       response command code the caller expects.
+ * @resp_hdr:      header at the start of the received buffer.
+ * @response_size: number of bytes actually received.
+ *
+ * The response is accepted only if it is at least as large as a response
+ * header, its size matches the length announced in the header, the command
+ * code is the expected one, the reserved field is zero, the major version
+ * equals AMT_MAJOR_VERSION and the minor version is not older than
+ * AMT_MINOR_VERSION.
+ *
+ * Returns AMT_STATUS_SUCCESS or AMT_STATUS_INTERNAL_ERROR.
+ */
+static uint32_t amt_verify_response_header(uint32_t command,
+				const struct amt_host_if_msg_header *resp_hdr,
+				uint32_t response_size)
+{
+	/* The size test comes first so nothing is trusted from a runt reply. */
+	if (response_size < sizeof(struct amt_host_if_resp_header))
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	if (response_size !=
+	    (resp_hdr->length + sizeof(struct amt_host_if_msg_header)))
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	if (resp_hdr->command != command)
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	if (resp_hdr->_reserved != 0)
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	if (resp_hdr->version.major != AMT_MAJOR_VERSION)
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	if (resp_hdr->version.minor < AMT_MINOR_VERSION)
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	return AMT_STATUS_SUCCESS;
+}
+
+/*
+ * Send one request to the AMT host interface and read back the reply.
+ *
+ * @acmd:        initialised session.
+ * @command:     request bytes to send.
+ * @command_sz:  number of request bytes.
+ * @read_buf:    out: newly allocated, zero-filled buffer of
+ *               acmd->mei_cl.buf_size bytes holding the reply.
+ * @rcmd:        response command code expected in the reply header.
+ * @expected_sz: exact reply size required, or 0 to accept any size.
+ *
+ * Once the allocation has succeeded, *read_buf stays allocated on every
+ * return path, including errors; the caller releases it with free().
+ *
+ * Returns AMT_STATUS_SUCCESS, the non-success status reported in the reply,
+ * or one of AMT_STATUS_SDK_RESOURCES, AMT_STATUS_INTERNAL_ERROR and
+ * AMT_STATUS_HOST_IF_EMPTY_RESPONSE.
+ */
+static uint32_t amt_host_if_call(struct amt_host_if *acmd,
+			const unsigned char *command, ssize_t command_sz,
+			uint8_t **read_buf, uint32_t rcmd,
+			unsigned int expected_sz)
+{
+	struct amt_host_if_resp_header *reply;
+	uint32_t buf_sz = acmd->mei_cl.buf_size;
+	ssize_t sent;
+	ssize_t received;
+	uint32_t status;
+
+	*read_buf = malloc(buf_sz);
+	if (!*read_buf)
+		return AMT_STATUS_SDK_RESOURCES;
+	memset(*read_buf, 0, buf_sz);
+	reply = (struct amt_host_if_resp_header *)*read_buf;
+
+	sent = mei_send_msg(&acmd->mei_cl, command, command_sz,
+			    acmd->send_timeout);
+	if (sent != command_sz)
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	received = mei_recv_msg(&acmd->mei_cl, *read_buf, buf_sz, 2000);
+	if (received <= 0)
+		return AMT_STATUS_HOST_IF_EMPTY_RESPONSE;
+
+	/* A failure reported by the firmware wins over header validation. */
+	status = reply->status;
+	if (status != AMT_STATUS_SUCCESS)
+		return status;
+
+	status = amt_verify_response_header(rcmd, &reply->header, received);
+	if (status != AMT_STATUS_SUCCESS)
+		return status;
+
+	if (expected_sz && expected_sz != received)
+		return AMT_STATUS_INTERNAL_ERROR;
+
+	return AMT_STATUS_SUCCESS;
+}
+
+
+/*
+ * Query the firmware for its code versions.
+ *
+ * @cmd:      initialised session.
+ * @versions: out: filled with the reply payload on success, untouched on
+ *            failure.
+ *
+ * Sends CODE_VERSION_REQ, validates the reply with
+ * amt_verify_code_versions() and copies sizeof(struct amt_code_versions)
+ * bytes of payload into @versions. The reply buffer is always released
+ * before returning.
+ *
+ * Returns AMT_STATUS_SUCCESS or the status of the step that failed.
+ */
+static uint32_t amt_get_code_versions(struct amt_host_if *cmd,
+				      struct amt_code_versions *versions)
+{
+	struct amt_host_if_resp_header *reply;
+	uint8_t *raw = NULL;
+	uint32_t status;
+
+	status = amt_host_if_call(cmd,
+				  (const unsigned char *)&CODE_VERSION_REQ,
+				  sizeof(CODE_VERSION_REQ),
+				  &raw,
+				  AMT_HOST_IF_CODE_VERSIONS_RESPONSE, 0);
+	if (status == AMT_STATUS_SUCCESS) {
+		reply = (struct amt_host_if_resp_header *)raw;
+
+		status = amt_verify_code_versions(reply);
+		if (status == AMT_STATUS_SUCCESS)
+			memcpy(versions, reply->data, sizeof(*versions));
+	}
+
+	/* free(NULL) is a no-op, so no guard is needed */
+	free(raw);
+
+	return status;
+}
+
+/************************** end of amt_host_if_command ***********************/
+/*
+ * Report whether Intel AMT is enabled and, if so, print every
+ * "description:<TAB>version" pair returned by the firmware.
+ *
+ * Passing "-v" as the first argument turns on verbose mode for the MEI
+ * layer.
+ *
+ * Exit status: 0 when AMT is enabled or disabled (empty response), 1 when
+ * initialisation fails or any other status is returned.
+ */
+int main(int argc, char **argv)
+{
+	struct amt_code_versions ver;
+	struct amt_host_if acmd;
+	bool verbose = false;
+	uint32_t status;
+	unsigned int idx;
+
+	if (argc > 1 && strcmp(argv[1], "-v") == 0)
+		verbose = true;
+
+	if (!amt_host_if_init(&acmd, 5000, verbose))
+		return 1;
+
+	status = amt_get_code_versions(&acmd, &ver);
+
+	/* The session is no longer needed once the reply has been copied. */
+	amt_host_if_deinit(&acmd);
+
+	if (status == AMT_STATUS_HOST_IF_EMPTY_RESPONSE) {
+		printf("Intel AMT: DISABLED\n");
+		return 0;
+	}
+
+	if (status != AMT_STATUS_SUCCESS) {
+		printf("An error has occurred\n");
+		return 1;
+	}
+
+	printf("Intel AMT: ENABLED\n");
+	for (idx = 0; idx < ver.count; idx++) {
+		const struct amt_version_type *entry = &ver.versions[idx];
+
+		printf("%s:\t%s\n", entry->description.string,
+		       entry->version.string);
+	}
+
+	return 0;
+}
diff --git a/samples/nitro_enclaves/.gitignore b/samples/nitro_enclaves/.gitignore
new file mode 100644
index 000000000000..6a718eec71f4
--- /dev/null
+++ b/samples/nitro_enclaves/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+/ne_ioctl_sample
diff --git a/samples/nitro_enclaves/Makefile b/samples/nitro_enclaves/Makefile
new file mode 100644
index 000000000000..a3ec78fefb52
--- /dev/null
+++ b/samples/nitro_enclaves/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+
+# Enclave lifetime management support for Nitro Enclaves (NE) - ioctl sample
+# usage.
+
+.PHONY: all clean
+
+CFLAGS += -Wall
+
+all:
+ $(CC) $(CFLAGS) -o ne_ioctl_sample ne_ioctl_sample.c -lpthread
+
+clean:
+ rm -f ne_ioctl_sample
diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c
new file mode 100644
index 000000000000..765b131c7319
--- /dev/null
+++ b/samples/nitro_enclaves/ne_ioctl_sample.c
@@ -0,0 +1,882 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ */
+
+/**
+ * DOC: Sample flow of using the ioctl interface provided by the Nitro Enclaves (NE)
+ * kernel driver.
+ *
+ * Usage
+ * -----
+ *
+ * Load the nitro_enclaves module, setting also the enclave CPU pool. The
+ * enclave CPUs need to be full cores from the same NUMA node. CPU 0 and its
+ * siblings have to remain available for the primary / parent VM, so they
+ * cannot be included in the enclave CPU pool.
+ *
+ * See the cpu list section from the kernel documentation.
+ * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists
+ *
+ * insmod drivers/virt/nitro_enclaves/nitro_enclaves.ko
+ * lsmod
+ *
+ * The CPU pool can be set at runtime, after the kernel module is loaded.
+ *
+ * echo <cpu-list> > /sys/module/nitro_enclaves/parameters/ne_cpus
+ *
+ * NUMA and CPU siblings information can be found using:
+ *
+ * lscpu
+ * /proc/cpuinfo
+ *
+ * Check the online / offline CPU list. The CPUs from the pool should be
+ * offlined.
+ *
+ * lscpu
+ *
+ * Check dmesg for any warnings / errors through the NE driver lifetime / usage.
+ * The NE logs contain the "nitro_enclaves" or "pci 0000:00:02.0" pattern.
+ *
+ * dmesg
+ *
+ * Setup hugetlbfs huge pages. The memory needs to be from the same NUMA node as
+ * the enclave CPUs.
+ *
+ * https://www.kernel.org/doc/html/latest/admin-guide/mm/hugetlbpage.html
+ *
+ * By default, the allocation of hugetlb pages are distributed on all possible
+ * NUMA nodes. Use the following configuration files to set the number of huge
+ * pages from a NUMA node:
+ *
+ * /sys/devices/system/node/node<X>/hugepages/hugepages-2048kB/nr_hugepages
+ * /sys/devices/system/node/node<X>/hugepages/hugepages-1048576kB/nr_hugepages
+ *
+ * or, if not on a system with multiple NUMA nodes, can also set the number
+ * of 2 MiB / 1 GiB huge pages using
+ *
+ * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+ * /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
+ *
+ * In this example 256 hugepages of 2 MiB are used.
+ *
+ * Build and run the NE sample.
+ *
+ * make -C samples/nitro_enclaves clean
+ * make -C samples/nitro_enclaves
+ * ./samples/nitro_enclaves/ne_ioctl_sample <path_to_enclave_image>
+ *
+ * Unload the nitro_enclaves module.
+ *
+ * rmmod nitro_enclaves
+ * lsmod
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <poll.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <linux/mman.h>
+#include <linux/nitro_enclaves.h>
+#include <linux/vm_sockets.h>
+
+/**
+ * NE_DEV_NAME - Nitro Enclaves (NE) misc device that provides the ioctl interface.
+ */
+#define NE_DEV_NAME "/dev/nitro_enclaves"
+
+/**
+ * NE_POLL_WAIT_TIME - Timeout in seconds for each poll event.
+ */
+#define NE_POLL_WAIT_TIME (60)
+/**
+ * NE_POLL_WAIT_TIME_MS - Timeout in milliseconds for each poll event.
+ */
+#define NE_POLL_WAIT_TIME_MS (NE_POLL_WAIT_TIME * 1000)
+
+/**
+ * NE_SLEEP_TIME - Amount of time in seconds for the process to keep the enclave alive.
+ */
+#define NE_SLEEP_TIME (300)
+
+/**
+ * NE_DEFAULT_NR_VCPUS - Default number of vCPUs set for an enclave.
+ */
+#define NE_DEFAULT_NR_VCPUS (2)
+
+/**
+ * NE_MIN_MEM_REGION_SIZE - Minimum size of a memory region - 2 MiB.
+ */
+#define NE_MIN_MEM_REGION_SIZE (2 * 1024 * 1024)
+
+/**
+ * NE_DEFAULT_NR_MEM_REGIONS - Default number of memory regions of 2 MiB set for
+ * an enclave.
+ */
+#define NE_DEFAULT_NR_MEM_REGIONS (256)
+
+/**
+ * NE_IMAGE_LOAD_HEARTBEAT_CID - Vsock CID for enclave image loading heartbeat logic.
+ */
+#define NE_IMAGE_LOAD_HEARTBEAT_CID (3)
+/**
+ * NE_IMAGE_LOAD_HEARTBEAT_PORT - Vsock port for enclave image loading heartbeat logic.
+ */
+#define NE_IMAGE_LOAD_HEARTBEAT_PORT (9000)
+/**
+ * NE_IMAGE_LOAD_HEARTBEAT_VALUE - Heartbeat value for enclave image loading.
+ */
+#define NE_IMAGE_LOAD_HEARTBEAT_VALUE (0xb7)
+
+/**
+ * struct ne_user_mem_region - User space memory region set for an enclave.
+ * @userspace_addr: Address of the user space memory region, as returned by
+ *                  mmap() in ne_alloc_user_mem_region().
+ * @memory_size: Size in bytes of the user space memory region; set by the
+ *               caller before the region is allocated.
+ */
+struct ne_user_mem_region {
+ void *userspace_addr;
+ size_t memory_size;
+};
+
+/**
+ * ne_create_vm() - Create a slot for the enclave VM.
+ * @ne_dev_fd: The file descriptor of the NE misc device.
+ * @slot_uid: The generated slot uid for the enclave.
+ * @enclave_fd : The generated file descriptor for the enclave.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd)
+{
+	/* On success the ioctl's return value is the new enclave fd. */
+	int fd = ioctl(ne_dev_fd, NE_CREATE_VM, slot_uid);
+
+	*enclave_fd = fd;
+	if (fd >= 0)
+		return 0;
+
+	/* The driver reports its own error codes through errno. */
+	if (errno == NE_ERR_NO_CPUS_AVAIL_IN_POOL)
+		printf("Error in create VM, no CPUs available in the NE CPU pool\n");
+	else
+		printf("Error in create VM [%m]\n");
+
+	return fd;
+}
+
+/**
+ * ne_poll_enclave_fd() - Thread function for polling the enclave fd.
+ * @data: Argument provided for the polling function.
+ *
+ * Context: Process context.
+ * Return:
+ * * NULL on success / failure.
+ */
+void *ne_poll_enclave_fd(void *data)
+{
+	/* @data points at the enclave fd owned by the creating thread. */
+	struct pollfd pfd = {
+		.fd = *(int *)data,
+		.events = POLLIN | POLLERR | POLLHUP,
+	};
+	int iter = 0;
+
+	printf("Running from poll thread, enclave fd %d\n", pfd.fd);
+
+	/* Keep on polling until the current process is terminated. */
+	for (;;) {
+		int nready;
+
+		printf("[iter %d] Polling ...\n", iter);
+
+		nready = poll(&pfd, 1, NE_POLL_WAIT_TIME_MS);
+		if (nready < 0) {
+			printf("Error in poll [%m]\n");
+
+			break;
+		}
+
+		iter++;
+
+		/* 0 means the timeout expired without any event. */
+		if (nready == 0) {
+			printf("Poll: %d seconds elapsed\n",
+			       iter * NE_POLL_WAIT_TIME);
+
+			continue;
+		}
+
+		printf("Poll received value 0x%x\n", pfd.revents);
+
+		if (pfd.revents & POLLHUP) {
+			printf("Received POLLHUP\n");
+
+			break;
+		}
+
+		if (pfd.revents & POLLNVAL) {
+			printf("Received POLLNVAL\n");
+
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * ne_alloc_user_mem_region() - Allocate a user space memory region for an enclave.
+ * @ne_user_mem_region: User space memory region allocated using hugetlbfs.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_alloc_user_mem_region(struct ne_user_mem_region *ne_user_mem_region)
+{
+	/**
+	 * Check available hugetlb encodings for different huge page sizes in
+	 * include/uapi/linux/mman.h.
+	 */
+	const int map_flags = MAP_PRIVATE | MAP_ANONYMOUS |
+			      MAP_HUGETLB | MAP_HUGE_2MB;
+	void *addr;
+
+	addr = mmap(NULL, ne_user_mem_region->memory_size,
+		    PROT_READ | PROT_WRITE, map_flags, -1, 0);
+
+	/* The result is stored even on failure, exactly as mmap() gave it. */
+	ne_user_mem_region->userspace_addr = addr;
+	if (addr == MAP_FAILED) {
+		printf("Error in mmap memory [%m]\n");
+
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * ne_load_enclave_image() - Place the enclave image in the enclave memory.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ * @ne_user_mem_regions: User space memory regions allocated for the enclave.
+ * @enclave_image_path : The file path of the enclave image.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_load_enclave_image(int enclave_fd, struct ne_user_mem_region ne_user_mem_regions[],
+				 char *enclave_image_path)
+{
+	unsigned char *enclave_image = NULL;
+	int enclave_image_fd = -1;
+	size_t enclave_image_size = 0;
+	size_t enclave_memory_size = 0;
+	unsigned long i = 0;
+	size_t image_written_bytes = 0;
+	struct ne_image_load_info image_load_info = {
+		.flags = NE_EIF_IMAGE,
+	};
+	struct stat image_stat_buf = {};
+	int rc = -EINVAL;
+	size_t temp_image_offset = 0;
+
+	/* Total memory available across all regions. */
+	for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++)
+		enclave_memory_size += ne_user_mem_regions[i].memory_size;
+
+	rc = stat(enclave_image_path, &image_stat_buf);
+	if (rc < 0) {
+		printf("Error in get image stat info [%m]\n");
+
+		return rc;
+	}
+
+	enclave_image_size = image_stat_buf.st_size;
+
+	if (enclave_memory_size < enclave_image_size) {
+		printf("The enclave memory is smaller than the enclave image size\n");
+
+		return -ENOMEM;
+	}
+
+	/* Ask the driver at which offset in enclave memory the image goes. */
+	rc = ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &image_load_info);
+	if (rc < 0) {
+		switch (errno) {
+		case NE_ERR_NOT_IN_INIT_STATE: {
+			printf("Error in get image load info, enclave not in init state\n");
+
+			break;
+		}
+
+		case NE_ERR_INVALID_FLAG_VALUE: {
+			printf("Error in get image load info, provided invalid flag\n");
+
+			break;
+		}
+
+		default:
+			printf("Error in get image load info [%m]\n");
+		}
+
+		return rc;
+	}
+
+	/* The offset is unsigned, so print it as such. */
+	printf("Enclave image offset in enclave memory is %llu\n",
+	       (unsigned long long)image_load_info.memory_offset);
+
+	enclave_image_fd = open(enclave_image_path, O_RDONLY);
+	if (enclave_image_fd < 0) {
+		printf("Error in open enclave image file [%m]\n");
+
+		return enclave_image_fd;
+	}
+
+	enclave_image = mmap(NULL, enclave_image_size, PROT_READ,
+			     MAP_PRIVATE, enclave_image_fd, 0);
+	if (enclave_image == MAP_FAILED) {
+		printf("Error in mmap enclave image [%m]\n");
+
+		/* Do not leak the image fd on this path. */
+		rc = -1;
+
+		goto close_image_fd;
+	}
+
+	temp_image_offset = image_load_info.memory_offset;
+
+	for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
+		size_t bytes_to_write = 0;
+		size_t memory_offset = 0;
+		size_t memory_size = ne_user_mem_regions[i].memory_size;
+		size_t remaining_bytes = 0;
+		unsigned char *userspace_addr = ne_user_mem_regions[i].userspace_addr;
+
+		/* Skip whole regions that lie before the load offset. */
+		if (temp_image_offset >= memory_size) {
+			temp_image_offset -= memory_size;
+
+			continue;
+		} else if (temp_image_offset != 0) {
+			/* The load offset falls inside this region. */
+			memory_offset = temp_image_offset;
+			memory_size -= temp_image_offset;
+			temp_image_offset = 0;
+		}
+
+		remaining_bytes = enclave_image_size - image_written_bytes;
+		bytes_to_write = memory_size < remaining_bytes ?
+				 memory_size : remaining_bytes;
+
+		memcpy(userspace_addr + memory_offset,
+		       enclave_image + image_written_bytes, bytes_to_write);
+
+		image_written_bytes += bytes_to_write;
+
+		if (image_written_bytes == enclave_image_size)
+			break;
+	}
+
+	/*
+	 * The size check above ignores the load offset, so the regions can
+	 * run out before the whole image is placed. Report that instead of
+	 * returning success for a truncated image.
+	 */
+	if (image_written_bytes != enclave_image_size) {
+		printf("The enclave memory past the image offset is smaller than the enclave image size\n");
+
+		rc = -ENOMEM;
+	} else {
+		rc = 0;
+	}
+
+	munmap(enclave_image, enclave_image_size);
+
+close_image_fd:
+	close(enclave_image_fd);
+
+	return rc;
+}
+
+/**
+ * ne_set_user_mem_region() - Set a user space memory region for the given enclave.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ * @ne_user_mem_region : User space memory region to be set for the enclave.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_set_user_mem_region(int enclave_fd, struct ne_user_mem_region ne_user_mem_region)
+{
+	/* NE-specific errno values and the message printed for each one. */
+	static const struct {
+		int err;
+		const char *msg;
+	} known_errors[] = {
+		{ NE_ERR_NOT_IN_INIT_STATE,
+		  "Error in set user memory region, enclave not in init state\n" },
+		{ NE_ERR_INVALID_MEM_REGION_SIZE,
+		  "Error in set user memory region, mem size not multiple of 2 MiB\n" },
+		{ NE_ERR_INVALID_MEM_REGION_ADDR,
+		  "Error in set user memory region, invalid user space address\n" },
+		{ NE_ERR_UNALIGNED_MEM_REGION_ADDR,
+		  "Error in set user memory region, unaligned user space address\n" },
+		{ NE_ERR_MEM_REGION_ALREADY_USED,
+		  "Error in set user memory region, memory region already used\n" },
+		{ NE_ERR_MEM_NOT_HUGE_PAGE,
+		  "Error in set user memory region, not backed by huge pages\n" },
+		{ NE_ERR_MEM_DIFFERENT_NUMA_NODE,
+		  "Error in set user memory region, different NUMA node than CPUs\n" },
+		{ NE_ERR_MEM_MAX_REGIONS,
+		  "Error in set user memory region, max memory regions reached\n" },
+		{ NE_ERR_INVALID_PAGE_SIZE,
+		  "Error in set user memory region, has page not multiple of 2 MiB\n" },
+		{ NE_ERR_INVALID_FLAG_VALUE,
+		  "Error in set user memory region, provided invalid flag\n" },
+	};
+	struct ne_user_memory_region mem_region = {
+		.flags = NE_DEFAULT_MEMORY_REGION,
+		.memory_size = ne_user_mem_region.memory_size,
+		.userspace_addr = (__u64)ne_user_mem_region.userspace_addr,
+	};
+	size_t idx;
+	int rc;
+
+	rc = ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &mem_region);
+	if (rc >= 0)
+		return 0;
+
+	for (idx = 0; idx < sizeof(known_errors) / sizeof(known_errors[0]); idx++) {
+		if (errno == known_errors[idx].err) {
+			printf("%s", known_errors[idx].msg);
+
+			return rc;
+		}
+	}
+
+	/* Not an NE-specific code: let %m describe errno. */
+	printf("Error in set user memory region [%m]\n");
+
+	return rc;
+}
+
+/**
+ * ne_free_mem_regions() - Unmap all the user space memory regions that were set
+ * aside for the enclave.
+ * @ne_user_mem_regions: The user space memory regions associated with an enclave.
+ *
+ * Context: Process context.
+ */
+static void ne_free_mem_regions(struct ne_user_mem_region ne_user_mem_regions[])
+{
+	struct ne_user_mem_region *region = ne_user_mem_regions;
+	struct ne_user_mem_region *end = region + NE_DEFAULT_NR_MEM_REGIONS;
+
+	/* Every slot is passed to munmap(); its return value is ignored. */
+	for (; region != end; region++)
+		munmap(region->userspace_addr, region->memory_size);
+}
+
+/**
+ * ne_add_vcpu() - Add a vCPU to the given enclave.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ * @vcpu_id: vCPU id to be set for the enclave, either provided or
+ * auto-generated (if provided vCPU id is 0).
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id)
+{
+	/* NE-specific errno values and the message printed for each one. */
+	static const struct {
+		int err;
+		const char *msg;
+	} known_errors[] = {
+		{ NE_ERR_NO_CPUS_AVAIL_IN_POOL,
+		  "Error in add vcpu, no CPUs available in the NE CPU pool\n" },
+		{ NE_ERR_VCPU_ALREADY_USED,
+		  "Error in add vcpu, the provided vCPU is already used\n" },
+		{ NE_ERR_VCPU_NOT_IN_CPU_POOL,
+		  "Error in add vcpu, the provided vCPU is not in the NE CPU pool\n" },
+		{ NE_ERR_VCPU_INVALID_CPU_CORE,
+		  "Error in add vcpu, the core id of the provided vCPU is invalid\n" },
+		{ NE_ERR_NOT_IN_INIT_STATE,
+		  "Error in add vcpu, enclave not in init state\n" },
+		{ NE_ERR_INVALID_VCPU,
+		  "Error in add vcpu, the provided vCPU is out of avail CPUs range\n" },
+	};
+	size_t idx;
+	int rc;
+
+	rc = ioctl(enclave_fd, NE_ADD_VCPU, vcpu_id);
+	if (rc >= 0)
+		return 0;
+
+	for (idx = 0; idx < sizeof(known_errors) / sizeof(known_errors[0]); idx++) {
+		if (errno == known_errors[idx].err) {
+			printf("%s", known_errors[idx].msg);
+
+			return rc;
+		}
+	}
+
+	/* Not an NE-specific code: let %m describe errno. */
+	printf("Error in add vcpu [%m]\n");
+
+	return rc;
+}
+
+/**
+ * ne_start_enclave() - Start the given enclave.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ * @enclave_start_info : Enclave metadata used for starting e.g. vsock CID.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *enclave_start_info)
+{
+	/* NE-specific errno values and the message printed for each one. */
+	static const struct {
+		int err;
+		const char *msg;
+	} known_errors[] = {
+		{ NE_ERR_NOT_IN_INIT_STATE,
+		  "Error in start enclave, enclave not in init state\n" },
+		{ NE_ERR_NO_MEM_REGIONS_ADDED,
+		  "Error in start enclave, no memory regions have been added\n" },
+		{ NE_ERR_NO_VCPUS_ADDED,
+		  "Error in start enclave, no vCPUs have been added\n" },
+		{ NE_ERR_FULL_CORES_NOT_USED,
+		  "Error in start enclave, enclave has no full cores set\n" },
+		{ NE_ERR_ENCLAVE_MEM_MIN_SIZE,
+		  "Error in start enclave, enclave memory is less than min size\n" },
+		{ NE_ERR_INVALID_FLAG_VALUE,
+		  "Error in start enclave, provided invalid flag\n" },
+		{ NE_ERR_INVALID_ENCLAVE_CID,
+		  "Error in start enclave, provided invalid enclave CID\n" },
+	};
+	size_t idx;
+	int rc;
+
+	rc = ioctl(enclave_fd, NE_START_ENCLAVE, enclave_start_info);
+	if (rc >= 0)
+		return 0;
+
+	for (idx = 0; idx < sizeof(known_errors) / sizeof(known_errors[0]); idx++) {
+		if (errno == known_errors[idx].err) {
+			printf("%s", known_errors[idx].msg);
+
+			return rc;
+		}
+	}
+
+	/* Not an NE-specific code: let %m describe errno. */
+	printf("Error in start enclave [%m]\n");
+
+	return rc;
+}
+
+/**
+ * ne_start_enclave_check_booted() - Start the enclave and wait for a heartbeat
+ * from it, on a newly created vsock channel,
+ * to check it has booted.
+ * @enclave_fd : The file descriptor associated with the enclave.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 on success.
+ * * Negative return value on failure.
+ */
+static int ne_start_enclave_check_booted(int enclave_fd)
+{
+	struct sockaddr_vm client_vsock_addr = {};
+	int client_vsock_fd = -1;
+	socklen_t client_vsock_len = sizeof(client_vsock_addr);
+	struct ne_enclave_start_info enclave_start_info = {};
+	struct pollfd fds[1] = {};
+	int rc = -EINVAL;
+	unsigned char recv_buf = 0;
+	struct sockaddr_vm server_vsock_addr = {
+		.svm_family = AF_VSOCK,
+		.svm_cid = NE_IMAGE_LOAD_HEARTBEAT_CID,
+		.svm_port = NE_IMAGE_LOAD_HEARTBEAT_PORT,
+	};
+	int server_vsock_fd = -1;
+
+	/*
+	 * The listening socket is set up before the enclave is started, so
+	 * the enclave's connect cannot arrive before anyone is listening.
+	 */
+	server_vsock_fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+	if (server_vsock_fd < 0) {
+		rc = server_vsock_fd;
+
+		printf("Error in socket [%m]\n");
+
+		return rc;
+	}
+
+	rc = bind(server_vsock_fd, (struct sockaddr *)&server_vsock_addr,
+		  sizeof(server_vsock_addr));
+	if (rc < 0) {
+		printf("Error in bind [%m]\n");
+
+		goto out;
+	}
+
+	rc = listen(server_vsock_fd, 1);
+	if (rc < 0) {
+		printf("Error in listen [%m]\n");
+
+		goto out;
+	}
+
+	rc = ne_start_enclave(enclave_fd, &enclave_start_info);
+	if (rc < 0)
+		goto out;
+
+	printf("Enclave started, CID %llu\n", enclave_start_info.enclave_cid);
+
+	fds[0].fd = server_vsock_fd;
+	fds[0].events = POLLIN;
+
+	rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS);
+	if (rc < 0) {
+		printf("Error in poll [%m]\n");
+
+		goto out;
+	}
+
+	if (!rc) {
+		printf("Poll timeout, %d seconds elapsed\n", NE_POLL_WAIT_TIME);
+
+		rc = -ETIMEDOUT;
+
+		goto out;
+	}
+
+	if ((fds[0].revents & POLLIN) == 0) {
+		printf("Poll received value %d\n", fds[0].revents);
+
+		rc = -EINVAL;
+
+		goto out;
+	}
+
+	rc = accept(server_vsock_fd, (struct sockaddr *)&client_vsock_addr,
+		    &client_vsock_len);
+	if (rc < 0) {
+		printf("Error in accept [%m]\n");
+
+		goto out;
+	}
+
+	client_vsock_fd = rc;
+
+	/*
+	 * Read the heartbeat value that the init process in the enclave sends
+	 * after vsock connect.
+	 */
+	rc = read(client_vsock_fd, &recv_buf, sizeof(recv_buf));
+	if (rc < 0) {
+		printf("Error in read [%m]\n");
+
+		goto out;
+	}
+
+	if (rc != sizeof(recv_buf) || recv_buf != NE_IMAGE_LOAD_HEARTBEAT_VALUE) {
+		printf("Read %d instead of %d\n", recv_buf,
+		       NE_IMAGE_LOAD_HEARTBEAT_VALUE);
+
+		/*
+		 * rc holds the non-negative read() count here; callers test
+		 * for rc < 0, so a bad heartbeat must be made negative.
+		 */
+		rc = -EINVAL;
+
+		goto out;
+	}
+
+	/* Write the heartbeat value back. */
+	rc = write(client_vsock_fd, &recv_buf, sizeof(recv_buf));
+	if (rc < 0) {
+		printf("Error in write [%m]\n");
+
+		goto out;
+	}
+
+	rc = 0;
+
+out:
+	/* The accepted connection is only needed for the heartbeat. */
+	if (client_vsock_fd >= 0)
+		close(client_vsock_fd);
+
+	close(server_vsock_fd);
+
+	return rc;
+}
+
+/*
+ * Sample flow: create an enclave slot, allocate and populate its memory,
+ * add vCPUs, start the enclave, keep it alive for NE_SLEEP_TIME seconds
+ * and then release everything.
+ *
+ * Takes exactly one argument, the path to the enclave image. Exits with
+ * EXIT_SUCCESS after the sleep and with EXIT_FAILURE on any error.
+ */
+int main(int argc, char *argv[])
+{
+	int enclave_fd = -1;
+	unsigned int i = 0;
+	int ne_dev_fd = -1;
+	struct ne_user_mem_region ne_user_mem_regions[NE_DEFAULT_NR_MEM_REGIONS] = {};
+	unsigned int ne_vcpus[NE_DEFAULT_NR_VCPUS] = {};
+	int rc = -EINVAL;
+	pthread_t thread_id = 0;
+	unsigned long slot_uid = 0;
+
+	if (argc != 2) {
+		printf("Usage: %s <path_to_enclave_image>\n", argv[0]);
+
+		exit(EXIT_FAILURE);
+	}
+
+	if (strlen(argv[1]) >= PATH_MAX) {
+		printf("The size of the path to enclave image is higher than max path\n");
+
+		exit(EXIT_FAILURE);
+	}
+
+	ne_dev_fd = open(NE_DEV_NAME, O_RDWR | O_CLOEXEC);
+	if (ne_dev_fd < 0) {
+		printf("Error in open NE device [%m]\n");
+
+		exit(EXIT_FAILURE);
+	}
+
+	printf("Creating enclave slot ...\n");
+
+	rc = ne_create_vm(ne_dev_fd, &slot_uid, &enclave_fd);
+
+	/* The device fd is only needed to create the slot. */
+	close(ne_dev_fd);
+
+	if (rc < 0)
+		exit(EXIT_FAILURE);
+
+	printf("Enclave fd %d\n", enclave_fd);
+
+	/*
+	 * pthread_create() returns 0 on success and a positive error number
+	 * on failure; it does not set errno, so decode the return value.
+	 */
+	rc = pthread_create(&thread_id, NULL, ne_poll_enclave_fd, (void *)&enclave_fd);
+	if (rc != 0) {
+		printf("Error in thread create [%s]\n", strerror(rc));
+
+		close(enclave_fd);
+
+		exit(EXIT_FAILURE);
+	}
+
+	for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
+		ne_user_mem_regions[i].memory_size = NE_MIN_MEM_REGION_SIZE;
+
+		rc = ne_alloc_user_mem_region(&ne_user_mem_regions[i]);
+		if (rc < 0) {
+			printf("Error in alloc userspace memory region, iter %d\n", i);
+
+			goto release_enclave_fd;
+		}
+	}
+
+	/* The image is copied into the regions before they are handed over. */
+	rc = ne_load_enclave_image(enclave_fd, ne_user_mem_regions, argv[1]);
+	if (rc < 0)
+		goto release_enclave_fd;
+
+	for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
+		rc = ne_set_user_mem_region(enclave_fd, ne_user_mem_regions[i]);
+		if (rc < 0) {
+			printf("Error in set memory region, iter %d\n", i);
+
+			goto release_enclave_fd;
+		}
+	}
+
+	printf("Enclave memory regions were added\n");
+
+	for (i = 0; i < NE_DEFAULT_NR_VCPUS; i++) {
+		/*
+		 * The vCPU is chosen from the enclave vCPU pool, if the value
+		 * of the vcpu_id is 0.
+		 */
+		ne_vcpus[i] = 0;
+		rc = ne_add_vcpu(enclave_fd, &ne_vcpus[i]);
+		if (rc < 0) {
+			printf("Error in add vcpu, iter %d\n", i);
+
+			goto release_enclave_fd;
+		}
+
+		printf("Added vCPU %d to the enclave\n", ne_vcpus[i]);
+	}
+
+	printf("Enclave vCPUs were added\n");
+
+	rc = ne_start_enclave_check_booted(enclave_fd);
+	if (rc < 0) {
+		printf("Error in the enclave start / image loading heartbeat logic [rc=%d]\n", rc);
+
+		goto release_enclave_fd;
+	}
+
+	printf("Entering sleep for %d seconds ...\n", NE_SLEEP_TIME);
+
+	sleep(NE_SLEEP_TIME);
+
+	close(enclave_fd);
+
+	ne_free_mem_regions(ne_user_mem_regions);
+
+	exit(EXIT_SUCCESS);
+
+release_enclave_fd:
+	close(enclave_fd);
+	ne_free_mem_regions(ne_user_mem_regions);
+
+	exit(EXIT_FAILURE);
+}
diff --git a/samples/pfsm/.gitignore b/samples/pfsm/.gitignore
new file mode 100644
index 000000000000..f350a030a060
--- /dev/null
+++ b/samples/pfsm/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+/pfsm-wakeup
diff --git a/samples/pfsm/Makefile b/samples/pfsm/Makefile
new file mode 100644
index 000000000000..213e8d9f5dbc
--- /dev/null
+++ b/samples/pfsm/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += pfsm-wakeup
+
+userccflags += -I usr/include
diff --git a/samples/pfsm/pfsm-wakeup.c b/samples/pfsm/pfsm-wakeup.c
new file mode 100644
index 000000000000..299dd9e1f607
--- /dev/null
+++ b/samples/pfsm/pfsm-wakeup.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * TPS6594 PFSM userspace example
+ *
+ * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/
+ *
+ * This example shows how to use PFSMs from a userspace application,
+ * on TI j721s2 platform. The PMIC is armed to be triggered by a RTC
+ * alarm to execute state transition (RETENTION to ACTIVE).
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include <linux/rtc.h>
+#include <linux/tps6594_pfsm.h>
+
+#define ALARM_DELTA_SEC 30
+
+#define RTC_A "/dev/rtc0"
+
+#define PMIC_NB 3
+#define PMIC_A "/dev/pfsm-0-0x48"
+#define PMIC_B "/dev/pfsm-0-0x4c"
+#define PMIC_C "/dev/pfsm-2-0x58"
+
+/* PFSM device nodes in the order PMIC_A, PMIC_B, PMIC_C; main() opens one fd per entry. */
+static const char * const dev_pfsm[] = {PMIC_A, PMIC_B, PMIC_C};
+
+/*
+ * Arm an RTC alarm ALARM_DELTA_SEC seconds ahead, put the PMICs into the
+ * RETENTION state and wait for the alarm interrupt.
+ *
+ * Returns 0 once the alarm has been received and 1 if any step failed.
+ */
+int main(int argc, char *argv[])
+{
+	int i, ret, fd_rtc, fd_pfsm[PMIC_NB];
+	int status = 1;
+	struct rtc_time rtc_tm;
+	struct pmic_state_opt pmic_opt = { 0 };
+	unsigned long data;
+
+	/*
+	 * -1 marks "not opened". 0 cannot serve as the sentinel because it
+	 * is a valid descriptor and a failed open() returns -1.
+	 */
+	for (i = 0 ; i < PMIC_NB ; i++)
+		fd_pfsm[i] = -1;
+
+	fd_rtc = open(RTC_A, O_RDONLY);
+	if (fd_rtc < 0) {
+		perror("Failed to open RTC device.");
+		goto out;
+	}
+
+	for (i = 0 ; i < PMIC_NB ; i++) {
+		fd_pfsm[i] = open(dev_pfsm[i], O_RDWR);
+		if (fd_pfsm[i] < 0) {
+			perror("Failed to open PFSM device.");
+			goto out;
+		}
+	}
+
+	/* Read RTC date/time */
+	ret = ioctl(fd_rtc, RTC_RD_TIME, &rtc_tm);
+	if (ret < 0) {
+		perror("Failed to read RTC date/time.");
+		goto out;
+	}
+	printf("Current RTC date/time is %d-%d-%d, %02d:%02d:%02d.\n",
+	       rtc_tm.tm_mday, rtc_tm.tm_mon + 1, rtc_tm.tm_year + 1900,
+	       rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec);
+
+	/* Set RTC alarm to ALARM_DELTA_SEC sec in the future, and check for rollover */
+	rtc_tm.tm_sec += ALARM_DELTA_SEC;
+	if (rtc_tm.tm_sec >= 60) {
+		rtc_tm.tm_sec %= 60;
+		rtc_tm.tm_min++;
+	}
+	if (rtc_tm.tm_min == 60) {
+		rtc_tm.tm_min = 0;
+		rtc_tm.tm_hour++;
+	}
+	if (rtc_tm.tm_hour == 24)
+		rtc_tm.tm_hour = 0;
+	ret = ioctl(fd_rtc, RTC_ALM_SET, &rtc_tm);
+	if (ret < 0) {
+		perror("Failed to set RTC alarm.");
+		goto out;
+	}
+
+	/* Enable alarm interrupts */
+	ret = ioctl(fd_rtc, RTC_AIE_ON, 0);
+	if (ret < 0) {
+		perror("Failed to enable alarm interrupts.");
+		goto out;
+	}
+	printf("Waiting %d seconds for alarm...\n", ALARM_DELTA_SEC);
+
+	/*
+	 * Set RETENTION state with options for PMIC_C/B/A respectively.
+	 * Since PMIC_A is master, it should be the last one to be configured.
+	 */
+	pmic_opt.ddr_retention = 1;
+	for (i = PMIC_NB - 1 ; i >= 0 ; i--) {
+		printf("Set RETENTION state for PMIC_%d.\n", i);
+		sleep(1);
+		ret = ioctl(fd_pfsm[i], PMIC_SET_RETENTION_STATE, &pmic_opt);
+		if (ret < 0) {
+			perror("Failed to set RETENTION state.");
+			goto out_reset;
+		}
+	}
+
+	/* This blocks until the alarm ring causes an interrupt */
+	ret = read(fd_rtc, &data, sizeof(unsigned long));
+	if (ret < 0) {
+		perror("Failed to get RTC alarm.");
+	} else {
+		puts("Alarm rang.\n");
+		status = 0;
+	}
+
+out_reset:
+	ioctl(fd_rtc, RTC_AIE_OFF, 0);
+
+	/* Set ACTIVE state for PMIC_A */
+	ioctl(fd_pfsm[0], PMIC_SET_ACTIVE_STATE, 0);
+
+out:
+	/* Close only what was actually opened. */
+	for (i = 0 ; i < PMIC_NB ; i++)
+		if (fd_pfsm[i] >= 0)
+			close(fd_pfsm[i]);
+
+	if (fd_rtc >= 0)
+		close(fd_rtc);
+
+	return status;
+}
diff --git a/samples/pidfd/.gitignore b/samples/pidfd/.gitignore
new file mode 100644
index 000000000000..d4cfa3176b1b
--- /dev/null
+++ b/samples/pidfd/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/pidfd-metadata
diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile
new file mode 100644
index 000000000000..9754e2d81f70
--- /dev/null
+++ b/samples/pidfd/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += pidfd-metadata
+
+userccflags += -I usr/include
diff --git a/samples/pidfd/pidfd-metadata.c b/samples/pidfd/pidfd-metadata.c
new file mode 100644
index 000000000000..c459155daf9a
--- /dev/null
+++ b/samples/pidfd/pidfd-metadata.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#ifndef CLONE_PIDFD
+#define CLONE_PIDFD 0x00001000
+#endif
+
+#ifndef __NR_pidfd_send_signal
+#define __NR_pidfd_send_signal -1
+#endif
+
+/*
+ * Entry point of the cloned child: print the child's PID and terminate.
+ *
+ * stdout is flushed explicitly because _exit() does not flush stdio buffers;
+ * without the flush the PID line is lost whenever stdout is not line-buffered
+ * (for example when it is redirected to a file or a pipe).
+ *
+ * @args: unused (pidfd_clone() passes NULL)
+ *
+ * Does not return.
+ */
+static int do_child(void *args)
+{
+	(void)args;
+
+	printf("%d\n", getpid());
+	fflush(stdout);
+	_exit(EXIT_SUCCESS);
+}
+
+/*
+ * Clone a child that runs do_child(). SIGCHLD is always added to @flags so
+ * the parent can wait() for the child.
+ *
+ * @flags: clone flags (main() passes CLONE_PIDFD)
+ * @pidfd: passed through as the parent_tid argument of clone(); with
+ *         CLONE_PIDFD the kernel stores the new pidfd there
+ *
+ * The child stack is a plain byte buffer whose size is taken with sizeof, so
+ * the size handed to __clone2() and the top-of-stack address handed to
+ * clone() always describe the same memory. It is aligned to 16 bytes because
+ * clone() expects a suitably aligned stack top.
+ *
+ * Return: the child's pid in the parent, or -1 with errno set on failure.
+ */
+static pid_t pidfd_clone(int flags, int *pidfd)
+{
+	char stack[8192] __attribute__((aligned(16))) = { 0 };
+	size_t stack_size = sizeof(stack);
+
+#ifdef __ia64__
+	/* __clone2() takes the lowest stack address plus the size in bytes */
+	return __clone2(do_child, stack, stack_size, flags | SIGCHLD, NULL, pidfd);
+#else
+	/* clone() takes the topmost address of the child stack */
+	return clone(do_child, stack + stack_size, flags | SIGCHLD, NULL, pidfd);
+#endif
+}
+
+/*
+ * Thin wrapper that issues the pidfd_send_signal system call through
+ * syscall(), forwarding all four arguments unchanged.
+ *
+ * If the build headers do not provide __NR_pidfd_send_signal, the fallback
+ * earlier in this file defines it as -1; presumably the call then simply
+ * fails (TODO confirm the resulting errno).
+ *
+ * Return: whatever syscall() returns.
+ */
+static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
+ unsigned int flags)
+{
+ return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
+}
+
+/*
+ * Open /proc/<pid> and check, through @pidfd, that the directory still
+ * belongs to the process the caller means.
+ *
+ * @pid:   pid used to build the /proc/<pid> path
+ * @pidfd: pidfd referring to the same process
+ *
+ * Signal 0 is sent through @pidfd after the directory has been opened. EPERM
+ * is accepted (the process exists but may not be signalled); any other error
+ * makes the function give up and close the directory again.
+ *
+ * Return: an O_CLOEXEC directory file descriptor the caller must close, or
+ * -1 on failure (a diagnostic has been printed with warn()).
+ */
+static int pidfd_metadata_fd(pid_t pid, int pidfd)
+{
+	int procfd, ret;
+	char path[100];
+
+	snprintf(path, sizeof(path), "/proc/%d", pid);
+	procfd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+	if (procfd < 0) {
+		/* warn() appends ": <error string>" and the newline itself */
+		warn("Failed to open %s", path);
+		return -1;
+	}
+
+	/*
+	 * Verify that the pid has not been recycled and our /proc/<pid> handle
+	 * is still valid.
+	 */
+	ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0);
+	if (ret < 0) {
+		switch (errno) {
+		case EPERM:
+			/* Process exists, just not allowed to signal it. */
+			break;
+		default:
+			/* Report before close() so errno is still the one we want */
+			warn("Failed to signal process");
+			close(procfd);
+			procfd = -1;
+			break;
+		}
+	}
+
+	return procfd;
+}
+
+/*
+ * Demonstrate race-free access to /proc/<pid> metadata with a pidfd:
+ *   1. clone a child with CLONE_PIDFD,
+ *   2. open /proc/<pid> and validate it against the pidfd,
+ *   3. copy up to sizeof(buf) bytes of /proc/<pid>/status to stdout.
+ *
+ * The child is always reaped before exiting. EXIT_SUCCESS is reported only
+ * when the status file could be opened, read and written out; every failure
+ * path prints a diagnostic and exits with EXIT_FAILURE.
+ */
+int main(int argc, char *argv[])
+{
+	int pidfd = -1, ret = EXIT_FAILURE;
+	char buf[4096] = { 0 };
+	pid_t pid;
+	int procfd, statusfd;
+	ssize_t bytes;
+
+	pid = pidfd_clone(CLONE_PIDFD, &pidfd);
+	if (pid < 0)
+		err(ret, "CLONE_PIDFD");
+	if (pidfd == -1) {
+		/* clone() worked but the kernel never filled in the pidfd */
+		warnx("CLONE_PIDFD is not supported by the kernel");
+		goto out;
+	}
+
+	procfd = pidfd_metadata_fd(pid, pidfd);
+	close(pidfd);
+	if (procfd < 0)
+		goto out;
+
+	statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC);
+	if (statusfd < 0)
+		warn("Failed to open /proc/%d/status", pid);
+	close(procfd);
+	if (statusfd < 0)
+		goto out;
+
+	bytes = read(statusfd, buf, sizeof(buf));
+	if (bytes < 0) {
+		warn("Failed to read /proc/%d/status", pid);
+	} else if (bytes > 0 && write(STDOUT_FILENO, buf, bytes) < 0) {
+		warn("Failed to write process status");
+		bytes = -1;
+	}
+	close(statusfd);
+	if (bytes >= 0)
+		ret = EXIT_SUCCESS;
+
+out:
+	/* Reap the child so it does not linger as a zombie */
+	(void)wait(NULL);
+
+	exit(ret);
+}
diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst
new file mode 100644
index 000000000000..f4adeed5f5f0
--- /dev/null
+++ b/samples/pktgen/README.rst
@@ -0,0 +1,64 @@
+Sample and benchmark scripts for pktgen (packet generator)
+==========================================================
+This directory contains some pktgen sample and benchmark scripts, that
+can easily be copied and adjusted for your own use-case.
+
+General doc is located in kernel: Documentation/networking/pktgen.rst
+
+Helper include files
+====================
+This directory contains two helper shell files, that can be "included"
+by shell source'ing. Namely "functions.sh" and "parameters.sh".
+
+Common parameters
+-----------------
+The parameters.sh file supports easy and consistent parameter parsing
+across the sample scripts. A usage example is printed on errors::
+
+ Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX
+ -i : ($DEV) output interface/device (required)
+ -s : ($PKT_SIZE) packet size
+ -d : ($DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed
+ -m : ($DST_MAC) destination MAC-addr
+ -p : ($DST_PORT) destination PORT range (e.g. 433-444) is also allowed
+ -k : ($UDP_CSUM) enable UDP tx checksum
+ -t : ($THREADS) threads to start
+ -f : ($F_THREAD) index of first thread (zero indexed CPU number)
+ -c : ($CLONE_SKB) SKB clones send before alloc new SKB
+ -n : ($COUNT) num messages to send per thread, 0 means indefinitely
+ -b : ($BURST) HW level bursting of SKBs
+ -v : ($VERBOSE) verbose
+ -x : ($DEBUG) debug
+ -6 : ($IP6) IPv6
+ -w : ($DELAY) Tx Delay value (ns)
+ -a : ($APPEND) Script will not reset generator's state, but will append its config
+
+The global variable being set is also listed. E.g. the required
+interface/device parameter "-i" sets variable $DEV.
+
+"-a" parameter may be used to create different flows simultaneously.
+In this mode script will keep the existing config, will append its settings.
+In this mode you'll have to manually run traffic with "pg_ctrl start".
+
+For example you may use::
+
+ source ./samples/pktgen/functions.sh
+ pg_ctrl reset
+ # add first device
+ ./pktgen_sample06_numa_awared_queue_irq_affinity.sh -a -i ens1f0 -m 34:80:0d:a3:fc:c9 -t 8
+ # add second device
+ ./pktgen_sample06_numa_awared_queue_irq_affinity.sh -a -i ens1f1 -m 34:80:0d:a3:fc:c9 -t 8
+ # run joint traffic on two devs
+ pg_ctrl start
+
+Common functions
+----------------
+The functions.sh file provides three different shell functions for
+configuring the different components of pktgen: pg_ctrl(), pg_thread()
+and pg_set().
+
+These functions correspond to pktgen's different components.
+ * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl)
+ * pg_thread() control the kernel threads and binding to devices
+ * pg_set() control setup of individual devices
+
+See sample scripts for usage examples.
diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh
new file mode 100644
index 000000000000..c08cefb8eb1f
--- /dev/null
+++ b/samples/pktgen/functions.sh
@@ -0,0 +1,340 @@
+#
+# Common functions used by pktgen scripts
+# - Depending on bash 3 (or higher) syntax
+#
+# Author: Jesper Dangaard Brouer
+# License: GPL
+
+set -o errexit
+
+## -- General shell logging cmds --
+# Print an error message on stderr and terminate the current shell.
+#   $1   - exit code to terminate with
+#   $2.. - message words, printed after an "ERROR: " prefix
+function err() {
+	local rc=$1
+
+	shift
+	>&2 echo "ERROR: $@"
+	exit $rc
+}
+
+# Print a warning on stderr, prefixed with "WARN : ".
+#   $@ - message words
+function warn() {
+	>&2 echo "WARN : $@"
+}
+
+# Print an informational message on stderr, prefixed with "INFO : ".
+# Silent unless verbose mode is active, i.e. unless $VERBOSE is non-empty.
+#   $@ - message words
+function info() {
+	[[ -n "$VERBOSE" ]] || return 0
+	>&2 echo "INFO : $@"
+}
+
+## -- Pktgen proc config commands -- ##
+export PROC_DIR=/proc/net/pktgen
+#
+# Three different shell functions for configuring the different
+# components of pktgen:
+# pg_ctrl(), pg_thread() and pg_set().
+#
+# These functions correspond to pktgen's different components.
+# * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl)
+# * pg_thread() control the kernel threads and binding to devices
+# * pg_set() control setup of individual devices
+# Send a command to the pktgen control file "pgctrl" below $PROC_DIR.
+#   $@ - command words, forwarded to proc_cmd()
+function pg_ctrl() {
+	proc_cmd pgctrl "$@"
+}
+
+# Send a command to the proc file of one pktgen kernel thread.
+#   $1   - thread number; selects the file "kpktgend_<number>"
+#   $2.. - command words, forwarded to proc_cmd()
+function pg_thread() {
+	local kthread_file="kpktgend_$1"
+
+	shift
+	proc_cmd ${kthread_file} "$@"
+}
+
+# Send a configuration command to the proc file of one pktgen device.
+#   $1   - device name; used as the file name below $PROC_DIR
+#   $2.. - command words, forwarded to proc_cmd()
+function pg_set() {
+	local dev_file="$1"
+
+	shift
+	proc_cmd ${dev_file} "$@"
+}
+
+# Write one command to a pktgen control file below $PROC_DIR.
+# More generic replacement for pgset(), that does not depend on a global
+# variable for the proc file.
+#   $1   - file name relative to $PROC_DIR
+#   $2.. - command words, written to the file as one line
+# Terminates through err() when the file does not exist (3), is not
+# writable (4) or the write itself fails (5).
+function proc_cmd() {
+	local proc_file=$1
+	local result
+	local status=0
+
+	# Drop the file name; from here on "$@" holds only the command words
+	shift
+	local proc_ctrl=${PROC_DIR}/$proc_file
+
+	[[ -e "$proc_ctrl" ]] || \
+		err 3 "proc file:$proc_ctrl does not exists (dev added to thread?)"
+	[[ -w "$proc_ctrl" ]] || \
+		err 4 "proc file:$proc_ctrl not writable, not root?!"
+
+	[[ "$DEBUG" != "yes" ]] || echo "cmd: $@ > $proc_ctrl"
+
+	# Quoting of "$@" is important for space expansion.
+	# A failing write is remembered rather than aborting right away, so
+	# the "Result:" line below can still be shown.
+	echo "$@" > "$proc_ctrl" || status=$?
+
+	# For every file except pgctrl, read the file back and look for a
+	# "Result: OK:" line; when there is none, show the "Result:" line(s)
+	# on stderr.
+	if [[ "$proc_file" != "pgctrl" ]]; then
+		result=$(grep "Result: OK:" $proc_ctrl) || true
+		[[ -n "$result" ]] || grep "Result:" $proc_ctrl >&2
+	fi
+
+	[[ $status -eq 0 ]] || \
+		err 5 "Write error($status) occurred cmd: \"$@ > $proc_ctrl\""
+}
+
+# Old obsolete "pgset" function, with slightly improved err handling.
+# Superseded by proc_cmd(); it still selects its target file through the
+# global variable $PGDEV.
+#   $1 - command to write to $PGDEV
+# Terminates through err 5 when the write fails.
+function pgset() {
+	local result
+	local status=0
+
+	if [[ "$DEBUG" == "yes" ]]; then
+		echo "cmd: $1 > $PGDEV"
+	fi
+	# This file enables errexit: failures are collected explicitly, as in
+	# proc_cmd(), so the shell does not abort before the error is reported
+	echo $1 > "$PGDEV" || status=$?
+
+	# "grep -F" replaces the obsolescent fgrep; a missing match is not an error
+	result=$(grep -F "Result: OK:" "$PGDEV") || true
+	if [[ "$result" == "" ]]; then
+		grep -F "Result:" "$PGDEV" || true
+	fi
+	if (( status != 0 )); then
+		err 5 "Write error($status) occurred cmd: \"$1 > $PGDEV\""
+	fi
+}
+
+function trap_exit()
+{
+ # Cleanup pktgen setup on exit if thats not "append mode"
+ if [[ -z "$APPEND" ]] && [[ $EUID -eq 0 ]]; then
+ trap 'pg_ctrl "reset"' EXIT
+ fi
+}
+
+## -- General shell tricks --
+
+# Trick so the program can be run as a normal user: when the effective
+# user is not root, the calling script is re-executed through "sudo -E".
+# Call as: root_check_run_with_sudo "$@"
+#   $@ - the script's own arguments, handed on to the sudo run
+# When re-executing, this does not return: the shell exits with sudo's
+# exit status, or through err 4 if $0 is not directly executable.
+function root_check_run_with_sudo() {
+	if [ "$EUID" -ne 0 ]; then
+		# $0 is quoted so that a script path containing whitespace (or
+		# an empty $0) is tested as exactly one word
+		if [ -x "$0" ]; then # Directly executable use sudo
+			info "Not root, running with sudo"
+			sudo -E "$0" "$@"
+			exit $?
+		fi
+		err 4 "cannot perform sudo run of $0"
+	fi
+}
+
+# Exact input device's NUMA node info
+function get_iface_node()
+{
+ local node=$(</sys/class/net/$1/device/numa_node)
+ if [[ $node == -1 ]]; then
+ echo 0
+ else
+ echo $node
+ fi
+}
+
+# Given an Dev/iface, get its queues' irq numbers
+function get_iface_irqs()
+{
+ local IFACE=$1
+ local queues="${IFACE}-.*TxRx"
+
+ irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:)
+ [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:)
+ [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\
+ do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\
+ done)
+ [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE"
+
+ echo $irqs
+}
+
+# Given a NUMA node, print the ids of the CPUs belonging to it.
+#   $1 - NUMA node number
+# Reads /sys/devices/system/node/node$1/cpulist, splits it at the commas
+# and expands every entry with seq ("a-b" becomes "seq a b").
+# NOTE: the loop variable "cpu_range" is not local and stays set afterwards.
+function get_node_cpus()
+{
+	local node=$1
+	local cpus
+	local ranges=$(cut -f1- -d, --output-delimiter=" " \
+		/sys/devices/system/node/node$node/cpulist)
+
+	for cpu_range in $ranges; do
+		cpus+=" $(seq -s " " ${cpu_range//-/ })"
+	done
+
+	echo $cpus
+}
+
+# Succeed when $1 lies within the closed range [$2, $3] ($2 <= $1 <= $3).
+function in_between()
+{
+	[[ $1 -ge $2 && $1 -le $3 ]]
+}
+
+# Extend a shrunken IPv6 address: print $1 with its "::" abbreviation
+# written out as explicit ":0" groups.
+# fe80::42:bcff:fe84:e10a => fe80:0:0:0:42:bcff:fe84:e10a
+#   $1 - IPv6 address
+# Terminates through err 5 when the number of ":" separators is outside
+# 2..7, or when a run of two or more ":" occurs more than once or is longer
+# than "::".
+function extend_addr6()
+{
+ local addr=$1
+ local sep=: sep2=::
+ # number of ':' characters in the address
+ local sep_cnt=$(tr -cd $sep <<< $1 | wc -c)
+ local shrink
+
+ # separator count should be (2 <= $sep_cnt <= 7)
+ if ! (in_between $sep_cnt 2 7); then
+ err 5 "Invalid IP6 address: $1"
+ fi
+
+ # if shrink '::' occurs multiple times, it's malformed.
+ # shrink collects every run of two or more ':' found in the address
+ shrink=( $(grep -E -o "$sep{2,}" <<< $addr) )
+ if [[ ${#shrink[@]} -ne 0 ]]; then
+ if [[ ${#shrink[@]} -gt 1 || ( ${shrink[0]} != $sep2 ) ]]; then
+ err 5 "Invalid IP6 address: $1"
+ fi
+ fi
+
+ # add 0 at begin & end, and extend addr by adding :0
+ # (the first "::" is replaced by (8 - sep_cnt) copies of ":0" plus a
+ # closing ":")
+ [[ ${addr:0:1} == $sep ]] && addr=0${addr}
+ [[ ${addr: -1} == $sep ]] && addr=${addr}0
+ echo "${addr/$sep2/$(printf ':0%.s' $(seq $[8-sep_cnt])):}"
+}
+
+# Given a single IP(v4/v6) address, optionally with a "/prefix" suffix,
+# check whether it is valid.
+#   $1 - address, e.g. 198.18.0.42 or 198.18.0.0/15
+# Returns 0 when valid; terminates through err 5 otherwise.
+# IPv6 handling is selected by the name of the calling function: when it
+# ends in "6" (see validate_addr6 below) IP6 is set locally, otherwise
+# whatever value $IP6 already has is used.
+function validate_addr()
+{
+ # check function is called with (funcname)6
+ [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6
+ # address width in bits, number of groups, and largest value of one
+ # group (255 for IPv4, 65535 for IPv6)
+ local bitlen=$[ IP6 ? 128 : 32 ]
+ local len=$[ IP6 ? 8 : 4 ]
+ local max=$[ 2**(len*2)-1 ]
+ local net prefix
+ local addr sep
+
+ # split "address/prefix"
+ IFS='/' read net prefix <<< $1
+ [[ $IP6 ]] && net=$(extend_addr6 $net)
+
+ # if prefix exists, check (0 <= $prefix <= $bitlen)
+ if [[ -n $prefix ]]; then
+ if ! (in_between $prefix 0 $bitlen); then
+ err 5 "Invalid prefix: /$prefix"
+ fi
+ fi
+
+ # set separator for each IP(v4/v6)
+ [[ $IP6 ]] && sep=: || sep=.
+ IFS=$sep read -a addr <<< $net
+
+ # array length
+ if [[ ${#addr[@]} != $len ]]; then
+ err 5 "Invalid IP$IP6 address: $1"
+ fi
+
+ # check each digit (0 <= $digit <= $max)
+ for digit in "${addr[@]}"; do
+ # IPv6 groups are hexadecimal: convert to decimal before comparing
+ [[ $IP6 ]] && digit=$[ 16#$digit ]
+ if ! (in_between $digit 0 $max); then
+ err 5 "Invalid IP$IP6 address: $1"
+ fi
+ done
+
+ return 0
+}
+
+# IPv6 entry point: the trailing "6" in this function's name is what makes
+# validate_addr switch to IPv6 rules, so the wrapper must keep that name.
+function validate_addr6() { validate_addr $@ ; }
+
+# Given a single IP(v4/v6) or CIDR, return minimum and maximum IP addr.
+#   $1 - address, optionally followed by "/prefix"
+# Prints "<min_ip> <max_ip>" on stdout; without a prefix both are the
+# address itself (for IPv6: as expanded by extend_addr6).
+# IPv6 handling is selected by the name of the calling function: when it
+# ends in "6" (see parse_addr6 below) IP6 is set locally, otherwise
+# whatever value $IP6 already has is used.
+# NOTE: the helper variables "i", "digit" and "idx" used in the loop are
+# not declared local.
+function parse_addr()
+{
+ # check function is called with (funcname)6
+ [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6
+ local net prefix
+ local min_ip max_ip
+
+ # split "address/prefix"
+ IFS='/' read net prefix <<< $1
+ [[ $IP6 ]] && net=$(extend_addr6 $net)
+
+ if [[ -z $prefix ]]; then
+ min_ip=$net
+ max_ip=$net
+ else
+ # defining array for converting Decimal 2 Binary
+ # 00000000 00000001 00000010 00000011 00000100 ...
+ # (brace expansion yields every 8-bit string; the pattern is doubled
+ # for IPv6 so that the strings are 16 bits long)
+ local d2b='{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}'
+ [[ $IP6 ]] && d2b+=$d2b
+ eval local D2B=($d2b)
+
+ # remain: number of bits after the prefix; octet: bits per address group
+ local bitlen=$[ IP6 ? 128 : 32 ]
+ local remain=$[ bitlen-prefix ]
+ local octet=$[ IP6 ? 16 : 8 ]
+ local min_mask max_mask
+ local min max
+ local ip_bit
+ local ip sep
+
+ # set separator for each IP(v4/v6)
+ [[ $IP6 ]] && sep=: || sep=.
+ IFS=$sep read -ra ip <<< $net
+
+ # masks as bit strings: min_mask has $prefix ones followed by $remain
+ # zeros, max_mask is its complement
+ min_mask="$(printf '1%.s' $(seq $prefix))$(printf '0%.s' $(seq $remain))"
+ max_mask="$(printf '0%.s' $(seq $prefix))$(printf '1%.s' $(seq $remain))"
+
+ # calculate min/max ip with &,| operator
+ for i in "${!ip[@]}"; do
+ # group value in decimal, then its bit string
+ digit=$[ IP6 ? 16#${ip[$i]} : ${ip[$i]} ]
+ ip_bit=${D2B[$digit]}
+
+ # offset of this group's slice inside the mask strings
+ idx=$[ octet*i ]
+ min[$i]=$[ 2#$ip_bit & 2#${min_mask:$idx:$octet} ]
+ max[$i]=$[ 2#$ip_bit | 2#${max_mask:$idx:$octet} ]
+ # IPv6 groups are printed in (upper-case) hexadecimal
+ [[ $IP6 ]] && { min[$i]=$(printf '%X' ${min[$i]});
+ max[$i]=$(printf '%X' ${max[$i]}); }
+ done
+
+ # join the groups again with the address separator
+ min_ip=$(IFS=$sep; echo "${min[*]}")
+ max_ip=$(IFS=$sep; echo "${max[*]}")
+ fi
+
+ echo $min_ip $max_ip
+}
+
+# IPv6 entry point: the trailing "6" in this function's name is what makes
+# parse_addr switch to IPv6 rules, so the wrapper must keep that name.
+function parse_addr6() { parse_addr $@ ; }
+
+# Given a single port or a "min-max" range, print the minimum and maximum
+# port number as "<min> <max>".
+#   $1 - port specification, e.g. "9" or "433-444"
+# For a single port the maximum equals the minimum.
+function parse_ports()
+{
+	local -a ports
+	local lo hi
+
+	# split at the dash
+	IFS="-" read -ra ports <<< $1
+
+	lo=${ports[0]}
+	hi=${ports[1]:-$lo}
+
+	echo $lo $hi
+}
+
+# Given a minimum and maximum port, verify the port numbers.
+#   $1 - minimum port
+#   $2 - maximum port
+# Returns 0 when both are within 1..65535 and $1 <= $2; terminates through
+# err 5 otherwise.
+function validate_ports()
+{
+	local min_port=$1
+	local max_port=$2
+
+	# 1 <= port <= 65535
+	# (in_between is run in subshells, exactly as before)
+	if (in_between $min_port 1 65535) &&
+	   (in_between $max_port 1 65535) &&
+	   [[ $min_port -le $max_port ]]; then
+		return 0
+	fi
+
+	err 5 "Invalid port(s): $min_port-$max_port"
+}
diff --git a/samples/pktgen/parameters.sh b/samples/pktgen/parameters.sh
new file mode 100644
index 000000000000..81906f199454
--- /dev/null
+++ b/samples/pktgen/parameters.sh
@@ -0,0 +1,139 @@
+#
+# SPDX-License-Identifier: GPL-2.0
+# Common parameter parsing for pktgen scripts
+#
+
+# Print the command line help on stdout. Each option is listed together
+# with the global variable the option parser below exports for it.
+function usage() {
+	echo ""
+	echo "Usage: $0 [-vx] -i ethX"
+	echo " -i : (\$DEV) output interface/device (required)"
+	echo " -s : (\$PKT_SIZE) packet size"
+	echo " -d : (\$DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed"
+	echo " -m : (\$DST_MAC) destination MAC-addr"
+	echo " -p : (\$DST_PORT) destination PORT range (e.g. 433-444) is also allowed"
+	echo " -k : (\$UDP_CSUM) enable UDP tx checksum"
+	echo " -t : (\$THREADS) threads to start"
+	echo " -f : (\$F_THREAD) index of first thread (zero indexed CPU number)"
+	# -c exports CLONE_SKB (not SKB_CLONE), so that is the name shown
+	echo " -c : (\$CLONE_SKB) SKB clones send before alloc new SKB"
+	echo " -n : (\$COUNT) num messages to send per thread, 0 means indefinitely"
+	echo " -b : (\$BURST) HW level bursting of SKBs"
+	echo " -v : (\$VERBOSE) verbose"
+	echo " -x : (\$DEBUG) debug"
+	echo " -6 : (\$IP6) IPv6"
+	echo " -w : (\$DELAY) Tx Delay value (ns)"
+	echo " -a : (\$APPEND) Script will not reset generator's state, but will append its config"
+	echo ""
+}
+
+## --- Parse command line arguments / parameters ---
+## echo "Commandline options:"
+# Every recognised option exports one global variable (the same names that
+# usage() lists) and reports the value through info(). The sample scripts
+# source functions.sh before this file; its info() prints only when
+# $VERBOSE is non-empty, so -v has to precede the options whose messages
+# should be shown.
+while getopts "s:i:d:m:p:f:t:c:n:b:w:vxh6ak" option; do
+ case $option in
+ i) # interface
+ export DEV=$OPTARG
+ info "Output device set to: DEV=$DEV"
+ ;;
+ s)
+ export PKT_SIZE=$OPTARG
+ info "Packet size set to: PKT_SIZE=$PKT_SIZE bytes"
+ ;;
+ d) # destination IP
+ export DEST_IP=$OPTARG
+ info "Destination IP set to: DEST_IP=$DEST_IP"
+ ;;
+ m) # MAC
+ export DST_MAC=$OPTARG
+ info "Destination MAC set to: DST_MAC=$DST_MAC"
+ ;;
+ p) # PORT
+ export DST_PORT=$OPTARG
+ info "Destination PORT set to: DST_PORT=$DST_PORT"
+ ;;
+ f)
+ export F_THREAD=$OPTARG
+ info "Index of first thread (zero indexed CPU number): $F_THREAD"
+ ;;
+ t)
+ export THREADS=$OPTARG
+ info "Number of threads to start: $THREADS"
+ ;;
+ c)
+ export CLONE_SKB=$OPTARG
+ info "CLONE_SKB=$CLONE_SKB"
+ ;;
+ n)
+ export COUNT=$OPTARG
+ info "COUNT=$COUNT"
+ ;;
+ b)
+ export BURST=$OPTARG
+ info "SKB bursting: BURST=$BURST"
+ ;;
+ w)
+ export DELAY=$OPTARG
+ info "DELAY=$DELAY"
+ ;;
+ v)
+ export VERBOSE=yes
+ info "Verbose mode: VERBOSE=$VERBOSE"
+ ;;
+ x)
+ export DEBUG=yes
+ info "Debug mode: DEBUG=$DEBUG"
+ ;;
+ 6)
+ export IP6=6
+ info "IP6: IP6=$IP6"
+ ;;
+ a)
+ export APPEND=yes
+ info "Append mode: APPEND=$APPEND"
+ ;;
+ k)
+ export UDP_CSUM=yes
+ info "UDP tx checksum: UDP_CSUM=$UDP_CSUM"
+ ;;
+ # -h and anything not matched above: print the help, then leave
+ # through err with exit code 2
+ h|?|*)
+ usage;
+ err 2 "[ERROR] Unknown parameters!!!"
+ esac
+done
+# drop the options that were parsed, leaving only positional arguments
+shift $(( $OPTIND - 1 ))
+
+# --- Defaults and sanity checks for parameters not given on the command line ---
+if [ -z "$PKT_SIZE" ]; then
+	# NIC adds 4 bytes CRC
+	export PKT_SIZE=60
+	info "Default packet size set to: $PKT_SIZE bytes"
+fi
+
+if [ -z "$F_THREAD" ]; then
+	# First thread (F_THREAD) reference the zero indexed CPU number
+	export F_THREAD=0
+fi
+
+if [ -z "$THREADS" ]; then
+	export THREADS=1
+fi
+
+# default DELAY
+[ -z "$DELAY" ] && export DELAY=0 # Zero means max speed
+
+# Index of the last thread; the scripts loop from F_THREAD to L_THREAD
+export L_THREAD=$(( THREADS + F_THREAD - 1 ))
+
+# The output device is the only mandatory parameter
+if [ -z "$DEV" ]; then
+	usage
+	err 2 "Please specify output device"
+fi
+
+# Missing destination settings only produce a warning here
+if [ -z "$DST_MAC" ]; then
+	warn "Missing destination MAC address"
+fi
+
+if [ -z "$DEST_IP" ]; then
+	warn "Missing destination IP address"
+fi
+
+# Load the pktgen module when its proc directory is not there yet
+if [ ! -d /proc/net/pktgen ]; then
+	info "Loading kernel module: pktgen"
+	modprobe pktgen
+fi
diff --git a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
new file mode 100755
index 000000000000..b4328db4a164
--- /dev/null
+++ b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Benchmark script:
+# - developed for benchmarking ingress qdisc path
+#
+# Script for injecting packets into RX path of the stack with pktgen
+# "xmit_mode netif_receive". With an invalid dst_mac this will only
+# measure the ingress code path as packets gets dropped in ip_rcv().
+#
+# This script doesn't really need any hardware. It benchmarks software
+# RX path just after NIC driver level. With bursting it also
+# "removes" the SKB alloc/free overhead.
+#
+# Setup scenarios for measuring ingress qdisc (with invalid dst_mac):
+# ------------------------------------------------------------------
+# (1) no ingress (uses static_key_false(&ingress_needed))
+#
+# (2) ingress on other dev (change ingress_needed and calls
+# handle_ing() but exit early)
+#
+# config: tc qdisc add dev $SOMEDEV handle ffff: ingress
+#
+# (3) ingress on this dev, handle_ing() -> tc_classify()
+#
+# config: tc qdisc add dev $DEV handle ffff: ingress
+#
+# (4) ingress on this dev + drop at u32 classifier/action.
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+
+# Trap EXIT first
+trap_exit
+
+# Using invalid DST_MAC will cause the packets to get dropped in
+# ip_rcv() which is part of the test
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$BURST" ] && BURST=1024
+[ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely
+# Validate the destination and split it into the min/max pair pktgen takes
+if [ -n "$DEST_IP" ]; then
+ validate_addr${IP6} $DEST_IP
+ read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
+fi
+if [ -n "$DST_PORT" ]; then
+ read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
+ validate_ports $UDP_DST_MIN $UDP_DST_MAX
+fi
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ # The device name is extended with @name, using thread number to
+ # make them unique, but any name will do.
+ dev=${DEV}@${thread}
+
+ # Remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config of dev
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst${IP6}_min $DST_MIN"
+ pg_set $dev "dst${IP6}_max $DST_MAX"
+
+ if [ -n "$DST_PORT" ]; then
+ # Single destination port or random port range
+ pg_set $dev "flag UDPDST_RND"
+ pg_set $dev "udp_dst_min $UDP_DST_MIN"
+ pg_set $dev "udp_dst_max $UDP_DST_MAX"
+ fi
+
+ # Inject packet into RX path of stack
+ pg_set $dev "xmit_mode netif_receive"
+
+ # Burst allow us to avoid measuring SKB alloc/free overhead
+ pg_set $dev "burst $BURST"
+done
+
+# Print the "Result:" section of every configured device. Called at the
+# end of the script, once pg_ctrl "start" has returned.
+function print_result() {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C): the handler is just "true", so SIGINT
+# does not terminate the script (presumably so that the results can still
+# be printed after an interrupted run; verify)
+trap true SIGINT
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+print_result
diff --git a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh
new file mode 100755
index 000000000000..f2beb512c5cd
--- /dev/null
+++ b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Benchmark script:
+# - developed for benchmarking egress qdisc path, derived (more
+# like cut'n'pasted) from ingress benchmark script.
+#
+# Script for injecting packets into egress qdisc path of the stack
+# with pktgen "xmit_mode queue_xmit".
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+
+# Trap EXIT first
+trap_exit
+
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+
+# Bursting is not supported in queue_xmit mode: any explicit -b value
+# (even 1) is rejected
+if [[ -n "$BURST" ]]; then
+ err 1 "Bursting not supported for this mode"
+fi
+[ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely
+# Validate the destination and split it into the min/max pair pktgen takes
+if [ -n "$DEST_IP" ]; then
+ validate_addr${IP6} $DEST_IP
+ read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
+fi
+if [ -n "$DST_PORT" ]; then
+ read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
+ validate_ports $UDP_DST_MIN $UDP_DST_MAX
+fi
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ # The device name is extended with @name, using thread number to
+ # make them unique, but any name will do.
+ dev=${DEV}@${thread}
+
+ # Remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config of dev
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst${IP6}_min $DST_MIN"
+ pg_set $dev "dst${IP6}_max $DST_MAX"
+
+ if [ -n "$DST_PORT" ]; then
+ # Single destination port or random port range
+ pg_set $dev "flag UDPDST_RND"
+ pg_set $dev "udp_dst_min $UDP_DST_MIN"
+ pg_set $dev "udp_dst_max $UDP_DST_MAX"
+ fi
+
+ # Inject packet into TX qdisc egress path of stack
+ pg_set $dev "xmit_mode queue_xmit"
+done
+
+# Print the "Result:" section of every configured device. Called at the
+# end of the script, once pg_ctrl "start" has returned.
+function print_result {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C): the handler is just "true", so SIGINT
+# does not terminate the script (presumably so that the results can still
+# be printed after an interrupted run; verify)
+trap true SIGINT
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+print_result
diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh
new file mode 100755
index 000000000000..66cb707479e6
--- /dev/null
+++ b/samples/pktgen/pktgen_sample01_simple.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Simple example:
+# * pktgen sending with single thread and single interface
+# * flow variation via random UDP source port
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+# - go look in parameters.sh to see which settings are available
+# - required param is the interface "-i" stored in $DEV
+source ${basedir}/parameters.sh
+
+# Trap EXIT first
+trap_exit
+
+#
+# Set some default params, if they didn't get set
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+# Example enforce param "-m" for dst_mac
+[ -z "$DST_MAC" ] && usage && err 2 "Must specify -m dst_mac"
+[ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely
+# Validate the destination and split it into the min/max pair pktgen takes
+if [ -n "$DEST_IP" ]; then
+ validate_addr${IP6} $DEST_IP
+ read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
+fi
+if [ -n "$DST_PORT" ]; then
+ read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
+ validate_ports $UDP_DST_MIN $UDP_DST_MAX
+fi
+
+# Flow variation random source port between min and max
+UDP_SRC_MIN=9
+UDP_SRC_MAX=109
+
+# General cleanup everything since last run
+# (especially important if other threads were configured by other scripts)
+# - skipped in append mode
+[ -z "$APPEND" ] && pg_ctrl "reset"
+
+# Remove all other devices (unless in append mode) and add_device $DEV
+# to thread 0
+thread=0
+[ -z "$APPEND" ] && pg_thread $thread "rem_device_all"
+pg_thread $thread "add_device" $DEV
+
+# How many packets to send (zero means indefinitely)
+pg_set $DEV "count $COUNT"
+
+# Reduce alloc cost by sending same SKB many times
+# - this obviously affects the randomness within the packet
+pg_set $DEV "clone_skb $CLONE_SKB"
+
+# Set packet size
+pg_set $DEV "pkt_size $PKT_SIZE"
+
+# Delay between packets (zero means max speed)
+pg_set $DEV "delay $DELAY"
+
+# Flag example disabling timestamping
+pg_set $DEV "flag NO_TIMESTAMP"
+
+# Destination
+pg_set $DEV "dst_mac $DST_MAC"
+pg_set $DEV "dst${IP6}_min $DST_MIN"
+pg_set $DEV "dst${IP6}_max $DST_MAX"
+
+if [ -n "$DST_PORT" ]; then
+ # Single destination port or random port range
+ pg_set $DEV "flag UDPDST_RND"
+ pg_set $DEV "udp_dst_min $UDP_DST_MIN"
+ pg_set $DEV "udp_dst_max $UDP_DST_MAX"
+fi
+
+# Enable UDP tx checksum when -k was given
+[ ! -z "$UDP_CSUM" ] && pg_set $DEV "flag UDPCSUM"
+
+# Setup random UDP port src range
+pg_set $DEV "flag UDPSRC_RND"
+pg_set $DEV "udp_src_min $UDP_SRC_MIN"
+pg_set $DEV "udp_src_max $UDP_SRC_MAX"
+
+# Print the whole proc file of the device. Called at the end of a
+# non-append run, once pg_ctrl "start" has returned.
+function print_result() {
+ # Print results
+ echo "Result device: $DEV"
+ cat /proc/net/pktgen/$DEV
+}
+# trap keyboard interrupt (Ctrl-C): the handler is just "true", so SIGINT
+# does not terminate the script (presumably so that the results can still
+# be printed after an interrupted run; verify)
+trap true SIGINT
+
+if [ -z "$APPEND" ]; then
+ # start_run
+ echo "Running... ctrl^C to stop" >&2
+ pg_ctrl "start"
+ echo "Done" >&2
+
+ print_result
+else
+ echo "Append mode: config done. Do more or use 'pg_ctrl start' to run"
+fi \ No newline at end of file
diff --git a/samples/pktgen/pktgen_sample02_multiqueue.sh b/samples/pktgen/pktgen_sample02_multiqueue.sh
new file mode 100755
index 000000000000..93f33d7d0a81
--- /dev/null
+++ b/samples/pktgen/pktgen_sample02_multiqueue.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Multiqueue: Using pktgen threads for sending on multiple CPUs
+# * adding devices to kernel threads
+# * notice the naming scheme for keeping device names unique
+# * naming scheme: dev@thread_number
+# * flow variation via random UDP source port
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+#
+# Required param: -i dev in $DEV
+source ${basedir}/parameters.sh
+
+# Trap EXIT first
+trap_exit
+
+[ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely
+
+# Base Config
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+
+# Flow variation random source port between min and max
+UDP_SRC_MIN=9
+UDP_SRC_MAX=109
+
+# (example of setting default params in your script)
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+# Validate the destination and split it into the min/max pair pktgen takes
+if [ -n "$DEST_IP" ]; then
+ validate_addr${IP6} $DEST_IP
+ read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
+fi
+if [ -n "$DST_PORT" ]; then
+ read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
+ validate_ports $UDP_DST_MIN $UDP_DST_MAX
+fi
+
+# General cleanup everything since last run (skipped in append mode)
+[ -z "$APPEND" ] && pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ # The device name is extended with @name, using thread number to
+ # make them unique, but any name will do.
+ dev=${DEV}@${thread}
+
+ # Remove all other devices (unless in append mode) and add_device
+ # $dev to thread
+ [ -z "$APPEND" ] && pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Notice config queue to map to cpu (mirrors smp_processor_id())
+ # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number
+ pg_set $dev "flag QUEUE_MAP_CPU"
+
+ # Base config of dev
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+
+ # Flag example disabling timestamping
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst${IP6}_min $DST_MIN"
+ pg_set $dev "dst${IP6}_max $DST_MAX"
+
+ if [ -n "$DST_PORT" ]; then
+ # Single destination port or random port range
+ pg_set $dev "flag UDPDST_RND"
+ pg_set $dev "udp_dst_min $UDP_DST_MIN"
+ pg_set $dev "udp_dst_max $UDP_DST_MAX"
+ fi
+
+ # Enable UDP tx checksum when -k was given
+ [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM"
+
+ # Setup random UDP port src range
+ pg_set $dev "flag UDPSRC_RND"
+ pg_set $dev "udp_src_min $UDP_SRC_MIN"
+ pg_set $dev "udp_src_max $UDP_SRC_MAX"
+done
+
+# Print the "Result:" section of every configured device. Called at the
+# end of a non-append run, once pg_ctrl "start" has returned.
+function print_result() {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C): the handler is just "true", so SIGINT
+# does not terminate the script (presumably so that the results can still
+# be printed after an interrupted run; verify)
+trap true SIGINT
+
+if [ -z "$APPEND" ]; then
+ # start_run
+ echo "Running... ctrl^C to stop" >&2
+ pg_ctrl "start"
+ echo "Done" >&2
+
+ print_result
+else
+ echo "Append mode: config done. Do more or use 'pg_ctrl start' to run"
+fi
diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
new file mode 100755
index 000000000000..8f8ed1ac46a0
--- /dev/null
+++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Script for max single flow performance
+# - If correctly tuned[1], single CPU 10G wirespeed small pkts is possible[2]
+#
+# Using pktgen "burst" option (use -b $N)
+# - To boost max performance
+# - Avail since: kernel v3.18
+# * commit 38b2cf2982dc73 ("net: pktgen: packet bursting via skb->xmit_more")
+# - This avoids writing the HW tailptr on every driver xmit
+# - The performance boost is impressive, see commit and blog [2]
+#
+# Notice: On purpose generates a single (UDP) flow towards target,
+# reason behind this is to only overload/activate a single CPU on
+# target host. And no randomness for pktgen also makes it faster.
+#
+# Tuning see:
+# [1] http://netoptimizer.blogspot.dk/2014/06/pktgen-for-network-overload-testing.html
+# [2] http://netoptimizer.blogspot.dk/2014/10/unlocked-10gbps-tx-wirespeed-smallest.html
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+
+# Trap EXIT first
+trap_exit
+
+# Set some default params, if they didn't get set
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$BURST" ] && BURST=32
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting
+[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
+if [ -n "$DEST_IP" ]; then
+ validate_addr${IP6} $DEST_IP
+ read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
+fi
+if [ -n "$DST_PORT" ]; then
+ read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
+ validate_ports $UDP_DST_MIN $UDP_DST_MAX
+fi
+
+# General cleanup everything since last run
+[ -z "$APPEND" ] && pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+
+ # Remove all other devices (unless appending) and add_device $dev to thread
+ [ -z "$APPEND" ] && pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst${IP6}_min $DST_MIN"
+ pg_set $dev "dst${IP6}_max $DST_MAX"
+
+ if [ -n "$DST_PORT" ]; then
+ # Single destination port or random port range
+ pg_set $dev "flag UDPDST_RND"
+ pg_set $dev "udp_dst_min $UDP_DST_MIN"
+ pg_set $dev "udp_dst_max $UDP_DST_MAX"
+ fi
+
+ [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM"
+
+ # Setup burst, for easy testing -b 0 disable bursting
+ # (internally in pktgen default and minimum burst=1)
+ if [[ ${BURST} -ne 0 ]]; then
+ pg_set $dev "burst $BURST"
+ else
+ info "$dev: Not using burst"
+ fi
+done
+
+# Run if user hits control-c
+function print_result() {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C)
+trap true SIGINT
+
+if [ -z "$APPEND" ]; then
+ echo "Running... ctrl^C to stop" >&2
+ pg_ctrl "start"
+
+ print_result
+else
+ echo "Append mode: config done. Do more or use 'pg_ctrl start' to run"
+fi
diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh
new file mode 100755
index 000000000000..65ed486ce4f1
--- /dev/null
+++ b/samples/pktgen/pktgen_sample04_many_flows.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Script example for many flows testing
+#
+# Number of simultaneous flows limited by variable $FLOWS
+# and number of packets per flow controlled by variable $FLOWLEN
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+
+# Trap EXIT first
+trap_exit
+
+# Set some default params, if they didn't get set
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
+if [ -n "$DEST_IP" ]; then
+ validate_addr${IP6} $DEST_IP
+ read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
+fi
+if [ -n "$DST_PORT" ]; then
+ read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
+ validate_ports $UDP_DST_MIN $UDP_DST_MAX
+fi
+
+# NOTICE: Script specific settings
+# =======
+# Limiting the number of concurrent flows ($FLOWS)
+# and also set how many packets each flow contains ($FLOWLEN)
+#
+[ -z "$FLOWS" ] && FLOWS="8000"
+[ -z "$FLOWLEN" ] && FLOWLEN="10"
+
+if [[ -n "$BURST" ]]; then
+ err 1 "Bursting not supported for this mode"
+fi
+
+# 198.18.0.0 / 198.19.255.255
+read -r SRC_MIN SRC_MAX <<< $(parse_addr 198.18.0.0/15)
+
+# General cleanup everything since last run
+[ -z "$APPEND" ] && pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+
+ # Remove all other devices (unless appending) and add_device $dev to thread
+ [ -z "$APPEND" ] && pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Single destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst${IP6}_min $DST_MIN"
+ pg_set $dev "dst${IP6}_max $DST_MAX"
+
+ if [ -n "$DST_PORT" ]; then
+ # Single destination port or random port range
+ pg_set $dev "flag UDPDST_RND"
+ pg_set $dev "udp_dst_min $UDP_DST_MIN"
+ pg_set $dev "udp_dst_max $UDP_DST_MAX"
+ fi
+
+ [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM"
+
+ # Randomize source IP-addresses
+ pg_set $dev "flag IPSRC_RND"
+ pg_set $dev "src_min $SRC_MIN"
+ pg_set $dev "src_max $SRC_MAX"
+
+ # Limit number of flows (max 65535)
+ pg_set $dev "flows $FLOWS"
+ #
+ # How many packets a flow will send, before flow "entry" is
+ # re-generated/setup.
+ pg_set $dev "flowlen $FLOWLEN"
+ #
+ # Flag FLOW_SEQ will cause $FLOWLEN packets from the same flow
+ # being sent back-to-back, before next flow is selected
+ # incrementally. This helps lookup caches, and is more realistic.
+ #
+ pg_set $dev "flag FLOW_SEQ"
+
+done
+
+# Run if user hits control-c
+function print_result() {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C)
+trap true SIGINT
+
+if [ -z "$APPEND" ]; then
+ echo "Running... ctrl^C to stop" >&2
+ pg_ctrl "start"
+
+ print_result
+else
+ echo "Append mode: config done. Do more or use 'pg_ctrl start' to run"
+fi
diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh
new file mode 100755
index 000000000000..bcbc386b2284
--- /dev/null
+++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Script will generate one flow per thread (-t N)
+# - Same destination IP
+# - Fake source IPs for each flow (fixed based on thread number)
+#
+# Useful for scale testing on receiver, to see whether silo'ing flows
+# works and scales. For optimal scalability (on receiver) each
+# separate-flow should not access shared variables/data. This script
+# helps magnify any of these scaling issues by overloading the receiver.
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+
+# Trap EXIT first
+trap_exit
+
+# Set some default params, if they didn't get set
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+[ -z "$BURST" ] && BURST=32
+[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
+if [ -n "$DEST_IP" ]; then
+ validate_addr${IP6} $DEST_IP
+ read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
+fi
+if [ -n "$DST_PORT" ]; then
+ read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
+ validate_ports $UDP_DST_MIN $UDP_DST_MAX
+fi
+
+# General cleanup everything since last run
+[ -z "$APPEND" ] && pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+
+ # Remove all other devices (unless appending) and add_device $dev to thread
+ [ -z "$APPEND" ] && pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # Base config
+ pg_set $dev "flag QUEUE_MAP_CPU"
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Single destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst${IP6}_min $DST_MIN"
+ pg_set $dev "dst${IP6}_max $DST_MAX"
+
+ if [ -n "$DST_PORT" ]; then
+ # Single destination port or random port range
+ pg_set $dev "flag UDPDST_RND"
+ pg_set $dev "udp_dst_min $UDP_DST_MIN"
+ pg_set $dev "udp_dst_max $UDP_DST_MAX"
+ fi
+
+ [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM"
+
+ # Setup source IP-addresses based on thread number
+ pg_set $dev "src_min 198.18.$((thread+1)).1"
+ pg_set $dev "src_max 198.18.$((thread+1)).1"
+
+ # Setup burst, for easy testing -b 0 disable bursting
+ # (internally in pktgen default and minimum burst=1)
+ if [[ ${BURST} -ne 0 ]]; then
+ pg_set $dev "burst $BURST"
+ else
+ info "$dev: Not using burst"
+ fi
+
+done
+
+# Run if user hits control-c
+function print_result() {
+ # Print results
+ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C)
+trap true SIGINT
+
+if [ -z "$APPEND" ]; then
+ echo "Running... ctrl^C to stop" >&2
+ pg_ctrl "start"
+
+ print_result
+else
+ echo "Append mode: config done. Do more or use 'pg_ctrl start' to run"
+fi
diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
new file mode 100755
index 000000000000..0c5409cb5bab
--- /dev/null
+++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+#
+# Multiqueue: Using pktgen threads for sending on multiple CPUs
+# * adding devices to kernel threads which are in the same NUMA node
+# * bind each device queue's IRQ affinity to the threads, 1:1 mapping
+# * notice the naming scheme for keeping device names unique
+# * naming scheme: dev@thread_number
+# * flow variation via random UDP source port
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+#
+# Required param: -i dev in $DEV
+source ${basedir}/parameters.sh
+
+# Trap EXIT first
+trap_exit
+
+# Base Config
+[ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+
+# Flow variation random source port between min and max
+UDP_SRC_MIN=9
+UDP_SRC_MAX=109
+
+node=`get_iface_node $DEV`
+irq_array=(`get_iface_irqs $DEV`)
+cpu_array=(`get_node_cpus $node`)
+
+[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \
+ err 1 "Thread number $THREADS exceeds: min (${#irq_array[*]},${#cpu_array[*]})"
+
+# (example of setting default params in your script)
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+if [ -n "$DEST_IP" ]; then
+ validate_addr${IP6} $DEST_IP
+ read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
+fi
+if [ -n "$DST_PORT" ]; then
+ read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
+ validate_ports $UDP_DST_MIN $UDP_DST_MAX
+fi
+
+# General cleanup everything since last run
+[ -z "$APPEND" ] && pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((i = 0; i < $THREADS; i++)); do
+ # The device name is extended with @name, using thread number to
+ # make them unique, but any name will do.
+ # Set the queue's irq affinity to this $thread (processor)
+ # if '-f' is designated, offset cpu id
+ thread=${cpu_array[$((i+F_THREAD))]}
+ dev=${DEV}@${thread}
+ echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list
+ info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`"
+
+ # Remove all other devices (unless appending) and add_device $dev to thread
+ [ -z "$APPEND" ] && pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # select queue and bind the queue and $dev in 1:1 relationship
+ queue_num=$i
+ info "queue number is $queue_num"
+ pg_set $dev "queue_map_min $queue_num"
+ pg_set $dev "queue_map_max $queue_num"
+
+ # Notice config queue to map to cpu (mirrors smp_processor_id())
+ # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number
+ pg_set $dev "flag QUEUE_MAP_CPU"
+
+ # Base config of dev
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+
+ # Flag example disabling timestamping
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst${IP6}_min $DST_MIN"
+ pg_set $dev "dst${IP6}_max $DST_MAX"
+
+ if [ -n "$DST_PORT" ]; then
+ # Single destination port or random port range
+ pg_set $dev "flag UDPDST_RND"
+ pg_set $dev "udp_dst_min $UDP_DST_MIN"
+ pg_set $dev "udp_dst_max $UDP_DST_MAX"
+ fi
+
+ [ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM"
+
+ # Setup random UDP port src range
+ pg_set $dev "flag UDPSRC_RND"
+ pg_set $dev "udp_src_min $UDP_SRC_MIN"
+ pg_set $dev "udp_src_max $UDP_SRC_MAX"
+done
+
+# Run if user hits control-c
+function print_result() {
+ # Print results
+ for ((i = 0; i < $THREADS; i++)); do
+ thread=${cpu_array[$((i+F_THREAD))]}
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+ done
+}
+# trap keyboard interrupt (Ctrl-C)
+trap true SIGINT
+
+# start_run
+if [ -z "$APPEND" ]; then
+ echo "Running... ctrl^C to stop" >&2
+ pg_ctrl "start"
+ echo "Done" >&2
+
+ print_result
+else
+ echo "Append mode: config done. Do more or use 'pg_ctrl start' to run"
+fi
diff --git a/samples/qmi/Makefile b/samples/qmi/Makefile
new file mode 100644
index 000000000000..641943d40c4a
--- /dev/null
+++ b/samples/qmi/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi_sample_client.o
diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c
new file mode 100644
index 000000000000..d1814582319b
--- /dev/null
+++ b/samples/qmi/qmi_sample_client.c
@@ -0,0 +1,620 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Sample in-kernel QMI client driver
+ *
+ * Copyright (c) 2013-2014, The Linux Foundation. All rights reserved.
+ * Copyright (C) 2017 Linaro Ltd.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/qrtr.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/idr.h>
+#include <linux/string.h>
+#include <net/sock.h>
+#include <linux/soc/qcom/qmi.h>
+
+#define PING_REQ1_TLV_TYPE 0x1
+#define PING_RESP1_TLV_TYPE 0x2
+#define PING_OPT1_TLV_TYPE 0x10
+#define PING_OPT2_TLV_TYPE 0x11
+
+#define DATA_REQ1_TLV_TYPE 0x1
+#define DATA_RESP1_TLV_TYPE 0x2
+#define DATA_OPT1_TLV_TYPE 0x10
+#define DATA_OPT2_TLV_TYPE 0x11
+
+#define TEST_MED_DATA_SIZE_V01 8192
+#define TEST_MAX_NAME_SIZE_V01 255
+
+#define TEST_PING_REQ_MSG_ID_V01 0x20
+#define TEST_DATA_REQ_MSG_ID_V01 0x21
+
+#define TEST_PING_REQ_MAX_MSG_LEN_V01 266
+#define TEST_DATA_REQ_MAX_MSG_LEN_V01 8456
+
+struct test_name_type_v01 {
+ u32 name_len;
+ char name[TEST_MAX_NAME_SIZE_V01];
+};
+
+static const struct qmi_elem_info test_name_type_v01_ei[] = {
+ {
+ .data_type = QMI_DATA_LEN,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = QMI_COMMON_TLV_TYPE,
+ .offset = offsetof(struct test_name_type_v01,
+ name_len),
+ },
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = TEST_MAX_NAME_SIZE_V01,
+ .elem_size = sizeof(char),
+ .array_type = VAR_LEN_ARRAY,
+ .tlv_type = QMI_COMMON_TLV_TYPE,
+ .offset = offsetof(struct test_name_type_v01,
+ name),
+ },
+ {}
+};
+
+struct test_ping_req_msg_v01 {
+ char ping[4];
+
+ u8 client_name_valid;
+ struct test_name_type_v01 client_name;
+};
+
+static const struct qmi_elem_info test_ping_req_msg_v01_ei[] = {
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = 4,
+ .elem_size = sizeof(char),
+ .array_type = STATIC_ARRAY,
+ .tlv_type = PING_REQ1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_req_msg_v01,
+ ping),
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_req_msg_v01,
+ client_name_valid),
+ },
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct test_name_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_req_msg_v01,
+ client_name),
+ .ei_array = test_name_type_v01_ei,
+ },
+ {}
+};
+
+struct test_ping_resp_msg_v01 {
+ struct qmi_response_type_v01 resp;
+
+ u8 pong_valid;
+ char pong[4];
+
+ u8 service_name_valid;
+ struct test_name_type_v01 service_name;
+};
+
+static const struct qmi_elem_info test_ping_resp_msg_v01_ei[] = {
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct qmi_response_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_RESP1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ resp),
+ .ei_array = qmi_response_type_v01_ei,
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ pong_valid),
+ },
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = 4,
+ .elem_size = sizeof(char),
+ .array_type = STATIC_ARRAY,
+ .tlv_type = PING_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ pong),
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT2_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ service_name_valid),
+ },
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct test_name_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = PING_OPT2_TLV_TYPE,
+ .offset = offsetof(struct test_ping_resp_msg_v01,
+ service_name),
+ .ei_array = test_name_type_v01_ei,
+ },
+ {}
+};
+
+struct test_data_req_msg_v01 {
+ u32 data_len;
+ u8 data[TEST_MED_DATA_SIZE_V01];
+
+ u8 client_name_valid;
+ struct test_name_type_v01 client_name;
+};
+
+static const struct qmi_elem_info test_data_req_msg_v01_ei[] = {
+ {
+ .data_type = QMI_DATA_LEN,
+ .elem_len = 1,
+ .elem_size = sizeof(u32),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_REQ1_TLV_TYPE,
+ .offset = offsetof(struct test_data_req_msg_v01,
+ data_len),
+ },
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = TEST_MED_DATA_SIZE_V01,
+ .elem_size = sizeof(u8),
+ .array_type = VAR_LEN_ARRAY,
+ .tlv_type = DATA_REQ1_TLV_TYPE,
+ .offset = offsetof(struct test_data_req_msg_v01,
+ data),
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_req_msg_v01,
+ client_name_valid),
+ },
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct test_name_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_req_msg_v01,
+ client_name),
+ .ei_array = test_name_type_v01_ei,
+ },
+ {}
+};
+
+struct test_data_resp_msg_v01 {
+ struct qmi_response_type_v01 resp;
+
+ u8 data_valid;
+ u32 data_len;
+ u8 data[TEST_MED_DATA_SIZE_V01];
+
+ u8 service_name_valid;
+ struct test_name_type_v01 service_name;
+};
+
+static const struct qmi_elem_info test_data_resp_msg_v01_ei[] = {
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct qmi_response_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_RESP1_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ resp),
+ .ei_array = qmi_response_type_v01_ei,
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ data_valid),
+ },
+ {
+ .data_type = QMI_DATA_LEN,
+ .elem_len = 1,
+ .elem_size = sizeof(u32),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ data_len),
+ },
+ {
+ .data_type = QMI_UNSIGNED_1_BYTE,
+ .elem_len = TEST_MED_DATA_SIZE_V01,
+ .elem_size = sizeof(u8),
+ .array_type = VAR_LEN_ARRAY,
+ .tlv_type = DATA_OPT1_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ data),
+ },
+ {
+ .data_type = QMI_OPT_FLAG,
+ .elem_len = 1,
+ .elem_size = sizeof(u8),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT2_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ service_name_valid),
+ },
+ {
+ .data_type = QMI_STRUCT,
+ .elem_len = 1,
+ .elem_size = sizeof(struct test_name_type_v01),
+ .array_type = NO_ARRAY,
+ .tlv_type = DATA_OPT2_TLV_TYPE,
+ .offset = offsetof(struct test_data_resp_msg_v01,
+ service_name),
+ .ei_array = test_name_type_v01_ei,
+ },
+ {}
+};
+
+/*
+ * ping_write() - ping_pong debugfs file write handler
+ * @file: debugfs file context
+ * @user_buf: reference to the user data (ignored)
+ * @count: number of bytes in @user_buf
+ * @ppos: offset in @file to write
+ *
+ * This function allows user space to send out a ping_pong QMI encoded message
+ * to the associated remote test service and will return with the result of the
+ * transaction. It serves as an example of how to provide a custom response
+ * handler.
+ *
+ * Return: @count, or negative errno on failure.
+ */
+static ssize_t ping_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct qmi_handle *qmi = file->private_data;
+ struct test_ping_req_msg_v01 req = {};
+ struct qmi_txn txn;
+ int ret;
+
+ memcpy(req.ping, "ping", sizeof(req.ping));
+
+ ret = qmi_txn_init(qmi, &txn, NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = qmi_send_request(qmi, NULL, &txn,
+ TEST_PING_REQ_MSG_ID_V01,
+ TEST_PING_REQ_MAX_MSG_LEN_V01,
+ test_ping_req_msg_v01_ei, &req);
+ if (ret < 0) {
+ qmi_txn_cancel(&txn);
+ return ret;
+ }
+
+ ret = qmi_txn_wait(&txn, 5 * HZ);
+ if (ret < 0)
+ count = ret;
+
+ return count;
+}
+
+static const struct file_operations ping_fops = {
+ .open = simple_open,
+ .write = ping_write,
+};
+
+static void ping_pong_cb(struct qmi_handle *qmi, struct sockaddr_qrtr *sq,
+ struct qmi_txn *txn, const void *data)
+{
+ const struct test_ping_resp_msg_v01 *resp = data;
+
+ if (!txn) {
+ pr_err("spurious ping response\n");
+ return;
+ }
+
+ if (resp->resp.result == QMI_RESULT_FAILURE_V01)
+ txn->result = -ENXIO;
+ else if (!resp->pong_valid || memcmp(resp->pong, "pong", 4))
+ txn->result = -EINVAL;
+
+ complete(&txn->completion);
+}
+
+/*
+ * data_write() - data debugfs file write handler
+ * @file: debugfs file context
+ * @user_buf: reference to the user data
+ * @count: number of bytes in @user_buf
+ * @ppos: offset in @file to write
+ *
+ * This function allows user space to send out a data QMI encoded message to
+ * the associated remote test service and will return with the result of the
+ * transaction. It serves as an example of how to have the QMI helpers decode a
+ * transaction response into a provided object automatically.
+ *
+ * Return: @count, or negative errno on failure.
+ */
+static ssize_t data_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+
+{
+ struct qmi_handle *qmi = file->private_data;
+ struct test_data_resp_msg_v01 *resp;
+ struct test_data_req_msg_v01 *req;
+ struct qmi_txn txn;
+ int ret;
+
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ resp = kzalloc(sizeof(*resp), GFP_KERNEL);
+ if (!resp) {
+ kfree(req);
+ return -ENOMEM;
+ }
+
+ req->data_len = min_t(size_t, sizeof(req->data), count);
+ if (copy_from_user(req->data, user_buf, req->data_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = qmi_txn_init(qmi, &txn, test_data_resp_msg_v01_ei, resp);
+ if (ret < 0)
+ goto out;
+
+ ret = qmi_send_request(qmi, NULL, &txn,
+ TEST_DATA_REQ_MSG_ID_V01,
+ TEST_DATA_REQ_MAX_MSG_LEN_V01,
+ test_data_req_msg_v01_ei, req);
+ if (ret < 0) {
+ qmi_txn_cancel(&txn);
+ goto out;
+ }
+
+ ret = qmi_txn_wait(&txn, 5 * HZ);
+ if (ret < 0) {
+ goto out;
+ } else if (!resp->data_valid ||
+ resp->data_len != req->data_len ||
+ memcmp(resp->data, req->data, req->data_len)) {
+ pr_err("response data doesn't match expectation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = count;
+
+out:
+ kfree(resp);
+ kfree(req);
+
+ return ret;
+}
+
+static const struct file_operations data_fops = {
+ .open = simple_open,
+ .write = data_write,
+};
+
+static const struct qmi_msg_handler qmi_sample_handlers[] = { /* empty-entry terminated */
+ {
+ .type = QMI_RESPONSE,
+ .msg_id = TEST_PING_REQ_MSG_ID_V01,
+ .ei = test_ping_resp_msg_v01_ei,
+ .decoded_size = sizeof(struct test_ping_resp_msg_v01), /* must be the struct .ei decodes into */
+ .fn = ping_pong_cb
+ },
+ {}
+};
+
+struct qmi_sample {
+ struct qmi_handle qmi;
+
+ struct dentry *de_dir;
+ struct dentry *de_data;
+ struct dentry *de_ping;
+};
+
+static struct dentry *qmi_debug_dir;
+
+static int qmi_sample_probe(struct platform_device *pdev)
+{
+ struct sockaddr_qrtr *sq;
+ struct qmi_sample *sample;
+ char path[20];
+ int ret;
+
+ sample = devm_kzalloc(&pdev->dev, sizeof(*sample), GFP_KERNEL);
+ if (!sample)
+ return -ENOMEM;
+
+ ret = qmi_handle_init(&sample->qmi, TEST_DATA_REQ_MAX_MSG_LEN_V01,
+ NULL,
+ qmi_sample_handlers);
+ if (ret < 0)
+ return ret;
+
+ sq = dev_get_platdata(&pdev->dev);
+ ret = kernel_connect(sample->qmi.sock, (struct sockaddr_unsized *)sq,
+ sizeof(*sq), 0);
+ if (ret < 0) {
+ pr_err("failed to connect to remote service port\n");
+ goto err_release_qmi_handle;
+ }
+
+ snprintf(path, sizeof(path), "%d:%d", sq->sq_node, sq->sq_port);
+
+ sample->de_dir = debugfs_create_dir(path, qmi_debug_dir);
+ if (IS_ERR(sample->de_dir)) {
+ ret = PTR_ERR(sample->de_dir);
+ goto err_release_qmi_handle;
+ }
+
+ sample->de_data = debugfs_create_file("data", 0600, sample->de_dir,
+ sample, &data_fops);
+ if (IS_ERR(sample->de_data)) {
+ ret = PTR_ERR(sample->de_data);
+ goto err_remove_de_dir;
+ }
+
+ sample->de_ping = debugfs_create_file("ping", 0600, sample->de_dir,
+ sample, &ping_fops);
+ if (IS_ERR(sample->de_ping)) {
+ ret = PTR_ERR(sample->de_ping);
+ goto err_remove_de_data;
+ }
+
+ platform_set_drvdata(pdev, sample);
+
+ return 0;
+
+err_remove_de_data:
+ debugfs_remove(sample->de_data);
+err_remove_de_dir:
+ debugfs_remove(sample->de_dir);
+err_release_qmi_handle:
+ qmi_handle_release(&sample->qmi);
+
+ return ret;
+}
+
+static void qmi_sample_remove(struct platform_device *pdev)
+{
+ struct qmi_sample *sample = platform_get_drvdata(pdev);
+
+ debugfs_remove(sample->de_ping);
+ debugfs_remove(sample->de_data);
+ debugfs_remove(sample->de_dir);
+
+ qmi_handle_release(&sample->qmi);
+}
+
+static struct platform_driver qmi_sample_driver = {
+ .probe = qmi_sample_probe,
+ .remove = qmi_sample_remove,
+ .driver = {
+ .name = "qmi_sample_client",
+ },
+};
+
+static int qmi_sample_new_server(struct qmi_handle *qmi,
+ struct qmi_service *service)
+{
+ struct platform_device *pdev;
+ struct sockaddr_qrtr sq = { AF_QIPCRTR, service->node, service->port };
+ int ret;
+
+ pdev = platform_device_alloc("qmi_sample_client", PLATFORM_DEVID_AUTO);
+ if (!pdev)
+ return -ENOMEM;
+
+ ret = platform_device_add_data(pdev, &sq, sizeof(sq));
+ if (ret)
+ goto err_put_device;
+
+ ret = platform_device_add(pdev);
+ if (ret)
+ goto err_put_device;
+
+ service->priv = pdev;
+
+ return 0;
+
+err_put_device:
+ platform_device_put(pdev);
+
+ return ret;
+}
+
+static void qmi_sample_del_server(struct qmi_handle *qmi,
+ struct qmi_service *service)
+{
+ struct platform_device *pdev = service->priv;
+
+ platform_device_unregister(pdev);
+}
+
+static struct qmi_handle lookup_client;
+
+static const struct qmi_ops lookup_ops = {
+ .new_server = qmi_sample_new_server,
+ .del_server = qmi_sample_del_server,
+};
+
+static int qmi_sample_init(void)
+{
+ int ret;
+
+ qmi_debug_dir = debugfs_create_dir("qmi_sample", NULL);
+ if (IS_ERR(qmi_debug_dir)) {
+ pr_err("failed to create qmi_sample dir\n");
+ return PTR_ERR(qmi_debug_dir);
+ }
+
+ ret = platform_driver_register(&qmi_sample_driver);
+ if (ret)
+ goto err_remove_debug_dir;
+
+ ret = qmi_handle_init(&lookup_client, 0, &lookup_ops, NULL);
+ if (ret < 0)
+ goto err_unregister_driver;
+
+ qmi_add_lookup(&lookup_client, 15, 0, 0);
+
+ return 0;
+
+err_unregister_driver:
+ platform_driver_unregister(&qmi_sample_driver);
+err_remove_debug_dir:
+ debugfs_remove(qmi_debug_dir);
+
+ return ret;
+}
+
+static void qmi_sample_exit(void)
+{
+ qmi_handle_release(&lookup_client);
+
+ platform_driver_unregister(&qmi_sample_driver);
+
+ debugfs_remove(qmi_debug_dir);
+}
+
+module_init(qmi_sample_init);
+module_exit(qmi_sample_exit);
+
+MODULE_DESCRIPTION("Sample QMI client driver");
+MODULE_LICENSE("GPL v2");
diff --git a/samples/rpmsg/Makefile b/samples/rpmsg/Makefile
index 2d4973c69663..ddf9a5d13cad 100644
--- a/samples/rpmsg/Makefile
+++ b/samples/rpmsg/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg_client_sample.o
diff --git a/samples/rpmsg/rpmsg_client_sample.c b/samples/rpmsg/rpmsg_client_sample.c
index 59b13440813d..ae5081662283 100644
--- a/samples/rpmsg/rpmsg_client_sample.c
+++ b/samples/rpmsg/rpmsg_client_sample.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Remote processor messaging - sample client driver
*
@@ -6,15 +7,6 @@
*
* Ohad Ben-Cohen <ohad@wizery.com>
* Brian Swetland <swetland@google.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/kernel.h>
@@ -22,40 +14,56 @@
#include <linux/rpmsg.h>
#define MSG "hello world!"
-#define MSG_LIMIT 100
-static void rpmsg_sample_cb(struct rpmsg_channel *rpdev, void *data, int len,
+static int count = 100;
+module_param(count, int, 0644);
+
+struct instance_data {
+ int rx_count;
+};
+
+static int rpmsg_sample_cb(struct rpmsg_device *rpdev, void *data, int len,
void *priv, u32 src)
{
int ret;
- static int rx_count;
+ struct instance_data *idata = dev_get_drvdata(&rpdev->dev);
- dev_info(&rpdev->dev, "incoming msg %d (src: 0x%x)\n", ++rx_count, src);
+ dev_info(&rpdev->dev, "incoming msg %d (src: 0x%x)\n",
+ ++idata->rx_count, src);
- print_hex_dump(KERN_DEBUG, __func__, DUMP_PREFIX_NONE, 16, 1,
- data, len, true);
+ print_hex_dump_debug(__func__, DUMP_PREFIX_NONE, 16, 1, data, len,
+ true);
/* samples should not live forever */
- if (rx_count >= MSG_LIMIT) {
+ if (idata->rx_count >= count) {
dev_info(&rpdev->dev, "goodbye!\n");
- return;
+ return 0;
}
/* send a new message now */
- ret = rpmsg_send(rpdev, MSG, strlen(MSG));
+ ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG));
if (ret)
dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret);
+
+ return 0;
}
-static int rpmsg_sample_probe(struct rpmsg_channel *rpdev)
+static int rpmsg_sample_probe(struct rpmsg_device *rpdev)
{
int ret;
+ struct instance_data *idata;
dev_info(&rpdev->dev, "new channel: 0x%x -> 0x%x!\n",
rpdev->src, rpdev->dst);
+ idata = devm_kzalloc(&rpdev->dev, sizeof(*idata), GFP_KERNEL);
+ if (!idata)
+ return -ENOMEM;
+
+ dev_set_drvdata(&rpdev->dev, idata);
+
/* send a message to our remote processor */
- ret = rpmsg_send(rpdev, MSG, strlen(MSG));
+ ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG));
if (ret) {
dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret);
return ret;
@@ -64,7 +72,7 @@ static int rpmsg_sample_probe(struct rpmsg_channel *rpdev)
return 0;
}
-static void rpmsg_sample_remove(struct rpmsg_channel *rpdev)
+static void rpmsg_sample_remove(struct rpmsg_device *rpdev)
{
dev_info(&rpdev->dev, "rpmsg sample client driver is removed\n");
}
@@ -77,24 +85,12 @@ MODULE_DEVICE_TABLE(rpmsg, rpmsg_driver_sample_id_table);
static struct rpmsg_driver rpmsg_sample_client = {
.drv.name = KBUILD_MODNAME,
- .drv.owner = THIS_MODULE,
.id_table = rpmsg_driver_sample_id_table,
.probe = rpmsg_sample_probe,
.callback = rpmsg_sample_cb,
.remove = rpmsg_sample_remove,
};
-
-static int __init rpmsg_client_sample_init(void)
-{
- return register_rpmsg_driver(&rpmsg_sample_client);
-}
-module_init(rpmsg_client_sample_init);
-
-static void __exit rpmsg_client_sample_fini(void)
-{
- unregister_rpmsg_driver(&rpmsg_sample_client);
-}
-module_exit(rpmsg_client_sample_fini);
+module_rpmsg_driver(rpmsg_sample_client);
MODULE_DESCRIPTION("Remote processor messaging sample client driver");
MODULE_LICENSE("GPL v2");
diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig
new file mode 100644
index 000000000000..3efa51bfc8ef
--- /dev/null
+++ b/samples/rust/Kconfig
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menuconfig SAMPLES_RUST
+ bool "Rust samples"
+ depends on RUST
+ help
+ You can build sample Rust kernel code here.
+
+ If unsure, say N.
+
+if SAMPLES_RUST
+
+config SAMPLE_RUST_CONFIGFS
+ tristate "Configfs sample"
+ depends on CONFIGFS_FS
+ help
+ This option builds the Rust configfs sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_configfs.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_MINIMAL
+ tristate "Minimal"
+ help
+ This option builds the Rust minimal module sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_minimal.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_MISC_DEVICE
+ tristate "Misc device"
+ help
+ This option builds the Rust misc device sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_misc_device.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_PRINT
+ tristate "Printing macros"
+ help
+ This option builds the Rust printing macros sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_print.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DMA
+ tristate "DMA Test Driver"
+ depends on PCI
+ help
+ This option builds the Rust DMA Test driver sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_dma.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DEBUGFS
+ tristate "DebugFS Test Module"
+ depends on DEBUG_FS
+ help
+ This option builds the Rust DebugFS Test module sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_debugfs.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DEBUGFS_SCOPED
+ tristate "Scoped DebugFS Test Module"
+ depends on DEBUG_FS
+ help
+ This option builds the Rust Scoped DebugFS Test module sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_debugfs_scoped.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DRIVER_I2C
+ tristate "I2C Driver"
+ depends on I2C=y
+ help
+ This option builds the Rust I2C driver sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_driver_i2c.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_I2C_CLIENT
+ tristate "I2C Client Registration"
+ depends on I2C=y
+ help
+ This option builds the Rust I2C client manual creation
+ sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_i2c_client.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DRIVER_PCI
+ tristate "PCI Driver"
+ depends on PCI
+ help
+ This option builds the Rust PCI driver sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_driver_pci.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DRIVER_PLATFORM
+ tristate "Platform Driver"
+ help
+ This option builds the Rust Platform driver sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_driver_platform.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DRIVER_USB
+ tristate "USB Driver"
+ depends on USB = y
+ help
+ This option builds the Rust USB driver sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_driver_usb.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DRIVER_FAUX
+ tristate "Faux Driver"
+ help
+ This option builds the Rust Faux driver sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_driver_faux.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_DRIVER_AUXILIARY
+ tristate "Auxiliary Driver"
+ depends on PCI
+ select AUXILIARY_BUS
+ help
+ This option builds the Rust auxiliary driver sample.
+
+ To compile this as a module, choose M here:
+ the module will be called rust_driver_auxiliary.
+
+ If unsure, say N.
+
+config SAMPLE_RUST_HOSTPROGS
+ bool "Host programs"
+ help
+ This option builds the Rust host program samples.
+
+ If unsure, say N.
+
+endif # SAMPLES_RUST
diff --git a/samples/rust/Makefile b/samples/rust/Makefile
new file mode 100644
index 000000000000..f65885d1d62b
--- /dev/null
+++ b/samples/rust/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+ccflags-y += -I$(src) # needed for trace events
+
+obj-$(CONFIG_SAMPLE_RUST_MINIMAL) += rust_minimal.o
+obj-$(CONFIG_SAMPLE_RUST_MISC_DEVICE) += rust_misc_device.o
+obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o
+obj-$(CONFIG_SAMPLE_RUST_DEBUGFS) += rust_debugfs.o
+obj-$(CONFIG_SAMPLE_RUST_DEBUGFS_SCOPED) += rust_debugfs_scoped.o
+obj-$(CONFIG_SAMPLE_RUST_DMA) += rust_dma.o
+obj-$(CONFIG_SAMPLE_RUST_DRIVER_I2C) += rust_driver_i2c.o
+obj-$(CONFIG_SAMPLE_RUST_I2C_CLIENT) += rust_i2c_client.o
+obj-$(CONFIG_SAMPLE_RUST_DRIVER_PCI) += rust_driver_pci.o
+obj-$(CONFIG_SAMPLE_RUST_DRIVER_PLATFORM) += rust_driver_platform.o
+obj-$(CONFIG_SAMPLE_RUST_DRIVER_USB) += rust_driver_usb.o
+obj-$(CONFIG_SAMPLE_RUST_DRIVER_FAUX) += rust_driver_faux.o
+obj-$(CONFIG_SAMPLE_RUST_DRIVER_AUXILIARY) += rust_driver_auxiliary.o
+obj-$(CONFIG_SAMPLE_RUST_CONFIGFS) += rust_configfs.o
+
+rust_print-y := rust_print_main.o rust_print_events.o
+
+subdir-$(CONFIG_SAMPLE_RUST_HOSTPROGS) += hostprogs
diff --git a/samples/rust/hostprogs/.gitignore b/samples/rust/hostprogs/.gitignore
new file mode 100644
index 000000000000..a6c173da5048
--- /dev/null
+++ b/samples/rust/hostprogs/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+single
diff --git a/samples/rust/hostprogs/Makefile b/samples/rust/hostprogs/Makefile
new file mode 100644
index 000000000000..8ddcbd7416db
--- /dev/null
+++ b/samples/rust/hostprogs/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+hostprogs-always-y := single
+
+single-rust := y
diff --git a/samples/rust/hostprogs/a.rs b/samples/rust/hostprogs/a.rs
new file mode 100644
index 000000000000..f7a4a3d0f4e0
--- /dev/null
+++ b/samples/rust/hostprogs/a.rs
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust single host program sample: module `a`.
+
+/// Prints `x` to standard output as "The number is <x>.".
+pub(crate) fn f(x: i32) {
+ println!("The number is {}.", x);
+}
diff --git a/samples/rust/hostprogs/b.rs b/samples/rust/hostprogs/b.rs
new file mode 100644
index 000000000000..c1675890648f
--- /dev/null
+++ b/samples/rust/hostprogs/b.rs
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust single host program sample: module `b`.
+
+pub(crate) const CONSTANT: i32 = 42;
diff --git a/samples/rust/hostprogs/single.rs b/samples/rust/hostprogs/single.rs
new file mode 100644
index 000000000000..8c48a119339a
--- /dev/null
+++ b/samples/rust/hostprogs/single.rs
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust single host program sample.
+
+mod a;
+mod b;
+
+/// Entry point of the host program: prints a greeting, then passes the
+/// constant from module `b` to the printing function of module `a`.
+fn main() {
+ println!("Hello world!");
+
+ a::f(b::CONSTANT);
+}
diff --git a/samples/rust/rust_configfs.rs b/samples/rust/rust_configfs.rs
new file mode 100644
index 000000000000..0ccc7553ef39
--- /dev/null
+++ b/samples/rust/rust_configfs.rs
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust configfs sample.
+
+use kernel::alloc::flags;
+use kernel::c_str;
+use kernel::configfs;
+use kernel::configfs::configfs_attrs;
+use kernel::new_mutex;
+use kernel::page::PAGE_SIZE;
+use kernel::prelude::*;
+use kernel::sync::Mutex;
+
+module! {
+ type: RustConfigfs,
+ name: "rust_configfs",
+ authors: ["Rust for Linux Contributors"],
+ description: "Rust configfs sample",
+ license: "GPL",
+}
+
+#[pin_data]
+struct RustConfigfs {
+ #[pin]
+ config: configfs::Subsystem<Configuration>,
+}
+
+#[pin_data]
+struct Configuration {
+ message: &'static CStr,
+ #[pin]
+ bar: Mutex<(KBox<[u8; PAGE_SIZE]>, usize)>,
+}
+
+impl Configuration {
+ fn new() -> impl PinInit<Self, Error> {
+ try_pin_init!(Self {
+ message: c_str!("Hello World\n"),
+ bar <- new_mutex!((KBox::new([0; PAGE_SIZE], flags::GFP_KERNEL)?, 0)),
+ })
+ }
+}
+
+impl kernel::InPlaceModule for RustConfigfs {
+ fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> {
+ pr_info!("Rust configfs sample (init)\n");
+
+ // Define a subsystem with the data type `Configuration`, two
+ // attributes, `message` and `bar` and child group type `Child`. `mkdir`
+ // in the directory representing this subsystem will create directories
+ // backed by the `Child` type.
+ let item_type = configfs_attrs! {
+ container: configfs::Subsystem<Configuration>,
+ data: Configuration,
+ child: Child,
+ attributes: [
+ message: 0,
+ bar: 1,
+ ],
+ };
+
+ try_pin_init!(Self {
+ config <- configfs::Subsystem::new(
+ c_str!("rust_configfs"), item_type, Configuration::new()
+ ),
+ })
+ }
+}
+
+#[vtable]
+impl configfs::GroupOperations for Configuration {
+ type Child = Child;
+
+ fn make_group(&self, name: &CStr) -> Result<impl PinInit<configfs::Group<Child>, Error>> {
+ // Define a group with data type `Child`, one attribute `baz` and child
+ // group type `GrandChild`. `mkdir` in the directory representing this
+ // group will create directories backed by the `GrandChild` type.
+ let tpe = configfs_attrs! {
+ container: configfs::Group<Child>,
+ data: Child,
+ child: GrandChild,
+ attributes: [
+ baz: 0,
+ ],
+ };
+
+ Ok(configfs::Group::new(name.try_into()?, tpe, Child::new()))
+ }
+}
+
+// Attribute id 0 is `message` in the `configfs_attrs!` list of the subsystem.
+// Only `show` is implemented for it here.
+#[vtable]
+impl configfs::AttributeOperations<0> for Configuration {
+ type Data = Configuration;
+
+ /// Copies the bytes of `message` to the start of `page` and returns how
+ /// many bytes were written.
+ fn show(container: &Configuration, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ pr_info!("Show message\n");
+ let data = container.message.to_bytes();
+ page[0..data.len()].copy_from_slice(data);
+ Ok(data.len())
+ }
+}
+
+// Attribute id 1 is `bar` in the `configfs_attrs!` list of the subsystem. It
+// is backed by a page-sized buffer plus the number of valid bytes in it, both
+// protected by the `bar` mutex.
+#[vtable]
+impl configfs::AttributeOperations<1> for Configuration {
+    type Data = Configuration;
+
+    /// Copies the currently stored bytes of `bar` to the start of `page` and
+    /// returns how many bytes were written.
+    fn show(container: &Configuration, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+        pr_info!("Show bar\n");
+        let guard = container.bar.lock();
+        let data = guard.0.as_slice();
+        let len = guard.1;
+        page[0..len].copy_from_slice(&data[0..len]);
+        Ok(len)
+    }
+
+    /// Replaces the stored bytes of `bar` with the contents of `page`.
+    ///
+    /// Returns `EINVAL` if `page` does not fit into the backing buffer.
+    fn store(container: &Configuration, page: &[u8]) -> Result {
+        pr_info!("Store bar\n");
+        // The backing buffer is exactly `PAGE_SIZE` bytes. Fail with an error
+        // rather than panicking on the slice index below if more is passed in.
+        if page.len() > PAGE_SIZE {
+            return Err(EINVAL);
+        }
+        let mut guard = container.bar.lock();
+        guard.0[0..page.len()].copy_from_slice(page);
+        guard.1 = page.len();
+        Ok(())
+    }
+}
+
+// `pin_data` cannot handle structs without braces.
+#[pin_data]
+struct Child {}
+
+impl Child {
+ fn new() -> impl PinInit<Self, Error> {
+ try_pin_init!(Self {})
+ }
+}
+
+#[vtable]
+impl configfs::GroupOperations for Child {
+ type Child = GrandChild;
+
+ /// Returns the initializer for a new `GrandChild` group called `name`.
+ fn make_group(&self, name: &CStr) -> Result<impl PinInit<configfs::Group<GrandChild>, Error>> {
+ // Define a group with data type `GrandChild`, one attribute `gc`. As no
+ // child type is specified, it will not be possible to create subgroups
+ // in this group, and `mkdir` in the directory representing this group
+ // will return an error.
+ let tpe = configfs_attrs! {
+ container: configfs::Group<GrandChild>,
+ data: GrandChild,
+ attributes: [
+ gc: 0,
+ ],
+ };
+
+ Ok(configfs::Group::new(
+ name.try_into()?,
+ tpe,
+ GrandChild::new(),
+ ))
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<0> for Child {
+ type Data = Child;
+
+ fn show(_container: &Child, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ pr_info!("Show baz\n");
+ let data = c"Hello Baz\n".to_bytes();
+ page[0..data.len()].copy_from_slice(data);
+ Ok(data.len())
+ }
+}
+
+// `pin_data` cannot handle structs without braces.
+#[pin_data]
+struct GrandChild {}
+
+impl GrandChild {
+ fn new() -> impl PinInit<Self, Error> {
+ try_pin_init!(Self {})
+ }
+}
+
+#[vtable]
+impl configfs::AttributeOperations<0> for GrandChild {
+ type Data = GrandChild;
+
+ fn show(_container: &GrandChild, page: &mut [u8; PAGE_SIZE]) -> Result<usize> {
+ pr_info!("Show grand child\n");
+ let data = c"Hello GC\n".to_bytes();
+ page[0..data.len()].copy_from_slice(data);
+ Ok(data.len())
+ }
+}
diff --git a/samples/rust/rust_debugfs.rs b/samples/rust/rust_debugfs.rs
new file mode 100644
index 000000000000..025e8f9d12de
--- /dev/null
+++ b/samples/rust/rust_debugfs.rs
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2025 Google LLC.
+
+//! Sample DebugFS exporting platform driver
+//!
+//! To successfully probe this driver with ACPI, use an ssdt that looks like
+//!
+//! ```dsl
+//! DefinitionBlock ("", "SSDT", 2, "TEST", "VIRTACPI", 0x00000001)
+//! {
+//! Scope (\_SB)
+//! {
+//! Device (T432)
+//! {
+//! Name (_HID, "LNUXBEEF") // ACPI hardware ID to match
+//! Name (_UID, 1)
+//! Name (_STA, 0x0F) // Device present, enabled
+//! Name (_DSD, Package () { // Sample attribute
+//! ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+//! Package() {
+//! Package(2) {"compatible", "sample-debugfs"}
+//! }
+//! })
+//! Name (_CRS, ResourceTemplate ()
+//! {
+//! Memory32Fixed (ReadWrite, 0xFED00000, 0x1000)
+//! })
+//! }
+//! }
+//! }
+//! ```
+
+use core::str::FromStr;
+use kernel::c_str;
+use kernel::debugfs::{Dir, File};
+use kernel::new_mutex;
+use kernel::prelude::*;
+use kernel::sizes::*;
+use kernel::sync::atomic::{Atomic, Relaxed};
+use kernel::sync::Mutex;
+use kernel::{acpi, device::Core, of, platform, str::CString, types::ARef};
+
+kernel::module_platform_driver! {
+ type: RustDebugFs,
+ name: "rust_debugfs",
+ authors: ["Matthew Maurer"],
+ description: "Rust DebugFS usage sample",
+ license: "GPL",
+}
+
+#[pin_data]
+struct RustDebugFs {
+ pdev: ARef<platform::Device>,
+ // As we only hold these for drop effect (to remove the directory/files) we have a leading
+ // underscore to indicate to the compiler that we don't expect to use this field directly.
+ _debugfs: Dir,
+ #[pin]
+ _compatible: File<CString>,
+ #[pin]
+ counter: File<Atomic<usize>>,
+ #[pin]
+ inner: File<Mutex<Inner>>,
+ #[pin]
+ array_blob: File<Mutex<[u8; 4]>>,
+ #[pin]
+ vector_blob: File<Mutex<KVec<u8>>>,
+}
+
+#[derive(Debug)]
+struct Inner {
+ x: u32,
+ y: u32,
+}
+
+impl FromStr for Inner {
+    type Err = Error;
+
+    /// Parses exactly two whitespace-separated `u32` values into `x` and `y`.
+    ///
+    /// Returns `EINVAL` if a value is missing, is not a valid `u32`, or if any
+    /// further token follows the second value.
+    fn from_str(s: &str) -> Result<Self> {
+        let mut fields = s.split_whitespace();
+
+        // Pulls the next token and converts it, mapping every failure to `EINVAL`.
+        let mut next_u32 = || -> Result<u32> {
+            let token = fields.next().ok_or(EINVAL)?;
+            token.parse::<u32>().map_err(|_| EINVAL)
+        };
+
+        let x = next_u32()?;
+        let y = next_u32()?;
+
+        match fields.next() {
+            // Trailing input is not accepted.
+            Some(_) => Err(EINVAL),
+            None => Ok(Inner { x, y }),
+        }
+    }
+}
+
+kernel::acpi_device_table!(
+ ACPI_TABLE,
+ MODULE_ACPI_TABLE,
+ <RustDebugFs as platform::Driver>::IdInfo,
+ [(acpi::DeviceId::new(c_str!("LNUXBEEF")), ())]
+);
+
+impl platform::Driver for RustDebugFs {
+ type IdInfo = ();
+ // Only an ACPI match table is provided; no devicetree table is set.
+ const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = None;
+ const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE);
+
+ /// Builds the driver data with `RustDebugFs::new` and then, once it is
+ /// initialized, updates the values behind the `counter` and `inner` files.
+ fn probe(
+ pdev: &platform::Device<Core>,
+ _info: Option<&Self::IdInfo>,
+ ) -> impl PinInit<Self, Error> {
+ RustDebugFs::new(pdev).pin_chain(|this| {
+ this.counter.store(91, Relaxed);
+ {
+ // The lock guard is confined to this inner scope so it is
+ // released before the closure returns.
+ let mut guard = this.inner.lock();
+ // Order matters: `x` takes the old `y` before `y` is overwritten.
+ guard.x = guard.y;
+ guard.y = 42;
+ }
+
+ Ok(())
+ })
+ }
+}
+
+impl RustDebugFs {
+ fn build_counter(dir: &Dir) -> impl PinInit<File<Atomic<usize>>> + '_ {
+ dir.read_write_file(c_str!("counter"), Atomic::<usize>::new(0))
+ }
+
+ fn build_inner(dir: &Dir) -> impl PinInit<File<Mutex<Inner>>> + '_ {
+ dir.read_write_file(c_str!("pair"), new_mutex!(Inner { x: 3, y: 10 }))
+ }
+
+ fn new(pdev: &platform::Device<Core>) -> impl PinInit<Self, Error> + '_ {
+ let debugfs = Dir::new(c_str!("sample_debugfs"));
+ let dev = pdev.as_ref();
+
+ try_pin_init! {
+ Self {
+ _compatible <- debugfs.read_only_file(
+ c_str!("compatible"),
+ dev.fwnode()
+ .ok_or(ENOENT)?
+ .property_read::<CString>(c_str!("compatible"))
+ .required_by(dev)?,
+ ),
+ counter <- Self::build_counter(&debugfs),
+ inner <- Self::build_inner(&debugfs),
+ array_blob <- debugfs.read_write_binary_file(
+ c_str!("array_blob"),
+ new_mutex!([0x62, 0x6c, 0x6f, 0x62]),
+ ),
+ vector_blob <- debugfs.read_write_binary_file(
+ c_str!("vector_blob"),
+ new_mutex!(kernel::kvec!(0x42; SZ_4K)?),
+ ),
+ _debugfs: debugfs,
+ pdev: pdev.into(),
+ }
+ }
+ }
+}
diff --git a/samples/rust/rust_debugfs_scoped.rs b/samples/rust/rust_debugfs_scoped.rs
new file mode 100644
index 000000000000..702a6546d3fb
--- /dev/null
+++ b/samples/rust/rust_debugfs_scoped.rs
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2025 Google LLC.
+
+//! Sample DebugFS exporting platform driver that demonstrates the use of
+//! `Scope::dir` to create a variety of files without the need to separately
+//! track them all.
+
+use kernel::debugfs::{Dir, Scope};
+use kernel::prelude::*;
+use kernel::sizes::*;
+use kernel::sync::atomic::Atomic;
+use kernel::sync::Mutex;
+use kernel::{c_str, new_mutex, str::CString};
+
+module! {
+ type: RustScopedDebugFs,
+ name: "rust_debugfs_scoped",
+ authors: ["Matthew Maurer"],
+ description: "Rust Scoped DebugFS usage sample",
+ license: "GPL",
+}
+
+/// Write handler of the `remove` control file.
+///
+/// Reads a device name from `reader`, strips surrounding whitespace and drops
+/// every entry of `mod_data.devices` whose name matches it exactly.
+///
+/// Returns `EINVAL` if the input is 128 bytes or longer, is not valid UTF-8 or
+/// contains a NUL byte; errors from reading the user buffer are propagated.
+fn remove_file_write(
+    mod_data: &ModuleData,
+    reader: &mut kernel::uaccess::UserSliceReader,
+) -> Result {
+    let mut buf = [0u8; 128];
+    // Inputs that fill the whole buffer are rejected, so the set of accepted
+    // input lengths is unchanged from before.
+    if reader.len() >= buf.len() {
+        return Err(EINVAL);
+    }
+    let n = reader.len();
+    reader.read_slice(&mut buf[..n])?;
+
+    // Compare against the trimmed bytes themselves. `trim()` may also strip
+    // leading whitespace, so the trimmed name does not necessarily start at
+    // `buf[0]` and must not be rebuilt from the start of the buffer.
+    let to_remove = core::str::from_utf8(&buf[..n])
+        .map_err(|_| EINVAL)?
+        .trim()
+        .as_bytes();
+    // A name with an embedded NUL byte remains an error.
+    if to_remove.contains(&0) {
+        return Err(EINVAL);
+    }
+    mod_data
+        .devices
+        .lock()
+        .retain(|device| device.name.to_bytes() != to_remove);
+    Ok(())
+}
+
+/// Write handler of the `create` control file.
+///
+/// The input is split on whitespace: the first token is the name of the new
+/// directory, every following token is parsed as a `usize`. The directory is
+/// created below `mod_data.device_dir` and gets one read/write file per number
+/// (named after its index) plus a binary file called `blob`. The resulting
+/// scope is appended to `mod_data.devices`.
+///
+/// Returns `EINVAL` if the input is longer than 128 bytes, is not valid UTF-8,
+/// contains no name, or contains a token that does not parse as a number;
+/// other errors (reading the user buffer, allocations) are propagated.
+fn create_file_write(
+ mod_data: &ModuleData,
+ reader: &mut kernel::uaccess::UserSliceReader,
+) -> Result {
+ let mut buf = [0u8; 128];
+ if reader.len() > buf.len() {
+ return Err(EINVAL);
+ }
+ let n = reader.len();
+ reader.read_slice(&mut buf[..n])?;
+
+ let mut nums = KVec::new();
+
+ let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?.trim();
+ let mut items = s.split_whitespace();
+ let name_str = items.next().ok_or(EINVAL)?;
+ // Two copies of the name: `name` is moved into `DeviceData`, while
+ // `file_name` is borrowed for the directory name in the `scope` call below.
+ let name = CString::try_from_fmt(fmt!("{name_str}"))?;
+ let file_name = CString::try_from_fmt(fmt!("{name_str}"))?;
+ for sub in items {
+ nums.push(
+ Atomic::<usize>::new(sub.parse().map_err(|_| EINVAL)?),
+ GFP_KERNEL,
+ )?;
+ }
+ // Every device gets a blob of `SZ_4K` bytes, each initialized to 0x42.
+ let blob = KBox::pin_init(new_mutex!([0x42; SZ_4K]), GFP_KERNEL)?;
+
+ let scope = KBox::pin_init(
+ mod_data.device_dir.scope(
+ DeviceData { name, nums, blob },
+ &file_name,
+ |dev_data, dir| {
+ for (idx, val) in dev_data.nums.iter().enumerate() {
+ // If a file name cannot be built, stop populating the
+ // directory; the remaining numbers and `blob` get no file.
+ let Ok(name) = CString::try_from_fmt(fmt!("{idx}")) else {
+ return;
+ };
+ dir.read_write_file(&name, val);
+ }
+ dir.read_write_binary_file(c_str!("blob"), &dev_data.blob);
+ },
+ ),
+ GFP_KERNEL,
+ )?;
+ (*mod_data.devices.lock()).push(scope, GFP_KERNEL)?;
+
+ Ok(())
+}
+
+struct RustScopedDebugFs {
+ _data: Pin<KBox<Scope<ModuleData>>>,
+}
+
+#[pin_data]
+struct ModuleData {
+ device_dir: Dir,
+ #[pin]
+ devices: Mutex<KVec<Pin<KBox<Scope<DeviceData>>>>>,
+}
+
+impl ModuleData {
+ fn init(device_dir: Dir) -> impl PinInit<Self> {
+ pin_init! {
+ Self {
+ device_dir: device_dir,
+ devices <- new_mutex!(KVec::new())
+ }
+ }
+ }
+}
+
+struct DeviceData {
+ name: CString,
+ nums: KVec<Atomic<usize>>,
+ blob: Pin<KBox<Mutex<[u8; SZ_4K]>>>,
+}
+
+/// Returns the initializer for the `control` scope below `base_dir`.
+///
+/// The scope's data is a `ModuleData` built from `dyn_dirs`. Two write-only
+/// files are registered in it: `create`, handled by `create_file_write`, and
+/// `remove`, handled by `remove_file_write`.
+fn init_control(base_dir: &Dir, dyn_dirs: Dir) -> impl PinInit<Scope<ModuleData>> + '_ {
+ base_dir.scope(
+ ModuleData::init(dyn_dirs),
+ c_str!("control"),
+ |data, dir| {
+ dir.write_only_callback_file(c_str!("create"), data, &create_file_write);
+ dir.write_only_callback_file(c_str!("remove"), data, &remove_file_write);
+ },
+ )
+}
+
+impl kernel::Module for RustScopedDebugFs {
+ fn init(_module: &'static kernel::ThisModule) -> Result<Self> {
+ let base_dir = Dir::new(c_str!("rust_scoped_debugfs"));
+ let dyn_dirs = base_dir.subdir(c_str!("dynamic"));
+ Ok(Self {
+ _data: KBox::pin_init(init_control(&base_dir, dyn_dirs), GFP_KERNEL)?,
+ })
+ }
+}
diff --git a/samples/rust/rust_dma.rs b/samples/rust/rust_dma.rs
new file mode 100644
index 000000000000..f53bce2a73e3
--- /dev/null
+++ b/samples/rust/rust_dma.rs
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust DMA api test (based on QEMU's `pci-testdev`).
+//!
+//! To make this driver probe, QEMU must be run with `-device pci-testdev`.
+
+use kernel::{
+ device::Core,
+ dma::{CoherentAllocation, DataDirection, Device, DmaMask},
+ page, pci,
+ prelude::*,
+ scatterlist::{Owned, SGTable},
+ sync::aref::ARef,
+};
+
+#[pin_data(PinnedDrop)]
+struct DmaSampleDriver {
+ pdev: ARef<pci::Device>,
+ ca: CoherentAllocation<MyStruct>,
+ #[pin]
+ sgt: SGTable<Owned<VVec<u8>>>,
+}
+
+const TEST_VALUES: [(u32, u32); 5] = [
+ (0xa, 0xb),
+ (0xc, 0xd),
+ (0xe, 0xf),
+ (0xab, 0xba),
+ (0xcd, 0xef),
+];
+
+struct MyStruct {
+ h: u32,
+ b: u32,
+}
+
+impl MyStruct {
+ fn new(h: u32, b: u32) -> Self {
+ Self { h, b }
+ }
+}
+// SAFETY: Instances of `MyStruct` have no uninitialized portions.
+unsafe impl kernel::transmute::AsBytes for MyStruct {}
+// SAFETY: All bit patterns are acceptable values for `MyStruct`.
+unsafe impl kernel::transmute::FromBytes for MyStruct {}
+
+kernel::pci_device_table!(
+ PCI_TABLE,
+ MODULE_PCI_TABLE,
+ <DmaSampleDriver as pci::Driver>::IdInfo,
+ [(pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), ())]
+);
+
+impl pci::Driver for DmaSampleDriver {
+ type IdInfo = ();
+ const ID_TABLE: pci::IdTable<Self::IdInfo> = &PCI_TABLE;
+
+ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, Error> {
+ pin_init::pin_init_scope(move || {
+ dev_info!(pdev.as_ref(), "Probe DMA test driver.\n");
+
+ let mask = DmaMask::new::<64>();
+
+ // SAFETY: There are no concurrent calls to DMA allocation and mapping primitives.
+ unsafe { pdev.dma_set_mask_and_coherent(mask)? };
+
+ let ca: CoherentAllocation<MyStruct> =
+ CoherentAllocation::alloc_coherent(pdev.as_ref(), TEST_VALUES.len(), GFP_KERNEL)?;
+
+ for (i, value) in TEST_VALUES.into_iter().enumerate() {
+ kernel::dma_write!(ca[i] = MyStruct::new(value.0, value.1))?;
+ }
+
+ let size = 4 * page::PAGE_SIZE;
+ let pages = VVec::with_capacity(size, GFP_KERNEL)?;
+
+ let sgt = SGTable::new(pdev.as_ref(), pages, DataDirection::ToDevice, GFP_KERNEL);
+
+ Ok(try_pin_init!(Self {
+ pdev: pdev.into(),
+ ca,
+ sgt <- sgt,
+ }))
+ })
+ }
+}
+
+#[pinned_drop]
+impl PinnedDrop for DmaSampleDriver {
+    /// Checks that the coherent allocation still holds the values written in
+    /// `probe` and logs the DMA address of every scatterlist entry.
+    fn drop(self: Pin<&mut Self>) {
+        let dev = self.pdev.as_ref();
+
+        dev_info!(dev, "Unload DMA test driver.\n");
+
+        // Read every element back and compare it with the test value that
+        // `probe` stored at the same index.
+        for (i, value) in TEST_VALUES.into_iter().enumerate() {
+            let val0 = kernel::dma_read!(self.ca[i].h);
+            let val1 = kernel::dma_read!(self.ca[i].b);
+            assert!(val0.is_ok());
+            assert!(val1.is_ok());
+
+            if let Ok(val0) = val0 {
+                assert_eq!(val0, value.0);
+            }
+            if let Ok(val1) = val1 {
+                assert_eq!(val1, value.1);
+            }
+        }
+
+        for (i, entry) in self.sgt.iter().enumerate() {
+            // Terminate the message with a newline like the other log calls.
+            dev_info!(dev, "Entry[{}]: DMA address: {:#x}\n", i, entry.dma_address());
+        }
+    }
+}
+
+kernel::module_pci_driver! {
+ type: DmaSampleDriver,
+ name: "rust_dma",
+ authors: ["Abdiel Janulgue"],
+ description: "Rust DMA test",
+ license: "GPL v2",
+}
diff --git a/samples/rust/rust_driver_auxiliary.rs b/samples/rust/rust_driver_auxiliary.rs
new file mode 100644
index 000000000000..5761ea314f44
--- /dev/null
+++ b/samples/rust/rust_driver_auxiliary.rs
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust auxiliary driver sample (based on a PCI driver for QEMU's `pci-testdev`).
+//!
+//! To make this driver probe, QEMU must be run with `-device pci-testdev`.
+
+use kernel::{
+ auxiliary, c_str,
+ device::{Bound, Core},
+ devres::Devres,
+ driver,
+ error::Error,
+ pci,
+ prelude::*,
+ InPlaceModule,
+};
+
+use core::any::TypeId;
+use pin_init::PinInit;
+
+const MODULE_NAME: &CStr = <LocalModule as kernel::ModuleMetadata>::NAME;
+const AUXILIARY_NAME: &CStr = c_str!("auxiliary");
+
+struct AuxiliaryDriver;
+
+kernel::auxiliary_device_table!(
+ AUX_TABLE,
+ MODULE_AUX_TABLE,
+ <AuxiliaryDriver as auxiliary::Driver>::IdInfo,
+ [(auxiliary::DeviceId::new(MODULE_NAME, AUXILIARY_NAME), ())]
+);
+
+impl auxiliary::Driver for AuxiliaryDriver {
+ type IdInfo = ();
+
+ const ID_TABLE: auxiliary::IdTable<Self::IdInfo> = &AUX_TABLE;
+
+ fn probe(adev: &auxiliary::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, Error> {
+ dev_info!(
+ adev.as_ref(),
+ "Probing auxiliary driver for auxiliary device with id={}\n",
+ adev.id()
+ );
+
+ ParentDriver::connect(adev)?;
+
+ Ok(Self)
+ }
+}
+
+#[pin_data]
+struct ParentDriver {
+ private: TypeId,
+ #[pin]
+ _reg0: Devres<auxiliary::Registration>,
+ #[pin]
+ _reg1: Devres<auxiliary::Registration>,
+}
+
+kernel::pci_device_table!(
+ PCI_TABLE,
+ MODULE_PCI_TABLE,
+ <ParentDriver as pci::Driver>::IdInfo,
+ [(pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5), ())]
+);
+
+impl pci::Driver for ParentDriver {
+ type IdInfo = ();
+
+ const ID_TABLE: pci::IdTable<Self::IdInfo> = &PCI_TABLE;
+
+ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, Error> {
+ try_pin_init!(Self {
+ private: TypeId::of::<Self>(),
+ _reg0 <- auxiliary::Registration::new(pdev.as_ref(), AUXILIARY_NAME, 0, MODULE_NAME),
+ _reg1 <- auxiliary::Registration::new(pdev.as_ref(), AUXILIARY_NAME, 1, MODULE_NAME),
+ })
+ }
+}
+
+impl ParentDriver {
+ /// Called from the auxiliary driver's `probe`: looks up the parent of
+ /// `adev`, treats it as a PCI device, fetches its `ParentDriver` driver
+ /// data and logs the IDs and the `private` field.
+ ///
+ /// Fails if the parent cannot be converted to a PCI device or if its
+ /// driver data cannot be obtained as `ParentDriver`.
+ fn connect(adev: &auxiliary::Device<Bound>) -> Result {
+ let dev = adev.parent();
+ let pdev: &pci::Device<Bound> = dev.try_into()?;
+ let drvdata = dev.drvdata::<Self>()?;
+
+ dev_info!(
+ dev,
+ "Connect auxiliary {} with parent: VendorID={}, DeviceID={:#x}\n",
+ adev.id(),
+ pdev.vendor_id(),
+ pdev.device_id()
+ );
+
+ dev_info!(
+ dev,
+ "We have access to the private data of {:?}.\n",
+ drvdata.private
+ );
+
+ Ok(())
+ }
+}
+
+#[pin_data]
+struct SampleModule {
+ #[pin]
+ _pci_driver: driver::Registration<pci::Adapter<ParentDriver>>,
+ #[pin]
+ _aux_driver: driver::Registration<auxiliary::Adapter<AuxiliaryDriver>>,
+}
+
+impl InPlaceModule for SampleModule {
+ fn init(module: &'static kernel::ThisModule) -> impl PinInit<Self, Error> {
+ try_pin_init!(Self {
+ _pci_driver <- driver::Registration::new(MODULE_NAME, module),
+ _aux_driver <- driver::Registration::new(MODULE_NAME, module),
+ })
+ }
+}
+
+module! {
+ type: SampleModule,
+ name: "rust_driver_auxiliary",
+ authors: ["Danilo Krummrich"],
+ description: "Rust auxiliary driver",
+ license: "GPL v2",
+}
diff --git a/samples/rust/rust_driver_faux.rs b/samples/rust/rust_driver_faux.rs
new file mode 100644
index 000000000000..ecc9fd378cbd
--- /dev/null
+++ b/samples/rust/rust_driver_faux.rs
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+//! Rust faux device sample.
+
+use kernel::{c_str, faux, prelude::*, Module};
+
+// NOTE(review): `name` below is "rust_faux_driver", while the Kconfig help for
+// SAMPLE_RUST_DRIVER_FAUX says the module is called `rust_driver_faux` (which
+// matches the object name in the Makefile) - confirm whether the two are meant
+// to differ.
+module! {
+ type: SampleModule,
+ name: "rust_faux_driver",
+ authors: ["Lyude Paul"],
+ description: "Rust faux device sample",
+ license: "GPL",
+}
+
+struct SampleModule {
+ _reg: faux::Registration,
+}
+
+impl Module for SampleModule {
+ fn init(_module: &'static ThisModule) -> Result<Self> {
+ pr_info!("Initialising Rust Faux Device Sample\n");
+
+ let reg = faux::Registration::new(c_str!("rust-faux-sample-device"), None)?;
+
+ dev_info!(reg.as_ref(), "Hello from faux device!\n");
+
+ Ok(Self { _reg: reg })
+ }
+}
diff --git a/samples/rust/rust_driver_i2c.rs b/samples/rust/rust_driver_i2c.rs
new file mode 100644
index 000000000000..ecefeca3e22f
--- /dev/null
+++ b/samples/rust/rust_driver_i2c.rs
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust I2C driver sample.
+
+use kernel::{
+ acpi,
+ c_str,
+ device::Core,
+ i2c,
+ of,
+ prelude::*, //
+};
+
+struct SampleDriver;
+
+kernel::acpi_device_table! {
+ ACPI_TABLE,
+ MODULE_ACPI_TABLE,
+ <SampleDriver as i2c::Driver>::IdInfo,
+ [(acpi::DeviceId::new(c_str!("LNUXBEEF")), 0)]
+}
+
+kernel::i2c_device_table! {
+ I2C_TABLE,
+ MODULE_I2C_TABLE,
+ <SampleDriver as i2c::Driver>::IdInfo,
+ [(i2c::DeviceId::new(c_str!("rust_driver_i2c")), 0)]
+}
+
+kernel::of_device_table! {
+ OF_TABLE,
+ MODULE_OF_TABLE,
+ <SampleDriver as i2c::Driver>::IdInfo,
+ [(of::DeviceId::new(c_str!("test,rust_driver_i2c")), 0)]
+}
+
+impl i2c::Driver for SampleDriver {
+ type IdInfo = u32;
+
+ const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE);
+ const I2C_ID_TABLE: Option<i2c::IdTable<Self::IdInfo>> = Some(&I2C_TABLE);
+ const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = Some(&OF_TABLE);
+
+ fn probe(
+ idev: &i2c::I2cClient<Core>,
+ info: Option<&Self::IdInfo>,
+ ) -> impl PinInit<Self, Error> {
+ let dev = idev.as_ref();
+
+ dev_info!(dev, "Probe Rust I2C driver sample.\n");
+
+ if let Some(info) = info {
+ dev_info!(dev, "Probed with info: '{}'.\n", info);
+ }
+
+ Ok(Self)
+ }
+
+ fn shutdown(idev: &i2c::I2cClient<Core>, _this: Pin<&Self>) {
+ dev_info!(idev.as_ref(), "Shutdown Rust I2C driver sample.\n");
+ }
+
+ fn unbind(idev: &i2c::I2cClient<Core>, _this: Pin<&Self>) {
+ dev_info!(idev.as_ref(), "Unbind Rust I2C driver sample.\n");
+ }
+}
+
+kernel::module_i2c_driver! {
+ type: SampleDriver,
+ name: "rust_driver_i2c",
+ authors: ["Igor Korotin"],
+ description: "Rust I2C driver",
+ license: "GPL v2",
+}
diff --git a/samples/rust/rust_driver_pci.rs b/samples/rust/rust_driver_pci.rs
new file mode 100644
index 000000000000..5823787bea8e
--- /dev/null
+++ b/samples/rust/rust_driver_pci.rs
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust PCI driver sample (based on QEMU's `pci-testdev`).
+//!
+//! To make this driver probe, QEMU must be run with `-device pci-testdev`.
+
+use kernel::{c_str, device::Core, devres::Devres, pci, prelude::*, sync::aref::ARef};
+
+struct Regs;
+
+impl Regs {
+ const TEST: usize = 0x0;
+ const OFFSET: usize = 0x4;
+ const DATA: usize = 0x8;
+ const COUNT: usize = 0xC;
+ const END: usize = 0x10;
+}
+
+type Bar0 = pci::Bar<{ Regs::END }>;
+
+#[derive(Copy, Clone, Debug)]
+struct TestIndex(u8);
+
+impl TestIndex {
+ const NO_EVENTFD: Self = Self(0);
+}
+
+#[pin_data(PinnedDrop)]
+struct SampleDriver {
+ pdev: ARef<pci::Device>,
+ #[pin]
+ bar: Devres<Bar0>,
+ index: TestIndex,
+}
+
+kernel::pci_device_table!(
+ PCI_TABLE,
+ MODULE_PCI_TABLE,
+ <SampleDriver as pci::Driver>::IdInfo,
+ [(
+ pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5),
+ TestIndex::NO_EVENTFD
+ )]
+);
+
+impl SampleDriver {
+ /// Runs one test on the device behind `bar`: selects the test given by
+ /// `index`, reads the offset and data registers, writes the data byte to
+ /// that offset and returns the value of the count register.
+ ///
+ /// Fails if the write to the runtime-provided offset is rejected by
+ /// `try_write8`.
+ fn testdev(index: &TestIndex, bar: &Bar0) -> Result<u32> {
+ // Select the test.
+ bar.write8(index.0, Regs::TEST);
+
+ // NOTE(review): the offset is passed through `u32::from_le`, while the
+ // `COUNT` value read below is returned as-is - confirm that this
+ // difference is intended.
+ let offset = u32::from_le(bar.read32(Regs::OFFSET)) as usize;
+ let data = bar.read8(Regs::DATA);
+
+ // Write `data` to `offset` to increase `count` by one.
+ //
+ // Note that we need `try_write8`, since `offset` can't be checked at compile-time.
+ bar.try_write8(data, offset)?;
+
+ Ok(bar.read32(Regs::COUNT))
+ }
+}
+
+impl pci::Driver for SampleDriver {
+ type IdInfo = TestIndex;
+
+ const ID_TABLE: pci::IdTable<Self::IdInfo> = &PCI_TABLE;
+
+ fn probe(pdev: &pci::Device<Core>, info: &Self::IdInfo) -> impl PinInit<Self, Error> {
+ pin_init::pin_init_scope(move || {
+ let vendor = pdev.vendor_id();
+ dev_dbg!(
+ pdev.as_ref(),
+ "Probe Rust PCI driver sample (PCI ID: {}, 0x{:x}).\n",
+ vendor,
+ pdev.device_id()
+ );
+
+ pdev.enable_device_mem()?;
+ pdev.set_master();
+
+ Ok(try_pin_init!(Self {
+ bar <- pdev.iomap_region_sized::<{ Regs::END }>(0, c_str!("rust_driver_pci")),
+ index: *info,
+ _: {
+ let bar = bar.access(pdev.as_ref())?;
+
+ dev_info!(
+ pdev.as_ref(),
+ "pci-testdev data-match count: {}\n",
+ Self::testdev(info, bar)?
+ );
+ },
+ pdev: pdev.into(),
+ }))
+ })
+ }
+
+ fn unbind(pdev: &pci::Device<Core>, this: Pin<&Self>) {
+ if let Ok(bar) = this.bar.access(pdev.as_ref()) {
+ // Reset pci-testdev by writing a new test index.
+ bar.write8(this.index.0, Regs::TEST);
+ }
+ }
+}
+
+#[pinned_drop]
+impl PinnedDrop for SampleDriver {
+ fn drop(self: Pin<&mut Self>) {
+ dev_dbg!(self.pdev.as_ref(), "Remove Rust PCI driver sample.\n");
+ }
+}
+
+kernel::module_pci_driver! {
+ type: SampleDriver,
+ name: "rust_driver_pci",
+ authors: ["Danilo Krummrich"],
+ description: "Rust PCI driver",
+ license: "GPL v2",
+}
diff --git a/samples/rust/rust_driver_platform.rs b/samples/rust/rust_driver_platform.rs
new file mode 100644
index 000000000000..6bf4f0c9633d
--- /dev/null
+++ b/samples/rust/rust_driver_platform.rs
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust Platform driver sample.
+
+//! ACPI match table test
+//!
+//! This demonstrates how to test an ACPI-based Rust platform driver using QEMU
+//! with a custom SSDT.
+//!
+//! Steps:
+//!
+//! 1. **Create an SSDT source file** (`ssdt.dsl`) with the following content:
+//!
+//! ```asl
+//! DefinitionBlock ("", "SSDT", 2, "TEST", "VIRTACPI", 0x00000001)
+//! {
+//! Scope (\_SB)
+//! {
+//! Device (T432)
+//! {
+//! Name (_HID, "LNUXBEEF") // ACPI hardware ID to match
+//! Name (_UID, 1)
+//! Name (_STA, 0x0F) // Device present, enabled
+//! Name (_CRS, ResourceTemplate ()
+//! {
+//! Memory32Fixed (ReadWrite, 0xFED00000, 0x1000)
+//! })
+//! }
+//! }
+//! }
+//! ```
+//!
+//! 2. **Compile the table**:
+//!
+//! ```sh
+//! iasl -tc ssdt.dsl
+//! ```
+//!
+//! This generates `ssdt.aml`
+//!
+//! 3. **Run QEMU** with the compiled AML file:
+//!
+//! ```sh
+//! qemu-system-x86_64 -m 512M \
+//! -enable-kvm \
+//! -kernel path/to/bzImage \
+//! -append "root=/dev/sda console=ttyS0" \
+//! -hda rootfs.img \
+//! -serial stdio \
+//! -acpitable file=ssdt.aml
+//! ```
+//!
+//! Requirements:
+//! - The `rust_driver_platform` must be present either:
+//! - built directly into the kernel (`bzImage`), or
+//! - available as a `.ko` file and loadable from `rootfs.img`
+//!
+//! 4. **Verify it worked** by checking `dmesg`:
+//!
+//! ```
+//! rust_driver_platform LNUXBEEF:00: Probed with info: '0'.
+//! ```
+//!
+
+use kernel::{
+ acpi, c_str,
+ device::{
+ self,
+ property::{FwNodeReferenceArgs, NArgs},
+ Core,
+ },
+ of, platform,
+ prelude::*,
+ str::CString,
+ sync::aref::ARef,
+};
+
+struct SampleDriver {
+ pdev: ARef<platform::Device>,
+}
+
+struct Info(u32);
+
+kernel::of_device_table!(
+ OF_TABLE,
+ MODULE_OF_TABLE,
+ <SampleDriver as platform::Driver>::IdInfo,
+ [(of::DeviceId::new(c_str!("test,rust-device")), Info(42))]
+);
+
+kernel::acpi_device_table!(
+ ACPI_TABLE,
+ MODULE_ACPI_TABLE,
+ <SampleDriver as platform::Driver>::IdInfo,
+ [(acpi::DeviceId::new(c_str!("LNUXBEEF")), Info(0))]
+);
+
+impl platform::Driver for SampleDriver {
+ type IdInfo = Info;
+ const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = Some(&OF_TABLE);
+ const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE);
+
+ fn probe(
+ pdev: &platform::Device<Core>,
+ info: Option<&Self::IdInfo>,
+ ) -> impl PinInit<Self, Error> {
+ let dev = pdev.as_ref();
+
+ dev_dbg!(dev, "Probe Rust Platform driver sample.\n");
+
+ if let Some(info) = info {
+ dev_info!(dev, "Probed with info: '{}'.\n", info.0);
+ }
+
+ if dev.fwnode().is_some_and(|node| node.is_of_node()) {
+ Self::properties_parse(dev)?;
+ }
+
+ Ok(Self { pdev: pdev.into() })
+ }
+}
+
+impl SampleDriver {
+ fn properties_parse(dev: &device::Device) -> Result {
+ let fwnode = dev.fwnode().ok_or(ENOENT)?;
+
+ if let Ok(idx) =
+ fwnode.property_match_string(c_str!("compatible"), c_str!("test,rust-device"))
+ {
+ dev_info!(dev, "matched compatible string idx = {}\n", idx);
+ }
+
+ let name = c_str!("compatible");
+ let prop = fwnode.property_read::<CString>(name).required_by(dev)?;
+ dev_info!(dev, "'{name}'='{prop:?}'\n");
+
+ let name = c_str!("test,bool-prop");
+ let prop = fwnode.property_read_bool(name);
+ dev_info!(dev, "'{name}'='{prop}'\n");
+
+ if fwnode.property_present(c_str!("test,u32-prop")) {
+ dev_info!(dev, "'test,u32-prop' is present\n");
+ }
+
+ let name = c_str!("test,u32-optional-prop");
+ let prop = fwnode.property_read::<u32>(name).or(0x12);
+ dev_info!(dev, "'{name}'='{prop:#x}' (default = 0x12)\n");
+
+ // A missing required property will print an error. Discard the error to
+ // prevent properties_parse from failing in that case.
+ let name = c_str!("test,u32-required-prop");
+ let _ = fwnode.property_read::<u32>(name).required_by(dev);
+
+ let name = c_str!("test,u32-prop");
+ let prop: u32 = fwnode.property_read(name).required_by(dev)?;
+ dev_info!(dev, "'{name}'='{prop:#x}'\n");
+
+ let name = c_str!("test,i16-array");
+ let prop: [i16; 4] = fwnode.property_read(name).required_by(dev)?;
+ dev_info!(dev, "'{name}'='{prop:?}'\n");
+ let len = fwnode.property_count_elem::<i16>(name)?;
+ dev_info!(dev, "'{name}' length is {len}\n");
+
+ // Read the same property (`name` is unchanged) again, this time into a `KVec`.
+ let prop: KVec<i16> = fwnode.property_read_array_vec(name, 4)?.required_by(dev)?;
+ dev_info!(dev, "'{name}'='{prop:?}' (KVec)\n");
+
+ for child in fwnode.children() {
+ let name = c_str!("test,ref-arg");
+ let nargs = NArgs::N(2);
+ let prop: FwNodeReferenceArgs = child.property_get_reference_args(name, nargs, 0)?;
+ dev_info!(dev, "'{name}'='{prop:?}'\n");
+ }
+
+ Ok(())
+ }
+}
+
+impl Drop for SampleDriver {
+ fn drop(&mut self) {
+ dev_dbg!(self.pdev.as_ref(), "Remove Rust Platform driver sample.\n");
+ }
+}
+
+kernel::module_platform_driver! {
+ type: SampleDriver,
+ name: "rust_driver_platform",
+ authors: ["Danilo Krummrich"],
+ description: "Rust Platform driver",
+ license: "GPL v2",
+}
diff --git a/samples/rust/rust_driver_usb.rs b/samples/rust/rust_driver_usb.rs
new file mode 100644
index 000000000000..4eaad14867b2
--- /dev/null
+++ b/samples/rust/rust_driver_usb.rs
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+// SPDX-FileCopyrightText: Copyright (C) 2025 Collabora Ltd.
+
+//! Rust USB driver sample.
+
+use kernel::{device, device::Core, prelude::*, sync::aref::ARef, usb};
+
+struct SampleDriver {
+ _intf: ARef<usb::Interface>,
+}
+
+kernel::usb_device_table!(
+ USB_TABLE,
+ MODULE_USB_TABLE,
+ <SampleDriver as usb::Driver>::IdInfo,
+ [(usb::DeviceId::from_id(0x1234, 0x5678), ()),]
+);
+
+impl usb::Driver for SampleDriver {
+ type IdInfo = ();
+ const ID_TABLE: usb::IdTable<Self::IdInfo> = &USB_TABLE;
+
+ fn probe(
+ intf: &usb::Interface<Core>,
+ _id: &usb::DeviceId,
+ _info: &Self::IdInfo,
+ ) -> impl PinInit<Self, Error> {
+ let dev: &device::Device<Core> = intf.as_ref();
+ dev_info!(dev, "Rust USB driver sample probed\n");
+
+ Ok(Self { _intf: intf.into() })
+ }
+
+ fn disconnect(intf: &usb::Interface<Core>, _data: Pin<&Self>) {
+ let dev: &device::Device<Core> = intf.as_ref();
+ dev_info!(dev, "Rust USB driver sample disconnected\n");
+ }
+}
+
+kernel::module_usb_driver! {
+ type: SampleDriver,
+ name: "rust_driver_usb",
+ authors: ["Daniel Almeida"],
+ description: "Rust USB driver sample",
+ license: "GPL v2",
+}
diff --git a/samples/rust/rust_i2c_client.rs b/samples/rust/rust_i2c_client.rs
new file mode 100644
index 000000000000..f67938396dce
--- /dev/null
+++ b/samples/rust/rust_i2c_client.rs
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust I2C client registration sample.
+//!
+//! An I2C client in Rust cannot exist on its own. To register a new I2C client,
+//! it must be bound to a parent device. In this sample driver, a platform device
+//! is used as the parent.
+//!
+
+//! ACPI match table test
+//!
+//! This demonstrates how to test an ACPI-based Rust I2C client registration driver
+//! using QEMU with a custom SSDT.
+//!
+//! Steps:
+//!
+//! 1. **Create an SSDT source file** (`ssdt.dsl`) with the following content:
+//!
+//! ```asl
+//! DefinitionBlock ("", "SSDT", 2, "TEST", "VIRTACPI", 0x00000001)
+//! {
+//! Scope (\_SB)
+//! {
+//! Device (T432)
+//! {
+//! Name (_HID, "LNUXBEEF") // ACPI hardware ID to match
+//! Name (_UID, 1)
+//! Name (_STA, 0x0F) // Device present, enabled
+//! Name (_CRS, ResourceTemplate ()
+//! {
+//! Memory32Fixed (ReadWrite, 0xFED00000, 0x1000)
+//! })
+//! }
+//! }
+//! }
+//! ```
+//!
+//! 2. **Compile the table**:
+//!
+//! ```sh
+//! iasl -tc ssdt.dsl
+//! ```
+//!
+//! This generates `ssdt.aml`
+//!
+//! 3. **Run QEMU** with the compiled AML file:
+//!
+//! ```sh
+//! qemu-system-x86_64 -m 512M \
+//! -enable-kvm \
+//! -kernel path/to/bzImage \
+//! -append "root=/dev/sda console=ttyS0" \
+//! -hda rootfs.img \
+//! -serial stdio \
+//! -acpitable file=ssdt.aml
+//! ```
+//!
+//! Requirements:
+//! - The `rust_i2c_client` module must be present either:
+//! - built directly into the kernel (`bzImage`), or
+//! - available as a `.ko` file and loadable from `rootfs.img`
+//!
+//! 4. **Verify it worked** by checking `dmesg`:
+//!
+//! ```
+//! rust_device_i2c LNUXBEEF:00: Probe Rust I2C Client registration sample.
+//! ```
+//!
+
+use kernel::{
+ acpi,
+ c_str,
+ device,
+ devres::Devres,
+ i2c,
+ of,
+ platform,
+ prelude::*,
+ sync::aref::ARef, //
+};
+
+#[pin_data]
+struct SampleDriver {
+ parent_dev: ARef<platform::Device>,
+ #[pin]
+ _reg: Devres<i2c::Registration>,
+}
+
+kernel::of_device_table!(
+ OF_TABLE,
+ MODULE_OF_TABLE,
+ <SampleDriver as platform::Driver>::IdInfo,
+ [(of::DeviceId::new(c_str!("test,rust-device")), ())]
+);
+
+kernel::acpi_device_table!(
+ ACPI_TABLE,
+ MODULE_ACPI_TABLE,
+ <SampleDriver as platform::Driver>::IdInfo,
+ [(acpi::DeviceId::new(c_str!("LNUXBEEF")), ())]
+);
+
+const SAMPLE_I2C_CLIENT_ADDR: u16 = 0x30;
+const SAMPLE_I2C_ADAPTER_INDEX: i32 = 0;
+const BOARD_INFO: i2c::I2cBoardInfo =
+ i2c::I2cBoardInfo::new(c_str!("rust_driver_i2c"), SAMPLE_I2C_CLIENT_ADDR);
+
+impl platform::Driver for SampleDriver {
+ type IdInfo = ();
+ const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = Some(&OF_TABLE);
+ const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = Some(&ACPI_TABLE);
+
+ fn probe(
+ pdev: &platform::Device<device::Core>,
+ _info: Option<&Self::IdInfo>,
+ ) -> impl PinInit<Self, Error> {
+ dev_info!(
+ pdev.as_ref(),
+ "Probe Rust I2C Client registration sample.\n"
+ );
+
+ kernel::try_pin_init!( Self {
+ parent_dev: pdev.into(),
+
+ _reg <- {
+ let adapter = i2c::I2cAdapter::get(SAMPLE_I2C_ADAPTER_INDEX)?;
+
+ i2c::Registration::new(&adapter, &BOARD_INFO, pdev.as_ref())
+ }
+ })
+ }
+
+ fn unbind(pdev: &platform::Device<device::Core>, _this: Pin<&Self>) {
+ dev_info!(
+ pdev.as_ref(),
+ "Unbind Rust I2C Client registration sample.\n"
+ );
+ }
+}
+
+kernel::module_platform_driver! {
+ type: SampleDriver,
+ name: "rust_device_i2c",
+ authors: ["Danilo Krummrich", "Igor Korotin"],
+ description: "Rust I2C client registration",
+ license: "GPL v2",
+}
diff --git a/samples/rust/rust_minimal.rs b/samples/rust/rust_minimal.rs
new file mode 100644
index 000000000000..8eb9583571d7
--- /dev/null
+++ b/samples/rust/rust_minimal.rs
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust minimal sample.
+
+use kernel::prelude::*;
+
+module! {
+ type: RustMinimal,
+ name: "rust_minimal",
+ authors: ["Rust for Linux Contributors"],
+ description: "Rust minimal sample",
+ license: "GPL",
+ params: {
+ test_parameter: i64 {
+ default: 1,
+ description: "This parameter has a default of 1",
+ },
+ },
+}
+
+struct RustMinimal {
+ numbers: KVec<i32>,
+}
+
+impl kernel::Module for RustMinimal {
+ fn init(_module: &'static ThisModule) -> Result<Self> {
+ pr_info!("Rust minimal sample (init)\n");
+ pr_info!("Am I built-in? {}\n", !cfg!(MODULE));
+ pr_info!(
+ "test_parameter: {}\n",
+ *module_parameters::test_parameter.value()
+ );
+
+ let mut numbers = KVec::new();
+ numbers.push(72, GFP_KERNEL)?;
+ numbers.push(108, GFP_KERNEL)?;
+ numbers.push(200, GFP_KERNEL)?;
+
+ Ok(RustMinimal { numbers })
+ }
+}
+
+impl Drop for RustMinimal {
+ fn drop(&mut self) {
+ pr_info!("My numbers are {:?}\n", self.numbers);
+ pr_info!("Rust minimal sample (exit)\n");
+ }
+}
diff --git a/samples/rust/rust_misc_device.rs b/samples/rust/rust_misc_device.rs
new file mode 100644
index 000000000000..d69bc33dbd99
--- /dev/null
+++ b/samples/rust/rust_misc_device.rs
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2024 Google LLC.
+
+//! Rust misc device sample.
+//!
+//! Below is an example userspace C program that exercises this sample's functionality.
+//!
+//! ```c
+//! #include <stdio.h>
+//! #include <stdlib.h>
+//! #include <errno.h>
+//! #include <fcntl.h>
+//! #include <unistd.h>
+//! #include <sys/ioctl.h>
+//!
+//! #define RUST_MISC_DEV_FAIL _IO('|', 0)
+//! #define RUST_MISC_DEV_HELLO _IO('|', 0x80)
+//! #define RUST_MISC_DEV_GET_VALUE _IOR('|', 0x81, int)
+//! #define RUST_MISC_DEV_SET_VALUE _IOW('|', 0x82, int)
+//!
+//! int main() {
+//! int value, new_value;
+//! int fd, ret;
+//!
+//! // Open the device file
+//! printf("Opening /dev/rust-misc-device for reading and writing\n");
+//! fd = open("/dev/rust-misc-device", O_RDWR);
+//! if (fd < 0) {
+//! perror("open");
+//! return errno;
+//! }
+//!
+//! // Make call into driver to say "hello"
+//! printf("Calling Hello\n");
+//! ret = ioctl(fd, RUST_MISC_DEV_HELLO, NULL);
+//! if (ret < 0) {
+//! perror("ioctl: Failed to call into Hello");
+//! close(fd);
+//! return errno;
+//! }
+//!
+//! // Get initial value
+//! printf("Fetching initial value\n");
+//! ret = ioctl(fd, RUST_MISC_DEV_GET_VALUE, &value);
+//! if (ret < 0) {
+//! perror("ioctl: Failed to fetch the initial value");
+//! close(fd);
+//! return errno;
+//! }
+//!
+//! value++;
+//!
+//! // Set value to something different
+//! printf("Submitting new value (%d)\n", value);
+//! ret = ioctl(fd, RUST_MISC_DEV_SET_VALUE, &value);
+//! if (ret < 0) {
+//! perror("ioctl: Failed to submit new value");
+//! close(fd);
+//! return errno;
+//! }
+//!
+//! // Ensure new value was applied
+//! printf("Fetching new value\n");
+//! ret = ioctl(fd, RUST_MISC_DEV_GET_VALUE, &new_value);
+//! if (ret < 0) {
+//! perror("ioctl: Failed to fetch the new value");
+//! close(fd);
+//! return errno;
+//! }
+//!
+//! if (value != new_value) {
+//! printf("Failed: Committed and retrieved values are different (%d - %d)\n", value, new_value);
+//! close(fd);
+//! return -1;
+//! }
+//!
+//! // Call the unsuccessful ioctl
+//! printf("Attempting to call into a non-existent IOCTL\n");
+//! ret = ioctl(fd, RUST_MISC_DEV_FAIL, NULL);
+//! if (ret < 0) {
+//! perror("ioctl: Succeeded to fail - this was expected");
+//! } else {
+//! printf("ioctl: Failed to fail\n");
+//! close(fd);
+//! return -1;
+//! }
+//!
+//! // Close the device file
+//! printf("Closing /dev/rust-misc-device\n");
+//! close(fd);
+//!
+//! printf("Success\n");
+//! return 0;
+//! }
+//! ```
+
+use core::pin::Pin;
+
+use kernel::{
+ c_str,
+ device::Device,
+ fs::{File, Kiocb},
+ ioctl::{_IO, _IOC_SIZE, _IOR, _IOW},
+ iov::{IovIterDest, IovIterSource},
+ miscdevice::{MiscDevice, MiscDeviceOptions, MiscDeviceRegistration},
+ new_mutex,
+ prelude::*,
+ sync::{aref::ARef, Mutex},
+ uaccess::{UserSlice, UserSliceReader, UserSliceWriter},
+};
+
+const RUST_MISC_DEV_HELLO: u32 = _IO('|' as u32, 0x80);
+const RUST_MISC_DEV_GET_VALUE: u32 = _IOR::<i32>('|' as u32, 0x81);
+const RUST_MISC_DEV_SET_VALUE: u32 = _IOW::<i32>('|' as u32, 0x82);
+
+module! {
+ type: RustMiscDeviceModule,
+ name: "rust_misc_device",
+ authors: ["Lee Jones"],
+ description: "Rust misc device sample",
+ license: "GPL",
+}
+
+#[pin_data]
+struct RustMiscDeviceModule {
+ #[pin]
+ _miscdev: MiscDeviceRegistration<RustMiscDevice>,
+}
+
+impl kernel::InPlaceModule for RustMiscDeviceModule {
+ fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> {
+ pr_info!("Initialising Rust Misc Device Sample\n");
+
+ let options = MiscDeviceOptions {
+ name: c_str!("rust-misc-device"),
+ };
+
+ try_pin_init!(Self {
+ _miscdev <- MiscDeviceRegistration::register(options),
+ })
+ }
+}
+
+struct Inner {
+ value: i32,
+ buffer: KVVec<u8>,
+}
+
+#[pin_data(PinnedDrop)]
+struct RustMiscDevice {
+ #[pin]
+ inner: Mutex<Inner>,
+ dev: ARef<Device>,
+}
+
+#[vtable]
+impl MiscDevice for RustMiscDevice {
+ type Ptr = Pin<KBox<Self>>;
+
+ fn open(_file: &File, misc: &MiscDeviceRegistration<Self>) -> Result<Pin<KBox<Self>>> {
+ let dev = ARef::from(misc.device());
+
+ dev_info!(dev, "Opening Rust Misc Device Sample\n");
+
+ KBox::try_pin_init(
+ try_pin_init! {
+ RustMiscDevice {
+ inner <- new_mutex!(Inner {
+ value: 0_i32,
+ buffer: KVVec::new(),
+ }),
+ dev: dev,
+ }
+ },
+ GFP_KERNEL,
+ )
+ }
+
+ fn read_iter(mut kiocb: Kiocb<'_, Self::Ptr>, iov: &mut IovIterDest<'_>) -> Result<usize> {
+ let me = kiocb.file();
+ dev_info!(me.dev, "Reading from Rust Misc Device Sample\n");
+
+ let inner = me.inner.lock();
+ // Copy the buffer contents out to `iov`, taking the file position into account.
+ let read = iov.simple_read_from_buffer(kiocb.ki_pos_mut(), &inner.buffer)?;
+
+ Ok(read)
+ }
+
+ fn write_iter(mut kiocb: Kiocb<'_, Self::Ptr>, iov: &mut IovIterSource<'_>) -> Result<usize> {
+ let me = kiocb.file();
+ dev_info!(me.dev, "Writing to Rust Misc Device Sample\n");
+
+ let mut inner = me.inner.lock();
+
+ // Replace the buffer contents: drop the old data, then copy in what the caller wrote.
+ inner.buffer.clear();
+ let len = iov.copy_from_iter_vec(&mut inner.buffer, GFP_KERNEL)?;
+
+ // Set position to zero so that future `read` calls will see the new contents.
+ *kiocb.ki_pos_mut() = 0;
+
+ Ok(len)
+ }
+
+ fn ioctl(me: Pin<&RustMiscDevice>, _file: &File, cmd: u32, arg: usize) -> Result<isize> {
+ dev_info!(me.dev, "IOCTLing Rust Misc Device Sample\n");
+
+ // Treat the ioctl argument as a user pointer; the transfer size is encoded in `cmd`.
+ let arg = UserPtr::from_addr(arg);
+ let size = _IOC_SIZE(cmd);
+
+ match cmd {
+ RUST_MISC_DEV_GET_VALUE => me.get_value(UserSlice::new(arg, size).writer())?,
+ RUST_MISC_DEV_SET_VALUE => me.set_value(UserSlice::new(arg, size).reader())?,
+ RUST_MISC_DEV_HELLO => me.hello()?,
+ _ => {
+ dev_err!(me.dev, "-> IOCTL not recognised: {}\n", cmd);
+ return Err(ENOTTY);
+ }
+ };
+
+ Ok(0)
+ }
+}
+
+#[pinned_drop]
+impl PinnedDrop for RustMiscDevice {
+ fn drop(self: Pin<&mut Self>) {
+ dev_info!(self.dev, "Exiting the Rust Misc Device Sample\n");
+ }
+}
+
+impl RustMiscDevice {
+ fn set_value(&self, mut reader: UserSliceReader) -> Result<isize> {
+ let new_value = reader.read::<i32>()?;
+ let mut guard = self.inner.lock();
+
+ dev_info!(
+ self.dev,
+ "-> Copying data from userspace (value: {})\n",
+ new_value
+ );
+
+ guard.value = new_value;
+ Ok(0)
+ }
+
+ fn get_value(&self, mut writer: UserSliceWriter) -> Result<isize> {
+ let guard = self.inner.lock();
+ let value = guard.value;
+
+ // Release the lock; only the local copy of the value is used from here on.
+ drop(guard);
+
+ dev_info!(
+ self.dev,
+ "-> Copying data to userspace (value: {})\n",
+ &value
+ );
+
+ writer.write::<i32>(&value)?;
+ Ok(0)
+ }
+
+ fn hello(&self) -> Result<isize> {
+ dev_info!(self.dev, "-> Hello from the Rust Misc Device\n");
+
+ Ok(0)
+ }
+}
diff --git a/samples/rust/rust_print_events.c b/samples/rust/rust_print_events.c
new file mode 100644
index 000000000000..a9169ff0edf1
--- /dev/null
+++ b/samples/rust/rust_print_events.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Google LLC
+ */
+
+#define CREATE_TRACE_POINTS
+#define CREATE_RUST_TRACE_POINTS
+#include <trace/events/rust_sample.h>
diff --git a/samples/rust/rust_print_main.rs b/samples/rust/rust_print_main.rs
new file mode 100644
index 000000000000..4095c72afeab
--- /dev/null
+++ b/samples/rust/rust_print_main.rs
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust printing macros sample.
+
+use kernel::pr_cont;
+use kernel::prelude::*;
+
+module! {
+ type: RustPrint,
+ name: "rust_print",
+ authors: ["Rust for Linux Contributors"],
+ description: "Rust printing macros sample",
+ license: "GPL",
+}
+
+struct RustPrint;
+
+#[expect(clippy::disallowed_macros)]
+fn arc_print() -> Result {
+ use kernel::sync::*;
+
+ let a = Arc::new(1, GFP_KERNEL)?;
+ let b = UniqueArc::new("hello, world", GFP_KERNEL)?;
+
+ // Prints the value of data in `a`.
+ pr_info!("{}", a);
+
+ // Uses ":?" to print debug fmt of `b`.
+ pr_info!("{:?}", b);
+
+ let a: Arc<&str> = b.into();
+ let c = a.clone();
+
+ // Uses `dbg` to print, will move `c` (for temporary debugging purposes).
+ dbg!(c);
+
+ {
+ // `Arc` can be used to delegate dynamic dispatch and the following is an example.
+ // Both `i32` and `&str` implement `Display`. This enables us to express a unified
+ // behaviour, contract or protocol on both `i32` and `&str` into a single `Arc` of
+ // type `Arc<dyn Display>`.
+
+ use kernel::fmt::Display;
+ fn arc_dyn_print(arc: &Arc<dyn Display>) {
+ pr_info!("Arc<dyn Display> says {arc}");
+ }
+
+ let a_i32_display: Arc<dyn Display> = Arc::new(42i32, GFP_KERNEL)?;
+ let a_str_display: Arc<dyn Display> = a.clone();
+
+ arc_dyn_print(&a_i32_display);
+ arc_dyn_print(&a_str_display);
+ }
+
+ // Pretty-prints the debug formatting with lower-case hexadecimal integers.
+ pr_info!("{:#x?}", a);
+
+ Ok(())
+}
+
+impl kernel::Module for RustPrint {
+ fn init(_module: &'static ThisModule) -> Result<Self> {
+ pr_info!("Rust printing macros sample (init)\n");
+
+ pr_emerg!("Emergency message (level 0) without args\n");
+ pr_alert!("Alert message (level 1) without args\n");
+ pr_crit!("Critical message (level 2) without args\n");
+ pr_err!("Error message (level 3) without args\n");
+ pr_warn!("Warning message (level 4) without args\n");
+ pr_notice!("Notice message (level 5) without args\n");
+ pr_info!("Info message (level 6) without args\n");
+
+ pr_info!("A line that");
+ pr_cont!(" is continued");
+ pr_cont!(" without args\n");
+
+ pr_emerg!("{} message (level {}) with args\n", "Emergency", 0);
+ pr_alert!("{} message (level {}) with args\n", "Alert", 1);
+ pr_crit!("{} message (level {}) with args\n", "Critical", 2);
+ pr_err!("{} message (level {}) with args\n", "Error", 3);
+ pr_warn!("{} message (level {}) with args\n", "Warning", 4);
+ pr_notice!("{} message (level {}) with args\n", "Notice", 5);
+ pr_info!("{} message (level {}) with args\n", "Info", 6);
+
+ pr_info!("A {} that", "line");
+ pr_cont!(" is {}", "continued");
+ pr_cont!(" with {}\n", "args");
+
+ arc_print()?;
+
+ trace::trace_rust_sample_loaded(42);
+
+ Ok(RustPrint)
+ }
+}
+
+impl Drop for RustPrint {
+ fn drop(&mut self) {
+ pr_info!("Rust printing macros sample (exit)\n");
+ }
+}
+
+mod trace {
+ use kernel::ffi::c_int;
+
+ kernel::declare_trace! {
+ /// # Safety
+ ///
+ /// Always safe to call.
+ unsafe fn rust_sample_loaded(magic: c_int);
+ }
+
+ pub(crate) fn trace_rust_sample_loaded(magic: i32) {
+ // SAFETY: Always safe to call.
+ unsafe { rust_sample_loaded(magic as c_int) }
+ }
+}
diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore
index 78fb78184291..a6df0da77c5d 100644
--- a/samples/seccomp/.gitignore
+++ b/samples/seccomp/.gitignore
@@ -1,3 +1,5 @@
-bpf-direct
-bpf-fancy
-dropper
+# SPDX-License-Identifier: GPL-2.0-only
+/bpf-direct
+/bpf-fancy
+/dropper
+/user-trap
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
index 7203e66dcd6f..c85ae0ed8342 100644
--- a/samples/seccomp/Makefile
+++ b/samples/seccomp/Makefile
@@ -1,42 +1,6 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += bpf-fancy dropper bpf-direct user-trap
-hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct
-
-HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
-HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include
-HOSTCFLAGS_bpf-helper.o += -I$(objtree)/usr/include
-HOSTCFLAGS_bpf-helper.o += -idirafter $(objtree)/include
bpf-fancy-objs := bpf-fancy.o bpf-helper.o
-HOSTCFLAGS_dropper.o += -I$(objtree)/usr/include
-HOSTCFLAGS_dropper.o += -idirafter $(objtree)/include
-dropper-objs := dropper.o
-
-HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include
-HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
-bpf-direct-objs := bpf-direct.o
-
-# Try to match the kernel target.
-ifndef CONFIG_64BIT
-ifndef CROSS_COMPILE
-
-# s390 has -m31 flag to build 31 bit binaries
-ifndef CONFIG_S390
-MFLAG = -m32
-else
-MFLAG = -m31
-endif
-
-HOSTCFLAGS_bpf-direct.o += $(MFLAG)
-HOSTCFLAGS_dropper.o += $(MFLAG)
-HOSTCFLAGS_bpf-helper.o += $(MFLAG)
-HOSTCFLAGS_bpf-fancy.o += $(MFLAG)
-HOSTLOADLIBES_bpf-direct += $(MFLAG)
-HOSTLOADLIBES_bpf-fancy += $(MFLAG)
-HOSTLOADLIBES_dropper += $(MFLAG)
-endif
-endif
-
-# Tell kbuild to always build the programs
-always := $(hostprogs-y)
+userccflags += -I usr/include
diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c
index 151ec3f52189..c09e4a17ac1a 100644
--- a/samples/seccomp/bpf-direct.c
+++ b/samples/seccomp/bpf-direct.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros
*
diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c
index 8eb483aaec46..1ccb435025b6 100644
--- a/samples/seccomp/bpf-fancy.c
+++ b/samples/seccomp/bpf-fancy.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Seccomp BPF example using a macro-based generator.
*
@@ -25,7 +26,9 @@
int main(int argc, char **argv)
{
- struct bpf_labels l;
+ struct bpf_labels l = {
+ .count = 0,
+ };
static const char msg1[] = "Please type something: ";
static const char msg2[] = "You typed: ";
char buf[256];
diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c
index 579cfe331886..ae260d77a868 100644
--- a/samples/seccomp/bpf-helper.c
+++ b/samples/seccomp/bpf-helper.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Seccomp BPF helper functions
*
@@ -10,6 +11,7 @@
*/
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include "bpf-helper.h"
@@ -17,41 +19,41 @@
int bpf_resolve_jumps(struct bpf_labels *labels,
struct sock_filter *filter, size_t count)
{
- struct sock_filter *begin = filter;
- __u8 insn = count - 1;
+ size_t i;
- if (count < 1)
+ if (count < 1 || count > BPF_MAXINSNS)
return -1;
/*
* Walk it once, backwards, to build the label table and do fixups.
* Since backward jumps are disallowed by BPF, this is easy.
*/
- filter += insn;
- for (; filter >= begin; --insn, --filter) {
- if (filter->code != (BPF_JMP+BPF_JA))
+ for (i = 0; i < count; ++i) {
+ size_t offset = count - i - 1;
+ struct sock_filter *instr = &filter[offset];
+ if (instr->code != (BPF_JMP+BPF_JA))
continue;
- switch ((filter->jt<<8)|filter->jf) {
+ switch ((instr->jt<<8)|instr->jf) {
case (JUMP_JT<<8)|JUMP_JF:
- if (labels->labels[filter->k].location == 0xffffffff) {
+ if (labels->labels[instr->k].location == 0xffffffff) {
fprintf(stderr, "Unresolved label: '%s'\n",
- labels->labels[filter->k].label);
+ labels->labels[instr->k].label);
return 1;
}
- filter->k = labels->labels[filter->k].location -
- (insn + 1);
- filter->jt = 0;
- filter->jf = 0;
+ instr->k = labels->labels[instr->k].location -
+ (offset + 1);
+ instr->jt = 0;
+ instr->jf = 0;
continue;
case (LABEL_JT<<8)|LABEL_JF:
- if (labels->labels[filter->k].location != 0xffffffff) {
+ if (labels->labels[instr->k].location != 0xffffffff) {
fprintf(stderr, "Duplicate label use: '%s'\n",
- labels->labels[filter->k].label);
+ labels->labels[instr->k].label);
return 1;
}
- labels->labels[filter->k].location = insn;
- filter->k = 0; /* fall through */
- filter->jt = 0;
- filter->jf = 0;
+ labels->labels[instr->k].location = offset;
+ instr->k = 0; /* fall through */
+ instr->jt = 0;
+ instr->jf = 0;
continue;
}
}
@@ -63,6 +65,11 @@ __u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label)
{
struct __bpf_label *begin = labels->labels, *end;
int id;
+
+ if (labels->count == BPF_LABELS_MAX) {
+ fprintf(stderr, "Too many labels\n");
+ exit(1);
+ }
if (labels->count == 0) {
begin->label = label;
begin->location = 0xffffffff;
diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h
index 38ee70f3cd5b..417e48a4c4df 100644
--- a/samples/seccomp/bpf-helper.h
+++ b/samples/seccomp/bpf-helper.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Example wrapper around BPF macros.
*
@@ -61,9 +62,9 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count);
#define EXPAND(...) __VA_ARGS__
/* Ensure that we load the logically correct offset. */
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
-#elif __BYTE_ORDER == __BIG_ENDIAN
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
#else
#error "Unknown endianness"
@@ -84,10 +85,10 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count);
#elif __BITS_PER_LONG == 64
/* Ensure that we load the logically correct offset. */
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define ENDIAN(_lo, _hi) _lo, _hi
#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
-#elif __BYTE_ORDER == __BIG_ENDIAN
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define ENDIAN(_lo, _hi) _hi, _lo
#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
#endif
@@ -138,7 +139,7 @@ union arg64 {
#define ARG_32(idx) \
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx))
-/* Loads hi into A and lo in X */
+/* Loads lo into M[0] and hi into M[1] and A */
#define ARG_64(idx) \
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \
BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \
@@ -153,88 +154,107 @@ union arg64 {
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \
jt
-/* Checks the lo, then swaps to check the hi. A=lo,X=hi */
+#define JA32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
+ jt
+
+#define JGE32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
+ jt
+
+#define JGT32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
+ jt
+
+#define JLE32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
+ jt
+
+#define JLT32(value, jt) \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
+ jt
+
+/*
+ * All the JXX64 checks assume lo is saved in M[0] and hi is saved in both
+ * A and M[1]. This invariant is kept by restoring A if necessary.
+ */
#define JEQ64(lo, hi, jt) \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ /* if (lo != arg.lo) goto NOMATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \
- BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
jt, \
- BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
#define JNE64(lo, hi, jt) \
- BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 5, 0), \
- BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ /* if (hi != arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo != arg.lo) goto MATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \
- BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
jt, \
- BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
-
-#define JA32(value, jt) \
- BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
- jt
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
#define JA64(lo, hi, jt) \
+ /* if (hi & arg.hi) goto MATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \
- BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo & arg.lo) goto MATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \
- BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
jt, \
- BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
-#define JGE32(value, jt) \
- BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
- jt
-
-#define JLT32(value, jt) \
- BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
- jt
-
-/* Shortcut checking if hi > arg.hi. */
#define JGE64(lo, hi, jt) \
+ /* if (hi > arg.hi) goto MATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
- BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo >= arg.lo) goto MATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \
- BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
- jt, \
- BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
-
-#define JLT64(lo, hi, jt) \
- BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
- BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
- BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
- BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
- BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
jt, \
- BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
-#define JGT32(value, jt) \
- BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
- jt
-
-#define JLE32(value, jt) \
- BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
- jt
-
-/* Check hi > args.hi first, then do the GE checking */
#define JGT64(lo, hi, jt) \
+ /* if (hi > arg.hi) goto MATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
- BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo > arg.lo) goto MATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \
- BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
jt, \
- BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
#define JLE64(lo, hi, jt) \
- BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 6, 0), \
- BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
- BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+ /* if (hi < arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo <= arg.lo) goto MATCH; */ \
BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
- BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
+ jt, \
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
+
+#define JLT64(lo, hi, jt) \
+ /* if (hi < arg.hi) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+ /* if (hi != arg.hi) goto NOMATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+ BPF_STMT(BPF_LD+BPF_MEM, 0), \
+ /* if (lo < arg.lo) goto MATCH; */ \
+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 2, 0), \
+ BPF_STMT(BPF_LD+BPF_MEM, 1), \
jt, \
- BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+ BPF_STMT(BPF_LD+BPF_MEM, 1)
#define LOAD_SYSCALL_NR \
BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c
index c69c347c7011..4bca4b70f665 100644
--- a/samples/seccomp/dropper.c
+++ b/samples/seccomp/dropper.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Naive system call dropper built on seccomp_filter.
*
@@ -11,7 +12,6 @@
* When run, returns the specified errno for the specified
* system call number against the given architecture.
*
- * Run this one as root as PR_SET_NO_NEW_PRIVS is not called.
*/
#include <errno.h>
@@ -25,7 +25,7 @@
#include <sys/prctl.h>
#include <unistd.h>
-static int install_filter(int nr, int arch, int error)
+static int install_filter(int arch, int nr, int error)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
@@ -42,8 +42,16 @@ static int install_filter(int nr, int arch, int error)
.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
.filter = filter,
};
+ if (error == -1) {
+ struct sock_filter kill = BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL);
+ filter[4] = kill;
+ }
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("prctl(NO_NEW_PRIVS)");
+ return 1;
+ }
if (prctl(PR_SET_SECCOMP, 2, &prog)) {
- perror("prctl");
+ perror("prctl(PR_SET_SECCOMP)");
return 1;
}
return 0;
@@ -53,9 +61,10 @@ int main(int argc, char **argv)
{
if (argc < 5) {
fprintf(stderr, "Usage:\n"
- "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n"
+ "dropper <arch> <syscall_nr> <errno> <prog> [<args>]\n"
"Hint: AUDIT_ARCH_I386: 0x%X\n"
" AUDIT_ARCH_X86_64: 0x%X\n"
+ " errno == -1 means SECCOMP_RET_KILL\n"
"\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
return 1;
}
diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
new file mode 100644
index 000000000000..a23fec357b5d
--- /dev/null
+++ b/samples/seccomp/user-trap.c
@@ -0,0 +1,379 @@
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stddef.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/user.h>
+#include <sys/ioctl.h>
+#include <sys/ptrace.h>
+#include <sys/mount.h>
+#include <linux/limits.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+/*
+ * Thin wrapper that issues the raw seccomp system call through syscall(2).
+ * errno is cleared before the call; the syscall's return value is passed
+ * back unchanged.
+ */
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+ errno = 0;
+ return syscall(__NR_seccomp, op, flags, args);
+}
+
+/*
+ * Send the file descriptor @fd over the socket @sock as SCM_RIGHTS
+ * ancillary data, together with a single dummy payload byte.
+ *
+ * Returns 0 on success, or -1 (after printing the error) if sendmsg() fails.
+ */
+static int send_fd(int sock, int fd)
+{
+ struct msghdr msg = {};
+ struct cmsghdr *cmsg;
+ int *fd_ptr;
+ char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
+ struct iovec io = {
+ .iov_base = &c,
+ .iov_len = 1,
+ };
+
+ msg.msg_iov = &io;
+ msg.msg_iovlen = 1;
+ msg.msg_control = buf;
+ msg.msg_controllen = sizeof(buf);
+ /* Fill in the single control message that carries the descriptor. */
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ fd_ptr = (int *)CMSG_DATA(cmsg);
+ *fd_ptr = fd;
+ /* Trim the control length to the one message that was filled in. */
+ msg.msg_controllen = cmsg->cmsg_len;
+
+ if (sendmsg(sock, &msg, 0) < 0) {
+ perror("sendmsg");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Receive one file descriptor passed over @sock as SCM_RIGHTS ancillary
+ * data.
+ *
+ * Returns the received descriptor, or -1 if recvmsg() fails or the message
+ * that arrived does not carry a descriptor.
+ */
+static int recv_fd(int sock)
+{
+	struct msghdr msg = {};
+	struct cmsghdr *cmsg;
+	int fd;
+	char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
+	struct iovec io = {
+		.iov_base = &c,
+		.iov_len = 1,
+	};
+
+	msg.msg_iov = &io;
+	msg.msg_iovlen = 1;
+	msg.msg_control = buf;
+	msg.msg_controllen = sizeof(buf);
+
+	if (recvmsg(sock, &msg, 0) < 0) {
+		perror("recvmsg");
+		return -1;
+	}
+
+	/*
+	 * recvmsg() does not fail when the peer went away without sending
+	 * anything; in that case there is no control message at all and
+	 * CMSG_FIRSTHDR() returns NULL, so check before dereferencing.
+	 */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
+	    cmsg->cmsg_type != SCM_RIGHTS ||
+	    cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
+		fprintf(stderr, "recvmsg: no file descriptor received\n");
+		return -1;
+	}
+
+	/* CMSG_DATA() is not guaranteed to be suitably aligned for an int. */
+	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+
+	return fd;
+}
+
+/*
+ * Install a seccomp filter that returns SECCOMP_RET_USER_NOTIF for syscall
+ * number @nr and SECCOMP_RET_ALLOW for every other syscall.
+ *
+ * @flags is passed straight through to seccomp(SECCOMP_SET_MODE_FILTER, ...)
+ * and that call's return value is returned unchanged.
+ *
+ * NOTE(review): the filter only inspects seccomp_data.nr; it does not check
+ * seccomp_data.arch.
+ */
+static int user_trap_syscall(int nr, unsigned int flags)
+{
+ struct sock_filter filter[] = {
+ /* Load the syscall number into the accumulator. */
+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ /* Equal to @nr: fall through to USER_NOTIF; otherwise skip to ALLOW. */
+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ };
+
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+
+ return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
+}
+
+/*
+ * Handle one notification: fill in @resp for the request @req that was
+ * received on @listener.
+ *
+ * Only __NR_mount is expected. Bind mounts whose source and target both
+ * start with "/tmp/" are performed on the task's behalf and answered with
+ * success; every other mount is answered with -EPERM.
+ *
+ * Returns 0 when @resp is ready to be sent (whether or not the mount was
+ * allowed), or -1 on an error that prevents a response from being produced.
+ */
+static int handle_req(struct seccomp_notif *req,
+		      struct seccomp_notif_resp *resp, int listener)
+{
+	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
+	int ret = -1, mem;
+	ssize_t len;
+
+	/* Default answer: deny with EPERM unless the policy below allows it. */
+	resp->id = req->id;
+	resp->error = -EPERM;
+	resp->val = 0;
+
+	if (req->data.nr != __NR_mount) {
+		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
+		return -1;
+	}
+
+	/* Only allow bind mounts. */
+	if (!(req->data.args[3] & MS_BIND))
+		return 0;
+
+	/*
+	 * Ok, let's read the task's memory to see where they wanted their
+	 * mount to go.
+	 */
+	snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
+	mem = open(path, O_RDONLY);
+	if (mem < 0) {
+		perror("open mem");
+		return -1;
+	}
+
+	/*
+	 * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
+	 * the pid that made the syscall may have died, we need to confirm that
+	 * the pid is still valid after we open its /proc/pid/mem file. We can
+	 * ask the listener fd this as follows.
+	 *
+	 * Note that this check should occur *after* any task-specific
+	 * resources are opened, to make sure that the task has not died and
+	 * we're not wrongly reading someone else's state in order to make
+	 * decisions.
+	 */
+	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
+		fprintf(stderr, "task died before we could map its memory\n");
+		goto out;
+	}
+
+	/*
+	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
+	 * that to avoid another TOCTOU, we should read all of the pointer args
+	 * before we decide to allow the syscall.
+	 *
+	 * The bytes come from another task and cannot be trusted to contain
+	 * a NUL, so read at most size - 1 bytes and terminate the buffers
+	 * ourselves before treating them as strings. The byte count is kept
+	 * in its own variable so that ret stays -1 on every failure path.
+	 */
+	if (lseek(mem, req->data.args[0], SEEK_SET) < 0) {
+		perror("seek");
+		goto out;
+	}
+
+	len = read(mem, source, sizeof(source) - 1);
+	if (len < 0) {
+		perror("read");
+		goto out;
+	}
+	source[len] = '\0';
+
+	if (lseek(mem, req->data.args[1], SEEK_SET) < 0) {
+		perror("seek");
+		goto out;
+	}
+
+	len = read(mem, target, sizeof(target) - 1);
+	if (len < 0) {
+		perror("read");
+		goto out;
+	}
+	target[len] = '\0';
+
+	/*
+	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
+	 * interesting, because we could do unprivileged bind mounts with user
+	 * namespaces already, but you get the idea.
+	 */
+	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
+		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
+			perror("actual mount");
+			goto out;
+		}
+		resp->error = 0;
+	}
+
+	/*
+	 * Even if we didn't allow it because of policy, generating the
+	 * response was a success, because we want to tell the worker EPERM.
+	 */
+	ret = 0;
+
+out:
+	close(mem);
+	return ret;
+}
+
+/*
+ * Demo driver. Forks a worker that installs a SECCOMP_RET_USER_NOTIF filter
+ * on mount(2), drops to uid 1000 and attempts two mounts, and a tracer that
+ * services the worker's notifications through handle_req().
+ *
+ * Returns 0 if the worker exited with status 0 and cleanup succeeded,
+ * 1 otherwise.
+ */
+int main(void)
+{
+	int sk_pair[2], ret = 1, status, listener;
+	pid_t worker = 0, tracer = 0;
+
+	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
+		perror("socketpair");
+		return 1;
+	}
+
+	worker = fork();
+	if (worker < 0) {
+		perror("fork");
+		goto close_pair;
+	}
+
+	if (worker == 0) {
+		listener = user_trap_syscall(__NR_mount,
+					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+		if (listener < 0) {
+			perror("seccomp");
+			exit(1);
+		}
+
+		/*
+		 * Drop privileges. We definitely can't mount as uid 1000.
+		 */
+		if (setuid(1000) < 0) {
+			perror("setuid");
+			exit(1);
+		}
+
+		/*
+		 * Send the listener to the parent; also serves as
+		 * synchronization.
+		 */
+		if (send_fd(sk_pair[1], listener) < 0)
+			exit(1);
+		close(listener);
+
+		if (mkdir("/tmp/foo", 0755) < 0) {
+			perror("mkdir");
+			exit(1);
+		}
+
+		/*
+		 * Try a bad mount just for grins.
+		 */
+		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
+			fprintf(stderr, "huh? mounted /dev/sda?\n");
+			exit(1);
+		}
+
+		if (errno != EPERM) {
+			perror("bad error from mount");
+			exit(1);
+		}
+
+		/*
+		 * Ok, we expect this one to succeed.
+		 */
+		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
+			perror("mount");
+			exit(1);
+		}
+
+		exit(0);
+	}
+
+	/*
+	 * Get the listener from the child.
+	 */
+	listener = recv_fd(sk_pair[0]);
+	if (listener < 0)
+		goto out_kill;
+
+	/*
+	 * Fork a task to handle the requests. This isn't strictly necessary,
+	 * but it makes the particular writing of this sample easier, since we
+	 * can just wait for the tracee to exit and kill the tracer.
+	 */
+	tracer = fork();
+	if (tracer < 0) {
+		perror("fork");
+		goto out_kill;
+	}
+
+	if (tracer == 0) {
+		struct seccomp_notif *req;
+		struct seccomp_notif_resp *resp;
+		struct seccomp_notif_sizes sizes;
+
+		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
+			perror("seccomp(GET_NOTIF_SIZES)");
+			goto out_close;
+		}
+
+		req = malloc(sizes.seccomp_notif);
+		if (!req)
+			goto out_close;
+
+		resp = malloc(sizes.seccomp_notif_resp);
+		if (!resp)
+			goto out_req;
+		memset(resp, 0, sizes.seccomp_notif_resp);
+
+		while (1) {
+			memset(req, 0, sizes.seccomp_notif);
+			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
+				perror("ioctl recv");
+				goto out_resp;
+			}
+
+			if (handle_req(req, resp, listener) < 0)
+				goto out_resp;
+
+			/*
+			 * ENOENT here means that the task may have gotten a
+			 * signal and restarted the syscall. It's up to the
+			 * handler to decide what to do in this case, but for
+			 * the sample code, we just ignore it. Probably
+			 * something better should happen, like undoing the
+			 * mount, or keeping track of the args to make sure we
+			 * don't do it again.
+			 */
+			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
+			    errno != ENOENT) {
+				perror("ioctl send");
+				goto out_resp;
+			}
+		}
+out_resp:
+		free(resp);
+out_req:
+		free(req);
+out_close:
+		close(listener);
+		exit(1);
+	}
+
+	close(listener);
+
+	if (waitpid(worker, &status, 0) != worker) {
+		perror("waitpid");
+		goto out_kill;
+	}
+
+	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
+		perror("umount2");
+		goto out_kill;
+	}
+
+	/*
+	 * Leave through out_kill on failure as well, so the tracer is not
+	 * left running behind us.
+	 */
+	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
+		perror("remove");
+		goto out_kill;
+	}
+
+	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+		fprintf(stderr, "worker exited nonzero\n");
+		goto out_kill;
+	}
+
+	ret = 0;
+
+out_kill:
+	if (tracer > 0)
+		kill(tracer, SIGKILL);
+	if (worker > 0)
+		kill(worker, SIGKILL);
+
+close_pair:
+	close(sk_pair[0]);
+	close(sk_pair[1]);
+	return ret;
+}
diff --git a/samples/timers/.gitignore b/samples/timers/.gitignore
new file mode 100644
index 000000000000..cd9ff7b95383
--- /dev/null
+++ b/samples/timers/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/hpet_example
diff --git a/samples/timers/Makefile b/samples/timers/Makefile
new file mode 100644
index 000000000000..e6836cdea4e2
--- /dev/null
+++ b/samples/timers/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += hpet_example
+
+userccflags += -I usr/include
diff --git a/samples/timers/hpet_example.c b/samples/timers/hpet_example.c
new file mode 100644
index 000000000000..f1cb622f6ec0
--- /dev/null
+++ b/samples/timers/hpet_example.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <memory.h>
+#include <malloc.h>
+#include <time.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <linux/hpet.h>
+
+
+extern void hpet_open_close(int, const char **);
+extern void hpet_info(int, const char **);
+extern void hpet_poll(int, const char **);
+extern void hpet_fasync(int, const char **);
+extern void hpet_read(int, const char **);
+
+#include <sys/poll.h>
+#include <sys/ioctl.h>
+
+/*
+ * Dispatch table used by main(): maps a command name given on the command
+ * line to the function that implements it.
+ */
+struct hpet_command {
+ char *command;
+ void (*func)(int argc, const char ** argv);
+} hpet_command[] = {
+ {
+ "open-close",
+ hpet_open_close
+ },
+ {
+ "info",
+ hpet_info
+ },
+ {
+ "poll",
+ hpet_poll
+ },
+ {
+ "fasync",
+ hpet_fasync
+ },
+};
+
+/*
+ * Entry point: the first argument selects a command from the hpet_command
+ * table; the remaining arguments are handed to that command's handler.
+ *
+ * Returns 0 after running a known command, -1 if no command was given or
+ * the command is not in the table.
+ */
+int
+main(int argc, const char ** argv)
+{
+ unsigned int i;
+
+ /* Skip the program name. */
+ argc--;
+ argv++;
+
+ if (!argc) {
+ fprintf(stderr, "-hpet: requires command\n");
+ return -1;
+ }
+
+
+ for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++)
+ if (!strcmp(argv[0], hpet_command[i].command)) {
+ /* Skip the command name; the handler sees only its own arguments. */
+ argc--;
+ argv++;
+ fprintf(stderr, "-hpet: executing %s\n",
+ hpet_command[i].command);
+ hpet_command[i].func(argc, argv);
+ return 0;
+ }
+
+ fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]);
+
+ return -1;
+}
+
+/*
+ * "open-close" command: open the device named by argv[0] read-only and
+ * close it again. Expects exactly one argument; failures are reported on
+ * stderr only.
+ */
+void
+hpet_open_close(int argc, const char **argv)
+{
+ int fd;
+
+ if (argc != 1) {
+ fprintf(stderr, "hpet_open_close: device-name\n");
+ return;
+ }
+
+ fd = open(argv[0], O_RDONLY);
+ if (fd < 0)
+ fprintf(stderr, "hpet_open_close: open failed\n");
+ else
+ close(fd);
+
+ return;
+}
+
+/*
+ * "info" command: open the device named by argv[0], query it with the
+ * HPET_INFO ioctl and print the returned struct hpet_info fields to stderr.
+ * Expects exactly one argument; failures are reported on stderr only.
+ */
+void
+hpet_info(int argc, const char **argv)
+{
+ struct hpet_info info;
+ int fd;
+
+ if (argc != 1) {
+ fprintf(stderr, "hpet_info: device-name\n");
+ return;
+ }
+
+ fd = open(argv[0], O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "hpet_info: open of %s failed\n", argv[0]);
+ return;
+ }
+
+ if (ioctl(fd, HPET_INFO, &info) < 0) {
+ fprintf(stderr, "hpet_info: failed to get info\n");
+ goto out;
+ }
+
+ fprintf(stderr, "hpet_info: hi_irqfreq 0x%lx hi_flags 0x%lx ",
+ info.hi_ireqfreq, info.hi_flags);
+ fprintf(stderr, "hi_hpet %d hi_timer %d\n",
+ info.hi_hpet, info.hi_timer);
+
+out:
+ close(fd);
+ return;
+}
+
+/*
+ * "poll" command: argv is { device-name, freq, iterations }.
+ *
+ * Opens the device, sets its interrupt frequency with HPET_IRQFREQ, enables
+ * interrupts with HPET_IE_ON, then for each iteration waits in poll() for
+ * POLLIN, prints the elapsed time and revents, and reads one long from the
+ * device. Failures are reported on stderr only.
+ */
+void
+hpet_poll(int argc, const char **argv)
+{
+ unsigned long freq;
+ int iterations, i, fd;
+ struct pollfd pfd;
+ struct hpet_info info;
+ struct timeval stv, etv;
+ struct timezone tz;
+ long usec;
+
+ if (argc != 3) {
+ fprintf(stderr, "hpet_poll: device-name freq iterations\n");
+ return;
+ }
+
+ /* NOTE(review): atoi() gives no error indication for malformed numbers. */
+ freq = atoi(argv[1]);
+ iterations = atoi(argv[2]);
+
+ fd = open(argv[0], O_RDONLY);
+
+ if (fd < 0) {
+ fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]);
+ return;
+ }
+
+ if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
+ fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_INFO, &info) < 0) {
+ fprintf(stderr, "hpet_poll: failed to get info\n");
+ goto out;
+ }
+
+ fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags);
+
+ /* HPET_EPI is only issued when the device reports non-zero flags. */
+ if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
+ fprintf(stderr, "hpet_poll: HPET_EPI failed\n");
+ goto out;
+ }
+
+ if (ioctl(fd, HPET_IE_ON, 0) < 0) {
+ fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n");
+ goto out;
+ }
+
+ pfd.fd = fd;
+ pfd.events = POLLIN;
+
+ for (i = 0; i < iterations; i++) {
+ pfd.revents = 0;
+ gettimeofday(&stv, &tz);
+ /* Block with no timeout until the device becomes readable. */
+ if (poll(&pfd, 1, -1) < 0)
+ fprintf(stderr, "hpet_poll: poll failed\n");
+ else {
+ long data;
+
+ /* Elapsed wall-clock time across the poll(), in microseconds. */
+ gettimeofday(&etv, &tz);
+ usec = stv.tv_sec * 1000000 + stv.tv_usec;
+ usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec;
+
+ fprintf(stderr,
+ "hpet_poll: expired time = 0x%lx\n", usec);
+
+ fprintf(stderr, "hpet_poll: revents = 0x%x\n",
+ pfd.revents);
+
+ if (read(fd, &data, sizeof(data)) != sizeof(data)) {
+ fprintf(stderr, "hpet_poll: read failed\n");
+ }
+ else
+ fprintf(stderr, "hpet_poll: data 0x%lx\n",
+ data);
+ }
+ }
+
+out:
+ close(fd);
+ return;
+}
+
+static int hpet_sigio_count;
+
+/*
+ * SIGIO handler installed by hpet_fasync(): logs the call and counts it in
+ * hpet_sigio_count.
+ *
+ * NOTE(review): fprintf() is not on the POSIX list of async-signal-safe
+ * functions; tolerable for a sample, but not a pattern to copy.
+ */
+static void
+hpet_sigio(int val)
+{
+ fprintf(stderr, "hpet_sigio: called\n");
+ hpet_sigio_count++;
+}
+
+/*
+ * "fasync" command: argv is { device-name, freq, iterations }.
+ *
+ * Installs hpet_sigio() as the SIGIO handler, opens the device, makes this
+ * process the owner of the descriptor and turns on O_ASYNC, programs the
+ * interrupt frequency, enables interrupts, and then pause()s @iterations
+ * times, printing the number of SIGIOs seen after each wakeup.
+ *
+ * The previous SIGIO disposition is restored and the device closed on every
+ * exit path taken after the handler has been installed. Failures are
+ * reported on stderr only.
+ */
+void
+hpet_fasync(int argc, const char **argv)
+{
+	unsigned long freq;
+	int iterations, i, fd, value;
+	sig_t oldsig;
+	struct hpet_info info;
+
+	hpet_sigio_count = 0;
+	fd = -1;
+
+	if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) {
+		fprintf(stderr, "hpet_fasync: failed to set signal handler\n");
+		return;
+	}
+
+	if (argc != 3) {
+		fprintf(stderr, "hpet_fasync: device-name freq iterations\n");
+		goto out;
+	}
+
+	fd = open(argv[0], O_RDONLY);
+
+	if (fd < 0) {
+		fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]);
+		/* Go through out so the old SIGIO handler is restored. */
+		goto out;
+	}
+
+	/* fcntl() reports failure with -1, not 1. */
+	if ((fcntl(fd, F_SETOWN, getpid()) == -1) ||
+		((value = fcntl(fd, F_GETFL)) == -1) ||
+		(fcntl(fd, F_SETFL, value | O_ASYNC) == -1)) {
+		fprintf(stderr, "hpet_fasync: fcntl failed\n");
+		goto out;
+	}
+
+	freq = atoi(argv[1]);
+	iterations = atoi(argv[2]);
+
+	if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
+		fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n");
+		goto out;
+	}
+
+	if (ioctl(fd, HPET_INFO, &info) < 0) {
+		fprintf(stderr, "hpet_fasync: failed to get info\n");
+		goto out;
+	}
+
+	fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags);
+
+	/* HPET_EPI is only issued when the device reports non-zero flags. */
+	if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
+		fprintf(stderr, "hpet_fasync: HPET_EPI failed\n");
+		goto out;
+	}
+
+	if (ioctl(fd, HPET_IE_ON, 0) < 0) {
+		fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n");
+		goto out;
+	}
+
+	for (i = 0; i < iterations; i++) {
+		(void) pause();
+		fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count);
+	}
+
+out:
+	signal(SIGIO, oldsig);
+
+	if (fd >= 0)
+		close(fd);
+
+	return;
+}
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
index 0f8d92120c4e..b3808bb4cf8b 100644
--- a/samples/trace_events/Makefile
+++ b/samples/trace_events/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
# builds the trace events example kernel modules;
# then to use one (as root): insmod <module_name.ko>
@@ -10,5 +11,7 @@
# Here trace-events-sample.c does the CREATE_TRACE_POINTS.
#
CFLAGS_trace-events-sample.o := -I$(src)
+CFLAGS_trace_custom_sched.o := -I$(src)
obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
+obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_custom_sched.o
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index aabc4e970911..ecc7db237f2e 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/kthread.h>
@@ -10,12 +11,51 @@
#define CREATE_TRACE_POINTS
#include "trace-events-sample.h"
+static const char *random_strings[] = {
+ "Mother Goose",
+ "Snoopy",
+ "Gandalf",
+ "Frodo",
+ "One ring to rule them all"
+};
-static void simple_thread_func(int cnt)
+static void do_simple_thread_func(int cnt, const char *fmt, ...)
{
+ unsigned long bitmask[1] = {0xdeadbeefUL};
+ va_list va;
+ int array[6];
+ int len = cnt % 5;
+ int i;
+
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ);
- trace_foo_bar("hello", cnt);
+
+ for (i = 0; i < len; i++)
+ array[i] = i + 1;
+ array[i] = 0;
+
+ va_start(va, fmt);
+
+ /* Silly tracepoints */
+ trace_foo_bar("hello", cnt, array, random_strings[len],
+ current->cpus_ptr, fmt, &va);
+
+ va_end(va);
+
+ trace_foo_with_template_simple("HELLO", cnt);
+
+ trace_foo_bar_with_cond("Some times print", cnt);
+
+ trace_foo_with_template_cond("prints other times", cnt);
+
+ trace_foo_with_template_print("I have to be different", cnt);
+
+ trace_foo_rel_loc("Hello __rel_loc", cnt, bitmask, current->cpus_ptr);
+}
+
+static void simple_thread_func(int cnt)
+{
+ do_simple_thread_func(cnt, "iter=%d", cnt);
}
static int simple_thread(void *arg)
@@ -29,6 +69,62 @@ static int simple_thread(void *arg)
}
static struct task_struct *simple_tsk;
+static struct task_struct *simple_tsk_fn;
+
+/*
+ * One iteration of the "event-sample-fn" thread: sleep interruptibly for up
+ * to HZ jiffies, then fire the two tracepoints below with @cnt.
+ */
+static void simple_thread_func_fn(int cnt)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+
+ /* More silly tracepoints */
+ trace_foo_bar_with_fn("Look at me", cnt);
+ trace_foo_with_template_fn("Look at me too", cnt);
+}
+
+/*
+ * Thread function started by foo_bar_reg(): keeps calling
+ * simple_thread_func_fn() with an incrementing counter until
+ * kthread_should_stop() reports true. @arg is unused. Always returns 0.
+ */
+static int simple_thread_fn(void *arg)
+{
+ int cnt = 0;
+
+ while (!kthread_should_stop())
+ simple_thread_func_fn(cnt++);
+
+ return 0;
+}
+
+static DEFINE_MUTEX(thread_mutex);
+static int simple_thread_cnt;
+
+/*
+ * Registration hook: the first caller starts the "event-sample-fn" thread;
+ * later callers only bump simple_thread_cnt.
+ *
+ * Returns 0 on success, or the negative errno from kthread_run() if the
+ * thread could not be started. In that case the use count is rolled back
+ * and simple_tsk_fn is left NULL, so nothing later mistakes the error
+ * pointer for a running thread.
+ */
+int foo_bar_reg(void)
+{
+	int ret = 0;
+
+	mutex_lock(&thread_mutex);
+	if (simple_thread_cnt++)
+		goto out;
+
+	pr_info("Starting thread for foo_bar_fn\n");
+	/*
+	 * We shouldn't be able to start a trace when the module is
+	 * unloading (there's other locks to prevent that). But
+	 * for consistency's sake, we still take the thread_mutex.
+	 */
+	simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn");
+	if (IS_ERR(simple_tsk_fn)) {
+		/* An ERR_PTR is non-NULL; never hand it to kthread_stop(). */
+		ret = PTR_ERR(simple_tsk_fn);
+		simple_tsk_fn = NULL;
+		simple_thread_cnt--;
+	}
+ out:
+	mutex_unlock(&thread_mutex);
+	return ret;
+}
+
+/*
+ * Unregistration hook, the counterpart of foo_bar_reg(): drops one use of
+ * the "event-sample-fn" thread and stops the thread when the use count
+ * reaches zero. Serialized by thread_mutex.
+ */
+void foo_bar_unreg(void)
+{
+ mutex_lock(&thread_mutex);
+ if (--simple_thread_cnt)
+ goto out;
+
+ pr_info("Killing thread for foo_bar_fn\n");
+ if (simple_tsk_fn)
+ kthread_stop(simple_tsk_fn);
+ simple_tsk_fn = NULL;
+ out:
+ mutex_unlock(&thread_mutex);
+}
static int __init trace_event_init(void)
{
@@ -42,6 +138,11 @@ static int __init trace_event_init(void)
static void __exit trace_event_exit(void)
{
kthread_stop(simple_tsk);
+ mutex_lock(&thread_mutex);
+ if (simple_tsk_fn)
+ kthread_stop(simple_tsk_fn);
+ simple_tsk_fn = NULL;
+ mutex_unlock(&thread_mutex);
}
module_init(trace_event_init);
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 6af373236d73..1a05fc153353 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -1,17 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* If TRACE_SYSTEM is defined, that will be the directory created
- * in the ftrace directory under /sys/kernel/debug/tracing/events/<system>
+ * in the ftrace directory under /sys/kernel/tracing/events/<system>
*
* The define_trace.h below will also look for a file name of
* TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
- * In this case, it would look for sample.h
+ * In this case, it would look for sample-trace.h
*
* If the header name will be different than the system name
* (as in this case), then you can override the header name that
* define_trace.h will look up by defining TRACE_INCLUDE_FILE
*
* This file is called trace-events-sample.h but we want the system
- * to be called "sample". Therefore we must define the name of this
+ * to be called "sample-trace". Therefore we must define the name of this
* file:
*
* #define TRACE_INCLUDE_FILE trace-events-sample
@@ -22,7 +23,25 @@
* protection, just like TRACE_INCLUDE_FILE.
*/
#undef TRACE_SYSTEM
-#define TRACE_SYSTEM sample
+#define TRACE_SYSTEM sample-trace
+
+/*
+ * TRACE_SYSTEM is expected to be a valid C variable name (alpha-numeric
+ * and underscore), although it may start with numbers. If for some
+ * reason it is not, you need to add the following lines:
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR sample_trace
+/*
+ * But the above is only needed if TRACE_SYSTEM is not alpha-numeric
+ * and underscored. By default, TRACE_SYSTEM_VAR will be equal to
+ * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if
+ * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with
+ * only alpha-numeric and underscores.
+ *
+ * The TRACE_SYSTEM_VAR is only used internally and not visible to
+ * user space.
+ */
/*
* Notice that this file is not protected like a normal header.
@@ -54,43 +73,527 @@
* Here it is simply "foo, bar".
*
* struct: This defines the way the data will be stored in the ring buffer.
- * There are currently two types of elements. __field and __array.
- * a __field is broken up into (type, name). Where type can be any
- * type but an array.
- * For an array. there are three fields. (type, name, size). The
- * type of elements in the array, the name of the field and the size
- * of the array.
+ * The items declared here become part of a special structure
+ * called "__entry", which can be used in the fast_assign part of the
+ * TRACE_EVENT macro.
+ *
+ * Here are the currently defined types you can use:
+ *
+ * __field : Is broken up into type and name. Where type can be any
+ * primitive type (integer, long or pointer).
+ *
+ * __field(int, foo)
+ *
+ * __entry->foo = 5;
+ *
+ * __field_struct : This can be any static complex data type (struct, union
+ * but not an array). Be careful using complex types, as each
+ * event is limited in size, and copying large amounts of data
+ * into the ring buffer can slow things down.
+ *
+ * __field_struct(struct bar, foo)
+ *
+ * __entry->foo.x = y;
+ *
+ * __array: There are three fields (type, name, size). The type is the
+ * type of elements in the array, the name is the name of the array.
+ * size is the number of items in the array (not the total size).
+ *
+ * __array( char, foo, 10) is the same as saying: char foo[10];
+ *
+ * Assigning arrays can be done like any array:
+ *
+ * __entry->foo[0] = 'a';
+ *
+ * memcpy(__entry->foo, bar, 10);
+ *
+ * __dynamic_array: This is similar to array, but can vary its size from
+ * instance to instance of the tracepoint being called.
+ * Like __array, this too has three elements (type, name, size);
+ * type is the type of the element, name is the name of the array.
+ * The size is different than __array. It is not a static number,
+ * but the algorithm to figure out the length of the array for the
+ * specific instance of tracepoint. Again, size is the number of
+ * items in the array, not the total length in bytes.
+ *
+ * __dynamic_array( int, foo, bar) is similar to: int foo[bar];
+ *
+ * Note, unlike arrays, you must use the __get_dynamic_array() macro
+ * to access the array.
+ *
+ * memcpy(__get_dynamic_array(foo), bar, 10);
+ *
+ * Notice, that "__entry" is not needed here.
+ *
+ * __string: This is a special kind of __dynamic_array. It expects to
+ * have a null terminated character array passed to it (it allows
+ * for NULL too, which would be converted into "(null)"). __string
+ * takes two parameters (name, src), where name is the name of
+ * the string saved, and src is the string to copy into the
+ * ring buffer.
+ *
+ * __string(foo, bar) is similar to: strcpy(foo, bar)
+ *
+ * To assign a string, use the helper macro __assign_str().
+ *
+ * __assign_str(foo);
+ *
+ * The __string() macro saves off the string that is passed into
+ * the second parameter, and the __assign_str() will store that
+ * saved string into the "foo" field.
+ *
+ * __vstring: This is similar to __string() but instead of taking a
+ * dynamic length, it takes a variable list va_list 'va' variable.
+ * Some event callers already have a message from parameters saved
+ * in a va_list. Passing in the format and the va_list variable
+ * will save just enough on the ring buffer for that string.
+ * Note, the va variable used is a pointer to a va_list, not
+ * to the va_list directly.
+ *
+ * (va_list *va)
+ *
+ * __vstring(foo, fmt, va) is similar to: vsnprintf(foo, fmt, va)
+ *
+ * To assign the string, use the helper macro __assign_vstr().
+ *
+ * __assign_vstr(foo, fmt, va);
+ *
+ * In most cases, the __assign_vstr() macro will take the same
+ * parameters as the __vstring() macro had to declare the string.
+ * Use __get_str() to retrieve the __vstring() just like it would for
+ * __string().
+ *
+ * __string_len: This is a helper to a __dynamic_array, but it understands
+ * that the array has characters in it, it will allocate 'len' + 1 bytes
+ * in the ring buffer and add a '\0' to the string. This is
+ * useful if the string being saved has no terminating '\0' byte.
+ * It requires that the length of the string is known as it acts
+ * like a memcpy().
+ *
+ * Declared with:
+ *
+ * __string_len(foo, bar, len)
+ *
+ * To assign this string, use the helper macro __assign_str().
+ * The length is saved via the __string_len() and is retrieved in
+ * __assign_str().
+ *
+ * __assign_str(foo);
+ *
+ * Then len + 1 is allocated to the ring buffer, and a nul terminating
+ * byte is added. This is similar to:
+ *
+ * memcpy(__get_str(foo), bar, len);
+ * __get_str(foo)[len] = 0;
+ *
+ * The advantage of using this over __dynamic_array, is that it
+ * takes care of allocating the extra byte on the ring buffer
+ * for the '\0' terminating byte, and __get_str(foo) can be used
+ * in the TP_printk().
+ *
+ * __bitmask: This is another kind of __dynamic_array, but it expects
+ * an array of longs, and the number of bits to parse. It takes
+ * two parameters (name, nr_bits), where name is the name of the
+ * bitmask to save, and the nr_bits is the number of bits to record.
+ *
+ * __bitmask(target_cpus, nr_cpumask_bits)
+ *
+ * To assign a bitmask, use the __assign_bitmask() helper macro.
+ *
+ * __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits);
*
- * __array( char, foo, 10) is the same as saying char foo[10].
+ * __cpumask: This is pretty much the same as __bitmask but is specific for
+ * CPU masks. The type displayed to the user via the format files will
+ * be "cpumask_t" such that user space may deal with them differently
+ * if they choose to do so, and the number of bits is always set to nr_cpumask_bits.
+ *
+ * __cpumask(target_cpus)
+ *
+ * To assign a cpumask, use the __assign_cpumask() helper macro.
+ *
+ * __assign_cpumask(target_cpus, cpumask_bits(bar));
*
* fast_assign: This is a C like function that is used to store the items
- * into the ring buffer.
+ * into the ring buffer. A special variable called "__entry" will be the
+ * structure that points into the ring buffer and has the same fields as
+ * described by the struct part of TRACE_EVENT above.
*
* printk: This is a way to print out the data in pretty print. This is
* useful if the system crashes and you are logging via a serial line,
* the data can be printed to the console using this "printk" method.
+ * This is also used to print out the data from the trace files.
+ * Again, the __entry macro is used to access the data from the ring buffer.
+ *
+ * Note, __dynamic_array, __string, __bitmask and __cpumask require special
+ * helpers to access the data.
+ *
+ * For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo)
+ * Use __get_dynamic_array_len(foo) to get the length of the array
+ * saved. Note, __get_dynamic_array_len() returns the total allocated
+ * length of the dynamic array; __print_array() expects the second
+ * parameter to be the number of elements. To get that, the array length
+ * needs to be divided by the element size.
+ *
+ * For __string(foo, bar) use __get_str(foo)
+ *
+ * For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus)
+ *
+ * For __cpumask(target_cpus) use __get_cpumask(target_cpus)
+ *
*
* Note, that for both the assign and the printk, __entry is the handler
* to the data structure in the ring buffer, and is defined by the
* TP_STRUCT__entry.
*/
+
+/*
+ * It is OK to have helper functions in the file, but they need to be protected
+ * from being defined more than once. Remember, this file gets included more
+ * than once.
+ */
+#ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
+#define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
+static inline int __length_of(const int *list)
+{
+ int i;
+
+ if (!list)
+ return 0;
+
+ for (i = 0; list[i]; i++)
+ ;
+ return i;
+}
+
+enum {
+ TRACE_SAMPLE_FOO = 2,
+ TRACE_SAMPLE_BAR = 4,
+ TRACE_SAMPLE_ZOO = 8,
+};
+#endif
+
+/*
+ * If enums are used in the TP_printk(), their names will be shown in
+ * format files and not their values. This can cause problems with user
+ * space programs that parse the format files to know how to translate
+ * the raw binary trace output into human readable text.
+ *
+ * To help out user space programs, any enum that is used in the TP_printk()
+ * should be defined by TRACE_DEFINE_ENUM() macro. All that is needed to
+ * be done is to add this macro with the enum within it in the trace
+ * header file, and it will be converted in the output.
+ */
+
+TRACE_DEFINE_ENUM(TRACE_SAMPLE_FOO);
+TRACE_DEFINE_ENUM(TRACE_SAMPLE_BAR);
+TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO);
+
TRACE_EVENT(foo_bar,
- TP_PROTO(char *foo, int bar),
+ TP_PROTO(const char *foo, int bar, const int *lst,
+ const char *string, const struct cpumask *mask,
+ const char *fmt, va_list *va),
- TP_ARGS(foo, bar),
+ TP_ARGS(foo, bar, lst, string, mask, fmt, va),
TP_STRUCT__entry(
__array( char, foo, 10 )
__field( int, bar )
+ __dynamic_array(int, list, __length_of(lst))
+ __string( str, string )
+ __bitmask( cpus, num_possible_cpus() )
+ __cpumask( cpum )
+ __vstring( vstr, fmt, va )
+ __string_len( lstr, foo, bar / 2 < strlen(foo) ? bar / 2 : strlen(foo) )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->foo, foo, 10);
+ __entry->bar = bar;
+ memcpy(__get_dynamic_array(list), lst,
+ __length_of(lst) * sizeof(int));
+ __assign_str(str);
+ __assign_str(lstr);
+ __assign_vstr(vstr, fmt, va);
+ __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus());
+ __assign_cpumask(cpum, cpumask_bits(mask));
+ ),
+
+ TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s [%d] %*pbl",
+ __entry->foo, __entry->bar,
+
+/*
+ * Notice here the use of some helper functions. This includes:
+ *
+ * __print_symbolic( variable, { value, "string" }, ... ),
+ *
+ * The variable is tested against each value of the { } pair. If
+ * the variable matches one of the values, then it will print the
+ * string in that pair. If none are matched, it returns a string
+ * version of the number (if __entry->bar == 7 then "7" is returned).
+ */
+ __print_symbolic(__entry->bar,
+ { 0, "zero" },
+ { TRACE_SAMPLE_FOO, "TWO" },
+ { TRACE_SAMPLE_BAR, "FOUR" },
+ { TRACE_SAMPLE_ZOO, "EIGHT" },
+ { 10, "TEN" }
+ ),
+
+/*
+ * __print_flags( variable, "delim", { value, "flag" }, ... ),
+ *
+ * This is similar to __print_symbolic, except that it tests the bits
+ * of the value. If ((FLAG & variable) == FLAG) then the string is
+ * printed. If more than one flag matches, then each one that does is
+ * also printed with delim in between them.
+ * If not all bits are accounted for, then the not found bits will be
+ * added in hex format: 0x506 will show BIT2|BIT3|0x500
+ */
+ __print_flags(__entry->bar, "|",
+ { 1, "BIT1" },
+ { 2, "BIT2" },
+ { 4, "BIT3" },
+ { 8, "BIT4" }
+ ),
+/*
+ * __print_array( array, len, element_size )
+ *
+ * This prints out the array that is defined by __array in a nice format.
+ */
+ __print_array(__get_dynamic_array(list),
+ __get_dynamic_array_len(list) / sizeof(int),
+ sizeof(int)),
+
+/* A shortcut is to use __print_dynamic_array for dynamic arrays */
+
+ __print_dynamic_array(list, sizeof(int)),
+
+ __get_str(str), __get_str(lstr),
+ __get_bitmask(cpus), __get_cpumask(cpum),
+ __get_str(vstr),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array(cpus))
+);
+
+/*
+ * There may be a case where a tracepoint should only be called if
+ * some condition is set. Otherwise the tracepoint should not be called.
+ * But to do something like:
+ *
+ * if (cond)
+ * trace_foo();
+ *
+ * Would cause a little overhead when tracing is not enabled, and that
+ * overhead, even if small, is not something we want. As tracepoints
+ * use static branch (aka jump_labels), where no branch is taken to
+ * skip the tracepoint when not enabled, and a jmp is placed to jump
+ * to the tracepoint code when it is enabled, having an if statement
+ * nullifies that optimization. It would be nice to place that
+ * condition within the static branch. This is where TRACE_EVENT_CONDITION
+ * comes in.
+ *
+ * TRACE_EVENT_CONDITION() is just like TRACE_EVENT, except it adds another
+ * parameter just after args. Where TRACE_EVENT has:
+ *
+ * TRACE_EVENT(name, proto, args, struct, assign, printk)
+ *
+ * the CONDITION version has:
+ *
+ * TRACE_EVENT_CONDITION(name, proto, args, cond, struct, assign, printk)
+ *
+ * Everything is the same as TRACE_EVENT except for the new cond. Think
+ * of the cond variable as:
+ *
+ * if (cond)
+ * trace_foo_bar_with_cond();
+ *
+ * Except that the logic for the if branch is placed after the static branch.
+ * That is, the if statement that processes the condition will not be
+ * executed unless that tracepoint is enabled. Otherwise it still remains
+ * a nop.
+ */
+TRACE_EVENT_CONDITION(foo_bar_with_cond,
+
+ TP_PROTO(const char *foo, int bar),
+
+ TP_ARGS(foo, bar),
+
+ TP_CONDITION(!(bar % 10)),
+
+ TP_STRUCT__entry(
+ __string( foo, foo )
+ __field( int, bar )
+ ),
+
+ TP_fast_assign(
+ __assign_str(foo);
+ __entry->bar = bar;
+ ),
+
+ TP_printk("foo %s %d", __get_str(foo), __entry->bar)
+);
+
+int foo_bar_reg(void);
+void foo_bar_unreg(void);
+
+/*
+ * Now in the case that some function needs to be called when the
+ * tracepoint is enabled and/or when it is disabled, the
+ * TRACE_EVENT_FN() serves this purpose. This is just like TRACE_EVENT()
+ * but adds two more parameters at the end:
+ *
+ * TRACE_EVENT_FN( name, proto, args, struct, assign, printk, reg, unreg)
+ *
+ * reg and unreg are functions with the prototypes of:
+ *
+ * int reg(void) and void unreg(void)
+ *
+ * The reg function gets called before the tracepoint is enabled, and
+ * the unreg function gets called after the tracepoint is disabled.
+ *
+ * Note, reg and unreg are allowed to be NULL. If you only need to
+ * call a function before enabling, or after disabling, just set one
+ * function and pass in NULL for the other parameter.
+ */
+TRACE_EVENT_FN(foo_bar_with_fn,
+
+ TP_PROTO(const char *foo, int bar),
+
+ TP_ARGS(foo, bar),
+
+ TP_STRUCT__entry(
+ __string( foo, foo )
+ __field( int, bar )
),
TP_fast_assign(
- strncpy(__entry->foo, foo, 10);
+ __assign_str(foo);
__entry->bar = bar;
),
- TP_printk("foo %s %d", __entry->foo, __entry->bar)
+ TP_printk("foo %s %d", __get_str(foo), __entry->bar),
+
+ foo_bar_reg, foo_bar_unreg
+);
+
+/*
+ * Each TRACE_EVENT macro creates several helper functions to produce
+ * the code to add the tracepoint, create the files in the trace
+ * directory, hook it to perf, assign the values and to print out
+ * the raw data from the ring buffer. To prevent too much bloat,
+ * if there are more than one tracepoint that uses the same format
+ * for the proto, args, struct, assign and printk, and only the name
+ * is different, it is highly recommended to use the DECLARE_EVENT_CLASS
+ *
+ * DECLARE_EVENT_CLASS() macro creates most of the functions for the
+ * tracepoint. Then DEFINE_EVENT() is used to hook a tracepoint to those
+ * functions. This DEFINE_EVENT() is an instance of the class and can
+ * be enabled and disabled separately from other events (either TRACE_EVENT
+ * or other DEFINE_EVENT()s).
+ *
+ * Note, TRACE_EVENT() itself is simply defined as:
+ *
+ * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \
+ * DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \
+ * DEFINE_EVENT(name, name, proto, args)
+ *
+ * The DEFINE_EVENT() also can be declared with conditions and reg functions:
+ *
+ * DEFINE_EVENT_CONDITION(template, name, proto, args, cond);
+ * DEFINE_EVENT_FN(template, name, proto, args, reg, unreg);
+ */
+DECLARE_EVENT_CLASS(foo_template,
+
+ TP_PROTO(const char *foo, int bar),
+
+ TP_ARGS(foo, bar),
+
+ TP_STRUCT__entry(
+ __string( foo, foo )
+ __field( int, bar )
+ ),
+
+ TP_fast_assign(
+ __assign_str(foo);
+ __entry->bar = bar;
+ ),
+
+ TP_printk("foo %s %d", __get_str(foo), __entry->bar)
+);
+
+/*
+ * Here's a better way for the previous samples (except, the first
+ * example had more fields and could not be used here).
+ */
+DEFINE_EVENT(foo_template, foo_with_template_simple,
+ TP_PROTO(const char *foo, int bar),
+ TP_ARGS(foo, bar));
+
+DEFINE_EVENT_CONDITION(foo_template, foo_with_template_cond,
+ TP_PROTO(const char *foo, int bar),
+ TP_ARGS(foo, bar),
+ TP_CONDITION(!(bar % 8)));
+
+
+DEFINE_EVENT_FN(foo_template, foo_with_template_fn,
+ TP_PROTO(const char *foo, int bar),
+ TP_ARGS(foo, bar),
+ foo_bar_reg, foo_bar_unreg);
+
+/*
+ * Anytime two events share basically the same values and have
+ * the same output, use the DECLARE_EVENT_CLASS() and DEFINE_EVENT()
+ * when ever possible.
+ */
+
+/*
+ * If the event is similar to the DECLARE_EVENT_CLASS, but you need
+ * to have a different output, then use DEFINE_EVENT_PRINT() which
+ * lets you override the TP_printk() of the class.
+ */
+
+DEFINE_EVENT_PRINT(foo_template, foo_with_template_print,
+ TP_PROTO(const char *foo, int bar),
+ TP_ARGS(foo, bar),
+ TP_printk("bar %s %d", __get_str(foo), __entry->bar));
+
+/*
+ * There is yet another dynamic data attribute: __rel_loc. If you
+ * use the __rel_dynamic_array(), __rel_string() etc. macros, you
+ * can use this attribute. There is no functional difference
+ * with or without 'rel', but the encoding is a bit
+ * different. This is expected to be used with user-space events;
+ * there is no reason for a kernel event to use this, other than for
+ * testing.
+ */
+
+TRACE_EVENT(foo_rel_loc,
+
+ TP_PROTO(const char *foo, int bar, unsigned long *mask, const cpumask_t *cpus),
+
+ TP_ARGS(foo, bar, mask, cpus),
+
+ TP_STRUCT__entry(
+ __rel_string( foo, foo )
+ __field( int, bar )
+ __rel_bitmask( bitmask,
+ BITS_PER_BYTE * sizeof(unsigned long) )
+ __rel_cpumask( cpumask )
+ ),
+
+ TP_fast_assign(
+ __assign_rel_str(foo);
+ __entry->bar = bar;
+ __assign_rel_bitmask(bitmask, mask,
+ BITS_PER_BYTE * sizeof(unsigned long));
+ __assign_rel_cpumask(cpumask, cpus);
+ ),
+
+ TP_printk("foo_rel_loc %s, %d, %s, %s", __get_rel_str(foo), __entry->bar,
+ __get_rel_bitmask(bitmask),
+ __get_rel_cpumask(cpumask))
);
#endif
diff --git a/samples/trace_events/trace_custom_sched.c b/samples/trace_events/trace_custom_sched.c
new file mode 100644
index 000000000000..dd409b704b35
--- /dev/null
+++ b/samples/trace_events/trace_custom_sched.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * event tracer
+ *
+ * Copyright (C) 2022 Google Inc, Steven Rostedt <rostedt@goodmis.org>
+ */
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/trace_events.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+
+/*
+ * Must include the event header that the custom event will attach to,
+ * from the C file, and not in the custom header file.
+ */
+#include <trace/events/sched.h>
+
+/* Declare CREATE_CUSTOM_TRACE_EVENTS before including custom header */
+#define CREATE_CUSTOM_TRACE_EVENTS
+
+#include "trace_custom_sched.h"
+
+/*
+ * As the trace events are not exported to modules, the use of
+ * for_each_kernel_tracepoint() is needed to find the trace event
+ * to attach to. The fct() function below, is a callback that
+ * will be called for every event.
+ *
+ * Helper functions are created by the TRACE_CUSTOM_EVENT() macro to
+ * update the event. Those are of the form:
+ *
+ * trace_custom_event_<event>_update()
+ *
+ * Where <event> is the event to attach.
+ */
+static void fct(struct tracepoint *tp, void *priv)
+{
+ trace_custom_event_sched_switch_update(tp);
+ trace_custom_event_sched_waking_update(tp);
+}
+
+static int __init trace_sched_init(void)
+{
+ for_each_kernel_tracepoint(fct, NULL);
+ return 0;
+}
+
+static void __exit trace_sched_exit(void)
+{
+}
+
+module_init(trace_sched_init);
+module_exit(trace_sched_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("Custom scheduling events");
+MODULE_LICENSE("GPL");
diff --git a/samples/trace_events/trace_custom_sched.h b/samples/trace_events/trace_custom_sched.h
new file mode 100644
index 000000000000..951388334a3f
--- /dev/null
+++ b/samples/trace_events/trace_custom_sched.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Like the headers that use TRACE_EVENT(), the TRACE_CUSTOM_EVENT()
+ * needs a header that allows for multiple inclusions.
+ *
+ * Test for a unique name (here we have _TRACE_CUSTOM_SCHED_H),
+ * also allowing to continue if TRACE_CUSTOM_MULTI_READ is defined.
+ */
+#if !defined(_TRACE_CUSTOM_SCHED_H) || defined(TRACE_CUSTOM_MULTI_READ)
+#define _TRACE_CUSTOM_SCHED_H
+
+/* Include linux/trace_events.h for initial defines of TRACE_CUSTOM_EVENT() */
+#include <linux/trace_events.h>
+
+/*
+ * TRACE_CUSTOM_EVENT() is just like TRACE_EVENT(). The first parameter
+ * is the event name of an existing event where the TRACE_EVENT has been included
+ * in the C file before including this file.
+ */
+TRACE_CUSTOM_EVENT(sched_switch,
+
+ /*
+ * The TP_PROTO() and TP_ARGS must match the trace event
+ * that the custom event is using.
+ */
+ TP_PROTO(bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state),
+
+ TP_ARGS(preempt, prev, next, prev_state),
+
+ /*
+ * The next fields are where the customization happens.
+ * The TP_STRUCT__entry() defines what will be recorded
+ * in the ring buffer when the custom event triggers.
+ *
+ * The rest is just like the TRACE_EVENT() macro except that
+ * it uses the custom entry.
+ */
+ TP_STRUCT__entry(
+ __field( unsigned short, prev_prio )
+ __field( unsigned short, next_prio )
+ __field( pid_t, next_pid )
+ ),
+
+ TP_fast_assign(
+ __entry->prev_prio = prev->prio;
+ __entry->next_pid = next->pid;
+ __entry->next_prio = next->prio;
+ ),
+
+ TP_printk("prev_prio=%d next_pid=%d next_prio=%d",
+ __entry->prev_prio, __entry->next_pid, __entry->next_prio)
+)
+
+
+TRACE_CUSTOM_EVENT(sched_waking,
+
+ TP_PROTO(struct task_struct *p),
+
+ TP_ARGS(p),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( unsigned short, prio )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = p->pid;
+ __entry->prio = p->prio;
+ ),
+
+ TP_printk("pid=%d prio=%d", __entry->pid, __entry->prio)
+)
+#endif
+/*
+ * Just like the headers that create TRACE_EVENTs, the below must
+ * be outside the protection of the above #if block.
+ */
+
+/*
+ * It is required that the Makefile includes:
+ * CFLAGS_<c_file>.o := -I$(src)
+ */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+
+/*
+ * It is required that the TRACE_INCLUDE_FILE be the same
+ * as this file without the ".h".
+ */
+#define TRACE_INCLUDE_FILE trace_custom_sched
+#include <trace/define_custom_trace.h>
diff --git a/samples/trace_printk/Makefile b/samples/trace_printk/Makefile
new file mode 100644
index 000000000000..c0df36167d60
--- /dev/null
+++ b/samples/trace_printk/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# builds a module that calls various trace_printk routines
+# then to use one (as root): insmod <module_name.ko>
+
+# This module can also be used to test the trace_printk code.
+
+obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace-printk.o
diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c
new file mode 100644
index 000000000000..cfc159580263
--- /dev/null
+++ b/samples/trace_printk/trace-printk.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/irq_work.h>
+
+/* Must not be static to force gcc to consider these non constant */
+char *trace_printk_test_global_str =
+ "This is a dynamic string that will use trace_puts\n";
+
+char *trace_printk_test_global_str_irq =
+ "(irq) This is a dynamic string that will use trace_puts\n";
+
+char *trace_printk_test_global_str_fmt =
+ "%sThis is a %s that will use trace_printk\n";
+
+static struct irq_work irqwork;
+
+static void trace_printk_irq_work(struct irq_work *work)
+{
+ trace_printk("(irq) This is a static string that will use trace_bputs\n");
+ trace_printk(trace_printk_test_global_str_irq);
+
+ trace_printk("(irq) This is a %s that will use trace_bprintk()\n",
+ "static string");
+
+ trace_printk(trace_printk_test_global_str_fmt,
+ "(irq) ", "dynamic string");
+}
+
+static int __init trace_printk_init(void)
+{
+ init_irq_work(&irqwork, trace_printk_irq_work);
+
+ trace_printk("This is a static string that will use trace_bputs\n");
+ trace_printk(trace_printk_test_global_str);
+
+ /* Kick off printing in irq context */
+ irq_work_queue(&irqwork);
+ irq_work_sync(&irqwork);
+
+ trace_printk("This is a %s that will use trace_bprintk()\n",
+ "static string");
+
+ trace_printk(trace_printk_test_global_str_fmt, "", "dynamic string");
+
+ return 0;
+}
+
+static void __exit trace_printk_exit(void)
+{
+}
+
+module_init(trace_printk_init);
+module_exit(trace_printk_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("trace-printk");
+MODULE_LICENSE("GPL");
diff --git a/samples/tsm-mr/Makefile b/samples/tsm-mr/Makefile
new file mode 100644
index 000000000000..587c3947b3a7
--- /dev/null
+++ b/samples/tsm-mr/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_SAMPLE_TSM_MR) += tsm_mr_sample.o
diff --git a/samples/tsm-mr/tsm_mr_sample.c b/samples/tsm-mr/tsm_mr_sample.c
new file mode 100644
index 000000000000..a2c652148639
--- /dev/null
+++ b/samples/tsm-mr/tsm_mr_sample.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. */
+
+#define pr_fmt(x) KBUILD_MODNAME ": " x
+
+#include <linux/module.h>
+#include <linux/tsm-mr.h>
+#include <linux/miscdevice.h>
+#include <crypto/hash.h>
+
+static struct {
+ u8 static_mr[SHA384_DIGEST_SIZE];
+ u8 config_mr[SHA512_DIGEST_SIZE];
+ u8 rtmr0[SHA256_DIGEST_SIZE];
+ u8 rtmr1[SHA384_DIGEST_SIZE];
+ u8 report_digest[SHA512_DIGEST_SIZE];
+} sample_report = {
+ .static_mr = "static_mr",
+ .config_mr = "config_mr",
+ .rtmr0 = "rtmr0",
+ .rtmr1 = "rtmr1",
+};
+
+static int sample_report_refresh(const struct tsm_measurements *tm)
+{
+ struct crypto_shash *tfm;
+ int rc;
+
+ tfm = crypto_alloc_shash(hash_algo_name[HASH_ALGO_SHA512], 0, 0);
+ if (IS_ERR(tfm)) {
+ pr_err("crypto_alloc_shash failed: %ld\n", PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+
+ rc = crypto_shash_tfm_digest(tfm, (u8 *)&sample_report,
+ offsetof(typeof(sample_report),
+ report_digest),
+ sample_report.report_digest);
+ crypto_free_shash(tfm);
+ if (rc)
+ pr_err("crypto_shash_tfm_digest failed: %d\n", rc);
+ return rc;
+}
+
+static int sample_report_extend_mr(const struct tsm_measurements *tm,
+ const struct tsm_measurement_register *mr,
+ const u8 *data)
+{
+ SHASH_DESC_ON_STACK(desc, 0);
+ int rc;
+
+ desc->tfm = crypto_alloc_shash(hash_algo_name[mr->mr_hash], 0, 0);
+ if (IS_ERR(desc->tfm)) {
+ pr_err("crypto_alloc_shash failed: %ld\n", PTR_ERR(desc->tfm));
+ return PTR_ERR(desc->tfm);
+ }
+
+ rc = crypto_shash_init(desc);
+ if (!rc)
+ rc = crypto_shash_update(desc, mr->mr_value, mr->mr_size);
+ if (!rc)
+ rc = crypto_shash_finup(desc, data, mr->mr_size, mr->mr_value);
+ crypto_free_shash(desc->tfm);
+ if (rc)
+ pr_err("SHA calculation failed: %d\n", rc);
+ return rc;
+}
+
+#define MR_(mr, hash) .mr_value = &sample_report.mr, TSM_MR_(mr, hash)
+static const struct tsm_measurement_register sample_mrs[] = {
+ /* static MR, read-only */
+ { MR_(static_mr, SHA384) },
+ /* config MR, read-only */
+ { MR_(config_mr, SHA512) | TSM_MR_F_NOHASH },
+ /* RTMR, direct extension prohibited */
+ { MR_(rtmr0, SHA256) | TSM_MR_F_LIVE },
+ /* RTMR, direct extension allowed */
+ { MR_(rtmr1, SHA384) | TSM_MR_F_RTMR },
+ /* RTMR, crypto agile, aliased to rtmr0 and rtmr1, respectively */
+ { .mr_value = &sample_report.rtmr0,
+ TSM_MR_(rtmr_crypto_agile, SHA256) | TSM_MR_F_RTMR },
+ { .mr_value = &sample_report.rtmr1,
+ TSM_MR_(rtmr_crypto_agile, SHA384) | TSM_MR_F_RTMR },
+ /* sha512 digest of the whole structure */
+ { MR_(report_digest, SHA512) | TSM_MR_F_LIVE },
+};
+#undef MR_
+
+static struct tsm_measurements sample_tm = {
+ .mrs = sample_mrs,
+ .nr_mrs = ARRAY_SIZE(sample_mrs),
+ .refresh = sample_report_refresh,
+ .write = sample_report_extend_mr,
+};
+
+static const struct attribute_group *sample_groups[] = {
+ NULL,
+ NULL,
+};
+
+static struct miscdevice sample_misc_dev = {
+ .name = KBUILD_MODNAME,
+ .minor = MISC_DYNAMIC_MINOR,
+ .groups = sample_groups,
+};
+
+static int __init tsm_mr_sample_init(void)
+{
+ int rc;
+
+ sample_groups[0] = tsm_mr_create_attribute_group(&sample_tm);
+ if (IS_ERR(sample_groups[0]))
+ return PTR_ERR(sample_groups[0]);
+
+ rc = misc_register(&sample_misc_dev);
+ if (rc)
+ tsm_mr_free_attribute_group(sample_groups[0]);
+ return rc;
+}
+
+static void __exit tsm_mr_sample_exit(void)
+{
+ misc_deregister(&sample_misc_dev);
+ tsm_mr_free_attribute_group(sample_groups[0]);
+}
+
+module_init(tsm_mr_sample_init);
+module_exit(tsm_mr_sample_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Sample module using tsm-mr to expose emulated MRs");
diff --git a/samples/uhid/.gitignore b/samples/uhid/.gitignore
new file mode 100644
index 000000000000..0e0a5a929f5d
--- /dev/null
+++ b/samples/uhid/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/uhid-example
diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile
index c95a696560a7..0aa424ec4899 100644
--- a/samples/uhid/Makefile
+++ b/samples/uhid/Makefile
@@ -1,10 +1,4 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
+# SPDX-License-Identifier: GPL-2.0-only
+userprogs-always-y += uhid-example
-# List of programs to build
-hostprogs-y := uhid-example
-
-# Tell kbuild to always build the programs
-always := $(hostprogs-y)
-
-HOSTCFLAGS_uhid-example.o += -I$(objtree)/usr/include
+userccflags += -I usr/include
diff --git a/samples/uhid/uhid-example.c b/samples/uhid/uhid-example.c
index 03ce3c059a5e..015cb06a241e 100644
--- a/samples/uhid/uhid-example.c
+++ b/samples/uhid/uhid-example.c
@@ -1,14 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* UHID Example
*
- * Copyright (c) 2012 David Herrmann <dh.herrmann@googlemail.com>
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
*
* The code may be used by anyone for any purpose,
* and can serve as a starting point for developing
* applications using uhid.
*/
-/* UHID Example
+/*
+ * UHID Example
* This example emulates a basic 3 buttons mouse with wheel over UHID. Run this
* program as root and then use the following keys to control the mouse:
* q: Quit the application
@@ -22,6 +24,11 @@
* r: Move wheel up
* f: Move wheel down
*
+ * In addition to the 3 button mouse, 3 keyboard LEDs are also supported (LED_NUML,
+ * LED_CAPSL and LED_SCROLLL). The device doesn't generate any related keyboard
+ * events, though. You need to manually write the EV_LED/LED_XY/1 activation
+ * input event to the evdev device to see it being sent to this device.
+ *
* If uhid is not available as /dev/uhid, then you can pass a different path as
* first argument.
* If <linux/uhid.h> is not installed in /usr, then compile this with:
@@ -41,11 +48,12 @@
#include <unistd.h>
#include <linux/uhid.h>
-/* HID Report Desciptor
- * We emulate a basic 3 button mouse with wheel. This is the report-descriptor
- * as the kernel will parse it:
+/*
+ * HID Report Descriptor
+ * We emulate a basic 3 button mouse with wheel and 3 keyboard LEDs. This is
+ * the report-descriptor as the kernel will parse it:
*
- * INPUT[INPUT]
+ * INPUT(1)[INPUT]
* Field(0)
* Physical(GenericDesktop.Pointer)
* Application(GenericDesktop.Mouse)
@@ -72,6 +80,19 @@
* Report Count(3)
* Report Offset(8)
* Flags( Variable Relative )
+ * OUTPUT(2)[OUTPUT]
+ * Field(0)
+ * Application(GenericDesktop.Keyboard)
+ * Usage(3)
+ * LED.NumLock
+ * LED.CapsLock
+ * LED.ScrollLock
+ * Logical Minimum(0)
+ * Logical Maximum(1)
+ * Report Size(1)
+ * Report Count(3)
+ * Report Offset(0)
+ * Flags( Variable Absolute )
*
* This is the mapping that we expect:
* Button.0001 ---> Key.LeftBtn
@@ -80,19 +101,59 @@
* GenericDesktop.X ---> Relative.X
* GenericDesktop.Y ---> Relative.Y
* GenericDesktop.Wheel ---> Relative.Wheel
+ * LED.NumLock ---> LED.NumLock
+ * LED.CapsLock ---> LED.CapsLock
+ * LED.ScrollLock ---> LED.ScrollLock
*
* This information can be verified by reading /sys/kernel/debug/hid/<dev>/rdesc
* This file should print the same information as shown above.
*/
static unsigned char rdesc[] = {
- 0x05, 0x01, 0x09, 0x02, 0xa1, 0x01, 0x09, 0x01,
- 0xa1, 0x00, 0x05, 0x09, 0x19, 0x01, 0x29, 0x03,
- 0x15, 0x00, 0x25, 0x01, 0x95, 0x03, 0x75, 0x01,
- 0x81, 0x02, 0x95, 0x01, 0x75, 0x05, 0x81, 0x01,
- 0x05, 0x01, 0x09, 0x30, 0x09, 0x31, 0x09, 0x38,
- 0x15, 0x80, 0x25, 0x7f, 0x75, 0x08, 0x95, 0x03,
- 0x81, 0x06, 0xc0, 0xc0,
+ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
+ 0x09, 0x02, /* USAGE (Mouse) */
+ 0xa1, 0x01, /* COLLECTION (Application) */
+ 0x09, 0x01, /* USAGE (Pointer) */
+ 0xa1, 0x00, /* COLLECTION (Physical) */
+ 0x85, 0x01, /* REPORT_ID (1) */
+ 0x05, 0x09, /* USAGE_PAGE (Button) */
+ 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */
+ 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */
+ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
+ 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
+ 0x95, 0x03, /* REPORT_COUNT (3) */
+ 0x75, 0x01, /* REPORT_SIZE (1) */
+ 0x81, 0x02, /* INPUT (Data,Var,Abs) */
+ 0x95, 0x01, /* REPORT_COUNT (1) */
+ 0x75, 0x05, /* REPORT_SIZE (5) */
+ 0x81, 0x01, /* INPUT (Cnst,Ary,Abs): 5 bits of padding */
+ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
+ 0x09, 0x30, /* USAGE (X) */
+ 0x09, 0x31, /* USAGE (Y) */
+ 0x09, 0x38, /* USAGE (WHEEL) */
+ 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */
+ 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */
+ 0x75, 0x08, /* REPORT_SIZE (8) */
+ 0x95, 0x03, /* REPORT_COUNT (3) */
+ 0x81, 0x06, /* INPUT (Data,Var,Rel) */
+ 0xc0, /* END_COLLECTION */
+ 0xc0, /* END_COLLECTION */
+ 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
+ 0x09, 0x06, /* USAGE (Keyboard) */
+ 0xa1, 0x01, /* COLLECTION (Application) */
+ 0x85, 0x02, /* REPORT_ID (2) */
+ 0x05, 0x08, /* USAGE_PAGE (LEDs) */
+ 0x19, 0x01, /* USAGE_MINIMUM (Num Lock) */
+ 0x29, 0x03, /* USAGE_MAXIMUM (Scroll Lock) */
+ 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
+ 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
+ 0x95, 0x03, /* REPORT_COUNT (3) */
+ 0x75, 0x01, /* REPORT_SIZE (1) */
+ 0x91, 0x02, /* OUTPUT (Data,Var,Abs) */
+ 0x95, 0x01, /* REPORT_COUNT (1) */
+ 0x75, 0x05, /* REPORT_SIZE (5) */
+ 0x91, 0x01, /* OUTPUT (Cnst,Ary,Abs): 5 bits of padding */
+ 0xc0, /* END_COLLECTION */
};
static int uhid_write(int fd, const struct uhid_event *ev)
@@ -104,7 +165,7 @@ static int uhid_write(int fd, const struct uhid_event *ev)
fprintf(stderr, "Cannot write to uhid: %m\n");
return -errno;
} else if (ret != sizeof(*ev)) {
- fprintf(stderr, "Wrong size written to uhid: %ld != %lu\n",
+ fprintf(stderr, "Wrong size written to uhid: %zd != %zu\n",
ret, sizeof(ev));
return -EFAULT;
} else {
@@ -140,6 +201,27 @@ static void destroy(int fd)
uhid_write(fd, &ev);
}
+/*
+ * This parses raw output reports sent by the kernel to the device. A normal
+ * uhid program shouldn't do this but instead just forward the raw report.
+ * However, for documentation purposes, we try to detect LED events here and
+ * print a debug message for them. Anything that is not a 2-byte OUTPUT report
+ * with report-id 2 is silently ignored.
+ */
+static void handle_output(struct uhid_event *ev)
+{
+ /* LED messages are advertised via OUTPUT reports; ignore the rest */
+ if (ev->u.output.rtype != UHID_OUTPUT_REPORT)
+ return;
+ /* LED reports have length 2 bytes */
+ if (ev->u.output.size != 2)
+ return;
+ /* first byte is report-id which is 0x02 for LEDs in our rdesc */
+ if (ev->u.output.data[0] != 0x2)
+ return;
+
+ /* print flags payload (second byte of the report) */
+ fprintf(stderr, "LED output report received with flags %x\n",
+ ev->u.output.data[1]);
+}
+
static int event(int fd)
{
struct uhid_event ev;
@@ -154,7 +236,7 @@ static int event(int fd)
fprintf(stderr, "Cannot read uhid-cdev: %m\n");
return -errno;
} else if (ret != sizeof(ev)) {
- fprintf(stderr, "Invalid size read from uhid-dev: %ld != %lu\n",
+ fprintf(stderr, "Invalid size read from uhid-dev: %zd != %zu\n",
ret, sizeof(ev));
return -EFAULT;
}
@@ -174,6 +256,7 @@ static int event(int fd)
break;
case UHID_OUTPUT:
fprintf(stderr, "UHID_OUTPUT from uhid-dev\n");
+ handle_output(&ev);
break;
case UHID_OUTPUT_EV:
fprintf(stderr, "UHID_OUTPUT_EV from uhid-dev\n");
@@ -198,18 +281,19 @@ static int send_event(int fd)
memset(&ev, 0, sizeof(ev));
ev.type = UHID_INPUT;
- ev.u.input.size = 4;
+ ev.u.input.size = 5;
+ ev.u.input.data[0] = 0x1;
if (btn1_down)
- ev.u.input.data[0] |= 0x1;
+ ev.u.input.data[1] |= 0x1;
if (btn2_down)
- ev.u.input.data[0] |= 0x2;
+ ev.u.input.data[1] |= 0x2;
if (btn3_down)
- ev.u.input.data[0] |= 0x4;
+ ev.u.input.data[1] |= 0x4;
- ev.u.input.data[1] = abs_hor;
- ev.u.input.data[2] = abs_ver;
- ev.u.input.data[3] = wheel;
+ ev.u.input.data[2] = abs_hor;
+ ev.u.input.data[3] = abs_ver;
+ ev.u.input.data[4] = wheel;
return uhid_write(fd, &ev);
}
diff --git a/samples/user_events/Makefile b/samples/user_events/Makefile
new file mode 100644
index 000000000000..7252b589db57
--- /dev/null
+++ b/samples/user_events/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -Wl,-no-as-needed -Wall -I../../usr/include
+
+example: example.o
+example.o: example.c
diff --git a/samples/user_events/example.c b/samples/user_events/example.c
new file mode 100644
index 000000000000..28165a096697
--- /dev/null
+++ b/samples/user_events/example.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/user_events.h>
+
+const char *data_file = "/sys/kernel/tracing/user_events_data";
+int enabled = 0;
+
+/*
+ * Register a user event through the DIAG_IOCSREG ioctl on @fd.
+ *
+ * @command is handed to the kernel as the event's name/argument string and
+ * @enabled as the address of the enable word, with bit 31 named as the
+ * enable bit. On success the write index reported by the kernel is stored in
+ * *@write.
+ *
+ * Returns 0 on success, or -1 if the ioctl failed (errno is left as set by
+ * ioctl()).
+ */
+static int event_reg(int fd, const char *command, int *write, int *enabled)
+{
+	struct user_reg request = {
+		.size = sizeof(request),
+		.enable_bit = 31,
+		.enable_size = sizeof(*enabled),
+		.enable_addr = (__u64)enabled,
+		.name_args = (__u64)command,
+	};
+
+	if (ioctl(fd, DIAG_IOCSREG, &request) == -1)
+		return -1;
+
+	*write = request.write_index;
+	return 0;
+}
+
+/*
+ * Register the "test" event (one u32 field named count) and then, every time
+ * enter is pressed, write the current count to the data file if the enable
+ * word registered with the kernel is non-zero.
+ *
+ * Returns the errno of the failing call if the data file cannot be opened or
+ * the event cannot be registered, and 0 once stdin reaches end-of-file.
+ */
+int main(int argc, char **argv)
+{
+	int data_fd, write;
+	struct iovec io[2];
+	__u32 count = 0;
+
+	data_fd = open(data_file, O_RDWR);
+	if (data_fd == -1) {
+		/* perror() may clobber errno, so save it for the exit code */
+		int err = errno;
+
+		perror(data_file);
+		return err;
+	}
+
+	if (event_reg(data_fd, "test u32 count", &write, &enabled) == -1)
+		return errno;
+
+	/* Setup iovec: the write index goes first, the payload follows it */
+	io[0].iov_base = &write;
+	io[0].iov_len = sizeof(write);
+	io[1].iov_base = &count;
+	io[1].iov_len = sizeof(count);
+
+	for (;;) {
+		printf("Press enter to check status...\n");
+
+		/* Stop instead of spinning forever once stdin is exhausted */
+		if (getchar() == EOF)
+			break;
+
+		/* Check if anyone is listening */
+		if (!enabled)
+			continue;
+
+		/* Yep, trace out our data */
+		if (writev(data_fd, io, 2) == -1) {
+			perror("writev");
+			continue;
+		}
+
+		/* Increase the count, but only for events actually written */
+		count++;
+
+		printf("Something was attached, wrote data\n");
+	}
+
+	close(data_fd);
+	return 0;
+}
diff --git a/samples/v4l/Makefile b/samples/v4l/Makefile
new file mode 100644
index 000000000000..f86ab1245810
--- /dev/null
+++ b/samples/v4l/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_VIDEO_PCI_SKELETON) := v4l2-pci-skeleton.o
diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c
new file mode 100644
index 000000000000..69925d30329e
--- /dev/null
+++ b/samples/v4l/v4l2-pci-skeleton.c
@@ -0,0 +1,900 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This is a V4L2 PCI Skeleton Driver. It gives an initial skeleton source
+ * for use with other PCI drivers.
+ *
+ * This skeleton PCI driver assumes that the card has an S-Video connector as
+ * input 0 and an HDMI connector as input 1.
+ *
+ * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/videodev2.h>
+#include <linux/v4l2-dv-timings.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-dev.h>
+#include <media/v4l2-ioctl.h>
+#include <media/v4l2-dv-timings.h>
+#include <media/v4l2-ctrls.h>
+#include <media/v4l2-event.h>
+#include <media/videobuf2-v4l2.h>
+#include <media/videobuf2-dma-contig.h>
+
+MODULE_DESCRIPTION("V4L2 PCI Skeleton Driver");
+MODULE_AUTHOR("Hans Verkuil");
+MODULE_LICENSE("GPL v2");
+
+/**
+ * struct skeleton - per-device state of one skeleton capture card
+ * @pdev: the PCI device this instance belongs to
+ * @v4l2_dev: top-level v4l2 device struct
+ * @vdev: video node structure
+ * @ctrl_handler: control handler structure
+ * @lock: mutex serializing the ioctls
+ * @std: currently selected SDTV standard
+ * @timings: currently selected HDTV timings
+ * @format: currently selected pix format
+ * @input: currently selected video input (0 = SDTV, 1 = HDTV)
+ * @queue: vb2 video capture queue
+ * @qlock: spinlock protecting buf_list and sequence
+ * @buf_list: buffers queued for DMA
+ * @field: the field (TOP/BOTTOM/other) of the current buffer
+ * @sequence: frame sequence counter
+ */
+struct skeleton {
+	struct pci_dev *pdev;
+	struct v4l2_device v4l2_dev;
+	struct video_device vdev;
+	struct v4l2_ctrl_handler ctrl_handler;
+	struct mutex lock;
+	v4l2_std_id std;
+	struct v4l2_dv_timings timings;
+	struct v4l2_pix_format format;
+	unsigned int input;
+
+	struct vb2_queue queue;
+
+	spinlock_t qlock;
+	struct list_head buf_list;
+	unsigned int field;
+	unsigned int sequence;
+};
+
+struct skel_buffer {
+ struct vb2_v4l2_buffer vb; /* embedded vb2 buffer; to_skel_buffer() maps it back to this struct */
+ struct list_head list; /* links the buffer into skeleton.buf_list */
+};
+
+/* Map a vb2_v4l2_buffer back to the skel_buffer that embeds it as ->vb. */
+static inline struct skel_buffer *to_skel_buffer(struct vb2_v4l2_buffer *vbuf)
+{
+	struct skel_buffer *buf = container_of(vbuf, struct skel_buffer, vb);
+
+	return buf;
+}
+
+static const struct pci_device_id skeleton_pci_tbl[] = {
+ /* { PCI_DEVICE(PCI_VENDOR_ID_, PCI_DEVICE_ID_) }, */
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, skeleton_pci_tbl);
+
+/*
+ * HDTV: this structure has the capabilities of the HDTV receiver.
+ * It is used to constrain the huge list of possible formats based
+ * upon the hardware capabilities: here 720-1920 pixels wide, 480-1080 lines
+ * high, pixelclocks from 27 MHz to 74.25 MHz, CEA-861 timings only, both
+ * interlaced and progressive.
+ */
+static const struct v4l2_dv_timings_cap skel_timings_cap = {
+ .type = V4L2_DV_BT_656_1120,
+ /* keep this initialization for compatibility with GCC < 4.4.6 */
+ .reserved = { 0 },
+ V4L2_INIT_BT_TIMINGS(
+ 720, 1920, /* min/max width */
+ 480, 1080, /* min/max height */
+ 27000000, 74250000, /* min/max pixelclock */
+ V4L2_DV_BT_STD_CEA861, /* Supported standards */
+ /* capabilities */
+ V4L2_DV_BT_CAP_INTERLACED | V4L2_DV_BT_CAP_PROGRESSIVE
+ )
+};
+
+/*
+ * Supported SDTV standards. This does the same job as skel_timings_cap, but
+ * for standard TV formats.
+ */
+#define SKEL_TVNORMS V4L2_STD_ALL
+
+/*
+ * Interrupt handler: typically interrupts happen after a new frame has been
+ * captured. It is the job of the handler to remove the new frame from the
+ * internal list and give it back to the vb2 framework, updating the sequence
+ * counter, field and timestamp at the same time.
+ *
+ * The body is a template guarded by #ifdef TODO: until it is filled in for
+ * real hardware the handler does nothing but return IRQ_HANDLED. dev_id is
+ * the struct skeleton pointer that skeleton_probe() passes to
+ * devm_request_irq().
+ */
+static irqreturn_t skeleton_irq(int irq, void *dev_id)
+{
+#ifdef TODO
+ struct skeleton *skel = dev_id;
+
+ /* handle interrupt */
+
+ /* Once a new frame has been captured, mark it as done like this: */
+ if (captured_new_frame) {
+ ...
+ spin_lock(&skel->qlock);
+ list_del(&new_buf->list);
+ spin_unlock(&skel->qlock);
+ new_buf->vb.vb2_buf.timestamp = ktime_get_ns();
+ new_buf->vb.sequence = skel->sequence++;
+ new_buf->vb.field = skel->field;
+ if (skel->format.field == V4L2_FIELD_ALTERNATE) { /* flip TOP <-> BOTTOM for the next buffer */
+ if (skel->field == V4L2_FIELD_BOTTOM)
+ skel->field = V4L2_FIELD_TOP;
+ else if (skel->field == V4L2_FIELD_TOP)
+ skel->field = V4L2_FIELD_BOTTOM;
+ }
+ vb2_buffer_done(&new_buf->vb.vb2_buf, VB2_BUF_STATE_DONE);
+ }
+#endif
+ return IRQ_HANDLED;
+}
+
+/*
+ * Setup the constraints of the queue: besides setting the number of planes
+ * per buffer and the size and allocation context of each plane, it also
+ * checks if sufficient buffers have been allocated. Usually 3 is a good
+ * minimum number: many DMA engines need a minimum of 2 buffers in the
+ * queue and you need to have another available for userspace processing.
+ *
+ * Returns 0 on success, or -EINVAL if read() I/O is active while the current
+ * format uses FIELD_ALTERNATE, or if a plane size supplied by the caller is
+ * smaller than the current sizeimage.
+ */
+static int queue_setup(struct vb2_queue *vq,
+ unsigned int *nbuffers, unsigned int *nplanes,
+ unsigned int sizes[], struct device *alloc_devs[])
+{
+ struct skeleton *skel = vb2_get_drv_priv(vq);
+ unsigned int q_num_bufs = vb2_get_num_buffers(vq);
+
+ skel->field = skel->format.field;
+ if (skel->field == V4L2_FIELD_ALTERNATE) {
+ /*
+ * You cannot use read() with FIELD_ALTERNATE since the field
+ * information (TOP/BOTTOM) cannot be passed back to the user.
+ */
+ if (vb2_fileio_is_active(vq))
+ return -EINVAL;
+ skel->field = V4L2_FIELD_TOP; /* alternating capture starts with the top field */
+ }
+
+ if (q_num_bufs + *nbuffers < 3) /* raise the request so the queue ends up with 3 buffers */
+ *nbuffers = 3 - q_num_bufs;
+
+ if (*nplanes) /* plane count already set by the caller: only validate the size */
+ return sizes[0] < skel->format.sizeimage ? -EINVAL : 0;
+ *nplanes = 1;
+ sizes[0] = skel->format.sizeimage;
+ return 0;
+}
+
+/*
+ * Prepare the buffer for queueing to the DMA engine: check and set the
+ * payload size.
+ *
+ * Returns 0 on success, or -EINVAL (after logging an error) if plane 0 of
+ * the buffer is smaller than the sizeimage of the current format.
+ */
+static int buffer_prepare(struct vb2_buffer *vb)
+{
+ struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue);
+ unsigned long size = skel->format.sizeimage;
+
+ if (vb2_plane_size(vb, 0) < size) {
+ dev_err(&skel->pdev->dev, "buffer too small (%lu < %lu)\n",
+ vb2_plane_size(vb, 0), size);
+ return -EINVAL;
+ }
+
+ vb2_set_plane_payload(vb, 0, size); /* the payload is always one full image */
+ return 0;
+}
+
+/*
+ * Queue this buffer to the DMA engine: the buffer is appended to the tail of
+ * skel->buf_list while holding qlock with interrupts disabled.
+ */
+static void buffer_queue(struct vb2_buffer *vb)
+{
+ struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
+ struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue);
+ struct skel_buffer *buf = to_skel_buffer(vbuf);
+ unsigned long flags;
+
+ spin_lock_irqsave(&skel->qlock, flags);
+ list_add_tail(&buf->list, &skel->buf_list);
+
+ /* TODO: Update any DMA pointers if necessary */
+
+ spin_unlock_irqrestore(&skel->qlock, flags);
+}
+
+/*
+ * Hand every buffer that is still on skel->buf_list back to the vb2
+ * framework in the given state and take it off the list. The whole walk
+ * runs under qlock with interrupts disabled.
+ */
+static void return_all_buffers(struct skeleton *skel,
+			       enum vb2_buffer_state state)
+{
+	struct skel_buffer *cur, *next;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&skel->qlock, irq_flags);
+	list_for_each_entry_safe(cur, next, &skel->buf_list, list) {
+		vb2_buffer_done(&cur->vb.vb2_buf, state);
+		list_del(&cur->list);
+	}
+	spin_unlock_irqrestore(&skel->qlock, irq_flags);
+}
+
+/*
+ * Start streaming. No buffer-count check is done here: skeleton_probe() sets
+ * q->min_queued_buffers to 2 so that this op is not called until at least
+ * that many buffers are queued up. The sequence counter restarts at 0.
+ *
+ * Returns 0 on success. If starting the DMA engine fails, all queued buffers
+ * are handed back to vb2 in the QUEUED state and the error is returned.
+ */
+static int start_streaming(struct vb2_queue *vq, unsigned int count)
+{
+ struct skeleton *skel = vb2_get_drv_priv(vq);
+ int ret = 0;
+
+ skel->sequence = 0;
+
+ /* TODO: start DMA */
+
+ if (ret) {
+ /*
+ * In case of an error, return all active buffers to the
+ * QUEUED state
+ */
+ return_all_buffers(skel, VB2_BUF_STATE_QUEUED);
+ }
+ return ret;
+}
+
+/*
+ * Stop the DMA engine. Any remaining buffers in the DMA queue are dequeued
+ * and passed on to the vb2 framework marked as STATE_ERROR.
+ * Stopping the DMA engine itself is still a TODO in this skeleton.
+ */
+static void stop_streaming(struct vb2_queue *vq)
+{
+ struct skeleton *skel = vb2_get_drv_priv(vq);
+
+ /* TODO: stop DMA */
+
+ /* Release all active buffers */
+ return_all_buffers(skel, VB2_BUF_STATE_ERROR);
+}
+
+/*
+ * The vb2 queue ops.
+ */
+static const struct vb2_ops skel_qops = {
+ .queue_setup = queue_setup,
+ .buf_prepare = buffer_prepare,
+ .buf_queue = buffer_queue,
+ .start_streaming = start_streaming,
+ .stop_streaming = stop_streaming,
+};
+
+/*
+ * Required ioctl querycap. Note that the version field is prefilled with
+ * the version of the kernel.
+ * Fills in the driver name (KBUILD_MODNAME), the card name and a bus_info
+ * string of the form "PCI:<pci_name>"; always returns 0.
+ */
+static int skeleton_querycap(struct file *file, void *priv,
+ struct v4l2_capability *cap)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ strscpy(cap->driver, KBUILD_MODNAME, sizeof(cap->driver));
+ strscpy(cap->card, "V4L2 PCI Skeleton", sizeof(cap->card));
+ snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s",
+ pci_name(skel->pdev));
+ return 0;
+}
+
+/*
+ * Fill in (and thereby correct) a struct v4l2_pix_format from the current
+ * input, SDTV standard and HDTV timings. Besides VIDIOC_TRY/S_FMT this is
+ * also called whenever a change of standard, timings or input requires the
+ * current format to be recomputed.
+ */
+static void skeleton_fill_pix_format(struct skeleton *skel,
+				     struct v4l2_pix_format *pix)
+{
+	pix->pixelformat = V4L2_PIX_FMT_YUYV;
+
+	if (skel->input) {
+		/* HDMI input: geometry follows the current DV timings */
+		const struct v4l2_bt_timings *bt = &skel->timings.bt;
+
+		pix->width = bt->width;
+		pix->height = bt->height;
+		if (bt->interlaced) {
+			/* each buffer holds a single field */
+			pix->field = V4L2_FIELD_ALTERNATE;
+			pix->height /= 2;
+		} else {
+			pix->field = V4L2_FIELD_NONE;
+		}
+		pix->colorspace = V4L2_COLORSPACE_REC709;
+	} else {
+		/* S-Video input: 480 lines for 525/60 standards, else 576 */
+		pix->width = 720;
+		if (skel->std & V4L2_STD_525_60)
+			pix->height = 480;
+		else
+			pix->height = 576;
+		pix->field = V4L2_FIELD_INTERLACED;
+		pix->colorspace = V4L2_COLORSPACE_SMPTE170M;
+	}
+
+	/*
+	 * YUYV packs two pixels into four bytes, which makes a line
+	 * width * 2 bytes long.
+	 */
+	pix->bytesperline = pix->width * 2;
+	pix->sizeimage = pix->bytesperline * pix->height;
+	pix->priv = 0;
+}
+
+/*
+ * VIDIOC_TRY_FMT: accept only YUYV and overwrite the requested format with
+ * what skeleton_fill_pix_format() computes for the current input.
+ * Returns -EINVAL for any other pixelformat, 0 otherwise.
+ */
+static int skeleton_try_fmt_vid_cap(struct file *file, void *priv,
+				    struct v4l2_format *f)
+{
+	struct skeleton *skel = video_drvdata(file);
+
+	/*
+	 * For historical reasons video receivers return -EINVAL from try_fmt
+	 * when given an unsupported pixelformat, whereas webcam drivers
+	 * silently correct it. Some video capture applications rely on this
+	 * behavior...
+	 */
+	if (f->fmt.pix.pixelformat != V4L2_PIX_FMT_YUYV)
+		return -EINVAL;
+
+	skeleton_fill_pix_format(skel, &f->fmt.pix);
+	return 0;
+}
+
+/*
+ * VIDIOC_S_FMT: validate the format with skeleton_try_fmt_vid_cap() and, if
+ * no buffers are allocated, make it the current format.
+ * Returns the try_fmt error, -EBUSY while the queue is busy, or 0.
+ */
+static int skeleton_s_fmt_vid_cap(struct file *file, void *priv,
+				  struct v4l2_format *f)
+{
+	struct skeleton *skel = video_drvdata(file);
+	int err = skeleton_try_fmt_vid_cap(file, priv, f);
+
+	if (err)
+		return err;
+
+	/*
+	 * The format must not change once buffers for use with streaming
+	 * have been allocated.
+	 */
+	if (vb2_is_busy(&skel->queue))
+		return -EBUSY;
+
+	/* TODO: change format */
+	skel->format = f->fmt.pix;
+	return 0;
+}
+
+/* VIDIOC_G_FMT: copy the current pix format to the caller; always returns 0. */
+static int skeleton_g_fmt_vid_cap(struct file *file, void *priv,
+				  struct v4l2_format *f)
+{
+	const struct skeleton *skel = video_drvdata(file);
+
+	f->fmt.pix = skel->format;
+
+	return 0;
+}
+
+/*
+ * VIDIOC_ENUM_FMT: YUYV at index 0 is the only format; every other index
+ * returns -EINVAL.
+ */
+static int skeleton_enum_fmt_vid_cap(struct file *file, void *priv,
+				     struct v4l2_fmtdesc *f)
+{
+	if (f->index == 0) {
+		f->pixelformat = V4L2_PIX_FMT_YUYV;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * VIDIOC_S_STD: select a new SDTV standard on the S-Video input and
+ * recompute the current format from it.
+ * Returns -ENODATA on the HDMI input, -EBUSY if the standard would change
+ * while buffers are allocated, and 0 otherwise.
+ */
+static int skeleton_s_std(struct file *file, void *priv, v4l2_std_id std)
+{
+	struct skeleton *skel = video_drvdata(file);
+
+	/* S_STD is not supported on the HDMI input */
+	if (skel->input != 0)
+		return -ENODATA;
+
+	/*
+	 * Nothing changes, so succeed right away. Some applications call
+	 * S_STD again after the buffers for streaming have been set up, and
+	 * that has to keep working.
+	 */
+	if (skel->std == std)
+		return 0;
+
+	/*
+	 * A new standard implies a new format, and the format must not
+	 * change while buffers for use with streaming are allocated.
+	 */
+	if (vb2_is_busy(&skel->queue))
+		return -EBUSY;
+
+	/* TODO: handle changing std */
+
+	skel->std = std;
+
+	/* Update the internal format */
+	skeleton_fill_pix_format(skel, &skel->format);
+	return 0;
+}
+
+/*
+ * VIDIOC_G_STD: report the current SDTV standard.
+ * Returns -ENODATA on the HDMI input, 0 otherwise.
+ */
+static int skeleton_g_std(struct file *file, void *priv, v4l2_std_id *std)
+{
+	const struct skeleton *skel = video_drvdata(file);
+
+	/* G_STD is not supported on the HDMI input */
+	if (skel->input != 0)
+		return -ENODATA;
+
+	*std = skel->std;
+
+	return 0;
+}
+
+/*
+ * Query the current standard as seen by the hardware. This function shall
+ * never actually change the standard, it just detects and reports.
+ * The framework will initially set *std to tvnorms (i.e. the set of
+ * supported standards by this input), and this function should just AND
+ * this value. If there is no signal, then *std should be set to 0.
+ *
+ * As long as the TODO block below is not filled in, *std is left untouched
+ * and 0 is returned on the S-Video input; the HDMI input gets -ENODATA.
+ */
+static int skeleton_querystd(struct file *file, void *priv, v4l2_std_id *std)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* QUERY_STD is not supported on the HDMI input */
+ if (skel->input)
+ return -ENODATA;
+
+#ifdef TODO
+ /*
+ * Query currently seen standard. Initial value of *std is
+ * V4L2_STD_ALL. This function should look something like this:
+ */
+ get_signal_info();
+ if (no_signal) {
+ *std = 0;
+ return 0;
+ }
+ /* Use signal information to reduce the number of possible standards */
+ if (signal_has_525_lines)
+ *std &= V4L2_STD_525_60;
+ else
+ *std &= V4L2_STD_625_50;
+#endif
+ return 0;
+}
+
+/*
+ * VIDIOC_S_DV_TIMINGS: select new HDTV timings on the HDMI input and
+ * recompute the current format from them.
+ * Returns -ENODATA on the S-Video input, -EINVAL for timings rejected by
+ * v4l2_valid_dv_timings() or v4l2_find_dv_timings_cap(), -EBUSY if the
+ * timings would change while buffers are allocated, and 0 otherwise.
+ */
+static int skeleton_s_dv_timings(struct file *file, void *priv,
+				 struct v4l2_dv_timings *timings)
+{
+	struct skeleton *skel = video_drvdata(file);
+
+	/* S_DV_TIMINGS is not supported on the S-Video input */
+	if (!skel->input)
+		return -ENODATA;
+
+	/*
+	 * Quick sanity check first, then check if the timings are part of
+	 * the CEA-861 timings.
+	 */
+	if (!v4l2_valid_dv_timings(timings, &skel_timings_cap, NULL, NULL) ||
+	    !v4l2_find_dv_timings_cap(timings, &skel_timings_cap,
+				      0, NULL, NULL))
+		return -EINVAL;
+
+	/* Same timings as the current ones: nothing to do. */
+	if (v4l2_match_dv_timings(timings, &skel->timings, 0, false))
+		return 0;
+
+	/*
+	 * New timings imply a new format, and the format must not change
+	 * while buffers for use with streaming are allocated.
+	 */
+	if (vb2_is_busy(&skel->queue))
+		return -EBUSY;
+
+	/* TODO: Configure new timings */
+
+	/* Save timings */
+	skel->timings = *timings;
+
+	/* Update the internal format */
+	skeleton_fill_pix_format(skel, &skel->format);
+	return 0;
+}
+
+/*
+ * VIDIOC_G_DV_TIMINGS: report the current HDTV timings.
+ * Returns -ENODATA on the S-Video input, 0 otherwise.
+ */
+static int skeleton_g_dv_timings(struct file *file, void *priv,
+				 struct v4l2_dv_timings *timings)
+{
+	const struct skeleton *skel = video_drvdata(file);
+
+	/* G_DV_TIMINGS is not supported on the S-Video input */
+	if (!skel->input)
+		return -ENODATA;
+
+	*timings = skel->timings;
+
+	return 0;
+}
+
+/*
+ * VIDIOC_ENUM_DV_TIMINGS: enumerate timings through
+ * v4l2_enum_dv_timings_cap(), constrained by skel_timings_cap.
+ * Returns -ENODATA on the S-Video input, else the helper's result.
+ */
+static int skeleton_enum_dv_timings(struct file *file, void *priv,
+				    struct v4l2_enum_dv_timings *timings)
+{
+	const struct skeleton *skel = video_drvdata(file);
+
+	/* ENUM_DV_TIMINGS is not supported on the S-Video input */
+	if (!skel->input)
+		return -ENODATA;
+
+	return v4l2_enum_dv_timings_cap(timings, &skel_timings_cap, NULL, NULL);
+}
+
+/*
+ * Query the current timings as seen by the hardware. This function shall
+ * never actually change the timings, it just detects and reports.
+ * If no signal is detected, then return -ENOLINK. If the hardware cannot
+ * lock to the signal, then return -ENOLCK. If the signal is out of range
+ * of the capabilities of the system (e.g., it is possible that the receiver
+ * can lock but that the DMA engine it is connected to cannot handle
+ * pixelclocks above a certain frequency), then -ERANGE is returned.
+ *
+ * As long as the TODO block below is not filled in, *timings is left
+ * untouched and 0 is returned on the HDMI input; the S-Video input gets
+ * -ENODATA.
+ */
+static int skeleton_query_dv_timings(struct file *file, void *priv,
+ struct v4l2_dv_timings *timings)
+{
+ struct skeleton *skel = video_drvdata(file);
+
+ /* QUERY_DV_TIMINGS is not supported on the S-Video input */
+ if (skel->input == 0)
+ return -ENODATA;
+
+#ifdef TODO
+ /*
+ * Query currently seen timings. This function should look
+ * something like this:
+ */
+ detect_timings();
+ if (no_signal)
+ return -ENOLINK;
+ if (cannot_lock_to_signal)
+ return -ENOLCK;
+ if (signal_out_of_range_of_capabilities)
+ return -ERANGE;
+
+ /* Useful for debugging */
+ v4l2_print_dv_timings(skel->v4l2_dev.name, "query_dv_timings:",
+ timings, true);
+#endif
+ return 0;
+}
+
+/*
+ * VIDIOC_DV_TIMINGS_CAP: copy skel_timings_cap to the caller.
+ * Returns -ENODATA on the S-Video input, 0 otherwise.
+ */
+static int skeleton_dv_timings_cap(struct file *file, void *priv,
+				   struct v4l2_dv_timings_cap *cap)
+{
+	const struct skeleton *skel = video_drvdata(file);
+
+	/* DV_TIMINGS_CAP is not supported on the S-Video input */
+	if (!skel->input)
+		return -ENODATA;
+
+	*cap = skel_timings_cap;
+
+	return 0;
+}
+
+/*
+ * VIDIOC_ENUMINPUT: describe input 0 (S-Video, SDTV standards) or input 1
+ * (HDMI, DV timings). Any other index returns -EINVAL.
+ */
+static int skeleton_enum_input(struct file *file, void *priv,
+			       struct v4l2_input *i)
+{
+	switch (i->index) {
+	case 0:
+		i->std = SKEL_TVNORMS;
+		strscpy(i->name, "S-Video", sizeof(i->name));
+		i->capabilities = V4L2_IN_CAP_STD;
+		break;
+	case 1:
+		i->std = 0;
+		strscpy(i->name, "HDMI", sizeof(i->name));
+		i->capabilities = V4L2_IN_CAP_DV_TIMINGS;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	i->type = V4L2_INPUT_TYPE_CAMERA;
+	return 0;
+}
+
+/*
+ * VIDIOC_S_INPUT: switch to input 0 (S-Video) or 1 (HDMI) and recompute
+ * tvnorms and the current format for it.
+ * Returns -EINVAL for any other input, -EBUSY while buffers are allocated,
+ * and 0 otherwise.
+ */
+static int skeleton_s_input(struct file *file, void *priv, unsigned int i)
+{
+	struct skeleton *skel = video_drvdata(file);
+
+	if (i > 1)
+		return -EINVAL;
+
+	/*
+	 * A new input implies a new format, and the format must not change
+	 * while buffers for use with streaming are allocated.
+	 */
+	if (vb2_is_busy(&skel->queue))
+		return -EBUSY;
+
+	skel->input = i;
+
+	/*
+	 * Update tvnorms. The core implements VIDIOC_ENUMSTD from this value,
+	 * so it has to be correct. If tvnorms == 0, then ENUMSTD will return
+	 * -ENODATA.
+	 */
+	if (i)
+		skel->vdev.tvnorms = 0;
+	else
+		skel->vdev.tvnorms = SKEL_TVNORMS;
+
+	/* Update the internal format */
+	skeleton_fill_pix_format(skel, &skel->format);
+	return 0;
+}
+
+/* VIDIOC_G_INPUT: report the currently selected input; always returns 0. */
+static int skeleton_g_input(struct file *file, void *priv, unsigned int *i)
+{
+	const struct skeleton *skel = video_drvdata(file);
+
+	*i = skel->input;
+
+	return 0;
+}
+
+/*
+ * The control handler. Brightness, contrast, saturation and hue are accepted
+ * (applying ctrl->val to the hardware is still a TODO); any other control id
+ * returns -EINVAL.
+ */
+static int skeleton_s_ctrl(struct v4l2_ctrl *ctrl)
+{
+ /*struct skeleton *skel =
+ container_of(ctrl->handler, struct skeleton, ctrl_handler);*/
+
+ switch (ctrl->id) {
+ case V4L2_CID_BRIGHTNESS:
+ /* TODO: set brightness to ctrl->val */
+ break;
+ case V4L2_CID_CONTRAST:
+ /* TODO: set contrast to ctrl->val */
+ break;
+ case V4L2_CID_SATURATION:
+ /* TODO: set saturation to ctrl->val */
+ break;
+ case V4L2_CID_HUE:
+ /* TODO: set hue to ctrl->val */
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* ------------------------------------------------------------------
+ Control, ioctl and file operations for the device
+ ------------------------------------------------------------------*/
+
+static const struct v4l2_ctrl_ops skel_ctrl_ops = {
+ .s_ctrl = skeleton_s_ctrl,
+};
+
+/*
+ * The set of all supported ioctls. Note that all the streaming ioctls
+ * use the vb2 helper functions that take care of all the locking and
+ * that also do ownership tracking (i.e. only the filehandle that requested
+ * the buffers can call the streaming ioctls, all other filehandles will
+ * receive -EBUSY if they attempt to call the same streaming ioctls).
+ *
+ * The last three ioctls also use standard helper functions: these implement
+ * standard behavior for drivers with controls.
+ */
+static const struct v4l2_ioctl_ops skel_ioctl_ops = {
+ .vidioc_querycap = skeleton_querycap,
+ .vidioc_try_fmt_vid_cap = skeleton_try_fmt_vid_cap,
+ .vidioc_s_fmt_vid_cap = skeleton_s_fmt_vid_cap,
+ .vidioc_g_fmt_vid_cap = skeleton_g_fmt_vid_cap,
+ .vidioc_enum_fmt_vid_cap = skeleton_enum_fmt_vid_cap,
+
+ .vidioc_g_std = skeleton_g_std,
+ .vidioc_s_std = skeleton_s_std,
+ .vidioc_querystd = skeleton_querystd,
+
+ .vidioc_s_dv_timings = skeleton_s_dv_timings,
+ .vidioc_g_dv_timings = skeleton_g_dv_timings,
+ .vidioc_enum_dv_timings = skeleton_enum_dv_timings,
+ .vidioc_query_dv_timings = skeleton_query_dv_timings,
+ .vidioc_dv_timings_cap = skeleton_dv_timings_cap,
+
+ .vidioc_enum_input = skeleton_enum_input,
+ .vidioc_g_input = skeleton_g_input,
+ .vidioc_s_input = skeleton_s_input,
+
+ .vidioc_reqbufs = vb2_ioctl_reqbufs,
+ .vidioc_create_bufs = vb2_ioctl_create_bufs,
+ .vidioc_querybuf = vb2_ioctl_querybuf,
+ .vidioc_qbuf = vb2_ioctl_qbuf,
+ .vidioc_dqbuf = vb2_ioctl_dqbuf,
+ .vidioc_expbuf = vb2_ioctl_expbuf,
+ .vidioc_streamon = vb2_ioctl_streamon,
+ .vidioc_streamoff = vb2_ioctl_streamoff,
+
+ .vidioc_log_status = v4l2_ctrl_log_status,
+ .vidioc_subscribe_event = v4l2_ctrl_subscribe_event,
+ .vidioc_unsubscribe_event = v4l2_event_unsubscribe,
+};
+
+/*
+ * The set of file operations. Note that all these ops are standard core
+ * helper functions.
+ */
+static const struct v4l2_file_operations skel_fops = {
+ .owner = THIS_MODULE,
+ .open = v4l2_fh_open,
+ .release = vb2_fop_release,
+ .unlocked_ioctl = video_ioctl2,
+ .read = vb2_fop_read,
+ .mmap = vb2_fop_mmap,
+ .poll = vb2_fop_poll,
+};
+
+/*
+ * The initial setup of this device instance. Note that the initial state of
+ * the driver should be complete. So the initial format, standard, timings
+ * and video input should all be initialized to some reasonable value.
+ *
+ * Returns 0 once the video node is registered. On failure a negative errno
+ * is returned after the control handler and the v4l2 device have been
+ * released (as far as they were set up) and the PCI device was disabled.
+ */
+static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	/* The initial timings are chosen to be 720p60. */
+	static const struct v4l2_dv_timings timings_def =
+		V4L2_DV_BT_CEA_1280X720P60;
+	struct skeleton *skel;
+	struct video_device *vdev;
+	struct v4l2_ctrl_handler *hdl;
+	struct vb2_queue *q;
+	int ret;
+
+	/* Enable PCI */
+	ret = pci_enable_device(pdev);
+	if (ret)
+		return ret;
+	ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
+	if (ret) {
+		dev_err(&pdev->dev, "no suitable DMA available.\n");
+		goto disable_pci;
+	}
+
+	/* Allocate a new instance (device-managed, so no explicit free) */
+	skel = devm_kzalloc(&pdev->dev, sizeof(struct skeleton), GFP_KERNEL);
+	if (!skel) {
+		ret = -ENOMEM;
+		goto disable_pci;
+	}
+
+	/*
+	 * Allocate the interrupt.
+	 * NOTE(review): the handler is registered before qlock and buf_list
+	 * are initialized further down. That is harmless while skeleton_irq()
+	 * is only a stub, but a real handler that touches them would need
+	 * this ordering revisited.
+	 */
+	ret = devm_request_irq(&pdev->dev, pdev->irq,
+			       skeleton_irq, 0, KBUILD_MODNAME, skel);
+	if (ret) {
+		dev_err(&pdev->dev, "request_irq failed\n");
+		goto disable_pci;
+	}
+	skel->pdev = pdev;
+
+	/* Fill in the initial format-related settings */
+	skel->timings = timings_def;
+	skel->std = V4L2_STD_625_50;
+	skeleton_fill_pix_format(skel, &skel->format);
+
+	/* Initialize the top-level structure */
+	ret = v4l2_device_register(&pdev->dev, &skel->v4l2_dev);
+	if (ret)
+		goto disable_pci;
+
+	mutex_init(&skel->lock);
+
+	/* Add the controls */
+	hdl = &skel->ctrl_handler;
+	v4l2_ctrl_handler_init(hdl, 4);
+	v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
+			  V4L2_CID_BRIGHTNESS, 0, 255, 1, 127);
+	v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
+			  V4L2_CID_CONTRAST, 0, 255, 1, 16);
+	v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
+			  V4L2_CID_SATURATION, 0, 255, 1, 127);
+	v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
+			  V4L2_CID_HUE, -128, 127, 1, 0);
+	/* A failure of any of the calls above is latched in hdl->error */
+	if (hdl->error) {
+		ret = hdl->error;
+		goto free_hdl;
+	}
+	skel->v4l2_dev.ctrl_handler = hdl;
+
+	/* Initialize the vb2 queue */
+	q = &skel->queue;
+	q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+	q->io_modes = VB2_MMAP | VB2_DMABUF | VB2_READ;
+	q->dev = &pdev->dev;
+	q->drv_priv = skel;
+	q->buf_struct_size = sizeof(struct skel_buffer);
+	q->ops = &skel_qops;
+	q->mem_ops = &vb2_dma_contig_memops;
+	q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
+	/*
+	 * Assume that this DMA engine needs to have at least two buffers
+	 * available before it can be started. The start_streaming() op
+	 * won't be called until at least this many buffers are queued up.
+	 */
+	q->min_queued_buffers = 2;
+	/*
+	 * The serialization lock for the streaming ioctls. This is the same
+	 * as the main serialization lock, but if some of the non-streaming
+	 * ioctls could take a long time to execute, then you might want to
+	 * have a different lock here to prevent VIDIOC_DQBUF from being
+	 * blocked while waiting for another action to finish. This is
+	 * generally not needed for PCI devices, but USB devices usually do
+	 * want a separate lock here.
+	 */
+	q->lock = &skel->lock;
+	/*
+	 * Since this driver can only do 32-bit DMA we must make sure that
+	 * the vb2 core will allocate the buffers in 32-bit DMA memory.
+	 */
+	q->gfp_flags = GFP_DMA32;
+	ret = vb2_queue_init(q);
+	if (ret)
+		goto free_hdl;
+
+	INIT_LIST_HEAD(&skel->buf_list);
+	spin_lock_init(&skel->qlock);
+
+	/* Initialize the video_device structure */
+	vdev = &skel->vdev;
+	strscpy(vdev->name, KBUILD_MODNAME, sizeof(vdev->name));
+	/*
+	 * There is nothing to clean up, so release is set to an empty release
+	 * function. The release callback must be non-NULL.
+	 */
+	vdev->release = video_device_release_empty;
+	vdev->fops = &skel_fops;
+	vdev->ioctl_ops = &skel_ioctl_ops;
+	vdev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_READWRITE |
+			    V4L2_CAP_STREAMING;
+	/*
+	 * The main serialization lock. All ioctls are serialized by this
+	 * lock. Exception: if q->lock is set, then the streaming ioctls
+	 * are serialized by that separate lock.
+	 */
+	vdev->lock = &skel->lock;
+	vdev->queue = q;
+	vdev->v4l2_dev = &skel->v4l2_dev;
+	/* Supported SDTV standards, if any */
+	vdev->tvnorms = SKEL_TVNORMS;
+	video_set_drvdata(vdev, skel);
+
+	ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1);
+	if (ret)
+		goto free_hdl;
+
+	dev_info(&pdev->dev, "V4L2 PCI Skeleton Driver loaded\n");
+	return 0;
+
+free_hdl:
+	v4l2_ctrl_handler_free(&skel->ctrl_handler);
+	v4l2_device_unregister(&skel->v4l2_dev);
+disable_pci:
+	pci_disable_device(pdev);
+	return ret;
+}
+
+/*
+ * Tear down one device instance: unregister the video node, free the
+ * controls, unregister the v4l2 device and disable the PCI device.
+ */
+static void skeleton_remove(struct pci_dev *pdev)
+{
+	/*
+	 * NOTE(review): this relies on the PCI drvdata pointing at
+	 * skel->v4l2_dev. Nothing in this file sets it explicitly;
+	 * presumably v4l2_device_register() does - confirm.
+	 */
+	struct v4l2_device *v4l2_dev = pci_get_drvdata(pdev);
+	struct skeleton *skel;
+
+	skel = container_of(v4l2_dev, struct skeleton, v4l2_dev);
+
+	video_unregister_device(&skel->vdev);
+	v4l2_ctrl_handler_free(&skel->ctrl_handler);
+	v4l2_device_unregister(&skel->v4l2_dev);
+	pci_disable_device(skel->pdev);
+}
+
+static struct pci_driver skeleton_driver = {
+ .name = KBUILD_MODNAME,
+ .probe = skeleton_probe,
+ .remove = skeleton_remove,
+ .id_table = skeleton_pci_tbl,
+};
+
+module_pci_driver(skeleton_driver);
diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
new file mode 100644
index 000000000000..10d179c4fdeb
--- /dev/null
+++ b/samples/vfio-mdev/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o
+obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o
diff --git a/samples/vfio-mdev/README.rst b/samples/vfio-mdev/README.rst
new file mode 100644
index 000000000000..b52eb37739c0
--- /dev/null
+++ b/samples/vfio-mdev/README.rst
@@ -0,0 +1,100 @@
+Using the mtty vfio-mdev sample code
+====================================
+
+mtty is a sample vfio-mdev driver that demonstrates how to use the mediated
+device framework.
+
+The sample driver creates an mdev device that simulates a serial port over a PCI
+card.
+
+1. Build and load the mtty.ko module.
+
+ This step creates a dummy device, /sys/devices/virtual/mtty/mtty/
+
+ Files in this device directory in sysfs are similar to the following::
+
+ # tree /sys/devices/virtual/mtty/mtty/
+ /sys/devices/virtual/mtty/mtty/
+ |-- mdev_supported_types
+ | |-- mtty-1
+ | | |-- available_instances
+ | | |-- create
+ | | |-- device_api
+ | | |-- devices
+ | | `-- name
+ | `-- mtty-2
+ | |-- available_instances
+ | |-- create
+ | |-- device_api
+ | |-- devices
+ | `-- name
+ |-- mtty_dev
+ | `-- sample_mtty_dev
+ |-- power
+ | |-- autosuspend_delay_ms
+ | |-- control
+ | |-- runtime_active_time
+ | |-- runtime_status
+ | `-- runtime_suspended_time
+ |-- subsystem -> ../../../../class/mtty
+ `-- uevent
+
+2. Create a mediated device by using the dummy device that you created in the
+ previous step::
+
+ # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \
+ /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create
+
+3. Add parameters to qemu-kvm::
+
+ -device vfio-pci,\
+ sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
+
+4. Boot the VM.
+
+ In the Linux guest VM, with no hardware on the host, the device appears
+ as follows::
+
+ # lspci -s 00:05.0 -xxvv
+ 00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550])
+ Subsystem: Device 4348:3253
+ Physical Slot: 5
+ Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr-
+ Stepping- SERR- FastB2B- DisINTx-
+ Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort-
+ <TAbort- <MAbort- >SERR- <PERR- INTx-
+ Interrupt: pin A routed to IRQ 10
+ Region 0: I/O ports at c150 [size=8]
+ Region 1: I/O ports at c158 [size=8]
+ Kernel driver in use: serial
+ 00: 48 43 53 32 01 00 00 02 10 02 00 07 00 00 00 00
+ 10: 51 c1 00 00 59 c1 00 00 00 00 00 00 00 00 00 00
+ 20: 00 00 00 00 00 00 00 00 00 00 00 00 48 43 53 32
+ 30: 00 00 00 00 00 00 00 00 00 00 00 00 0a 01 00 00
+
+ In the Linux guest VM, dmesg output for the device is as follows::
+
+ serial 0000:00:05.0: PCI INT A -> Link[LNKA] -> GSI 10 (level, high) -> IRQ 10
+ 0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A
+ 0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A
+
+
+5. In the Linux guest VM, check the serial ports::
+
+ # setserial -g /dev/ttyS*
+ /dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4
+ /dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10
+ /dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10
+
+6. Using minicom or any terminal emulation program, open port /dev/ttyS1 or
+ /dev/ttyS2 with hardware flow control disabled.
+
+7. Type data on the minicom terminal or send data to the terminal emulation
+ program and read the data.
+
+ Data is looped back by the host's mtty driver.
+
+8. Destroy the mediated device that you created::
+
+ # echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove
+
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
new file mode 100644
index 000000000000..64ea19253ee3
--- /dev/null
+++ b/samples/vfio-mdev/mbochs.c
@@ -0,0 +1,1451 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Mediated virtual PCI display host device driver
+ *
+ * Emulate enough of qemu stdvga to make bochs-drm.ko happy. That is
+ * basically the vram memory bar and the bochs dispi interface vbe
+ * registers in the mmio register bar. Specifically it does *not*
+ * include any legacy vga stuff. Device looks a lot like "qemu -device
+ * secondary-vga".
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * based on mtty driver which is:
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/cdev.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <linux/dma-buf.h>
+#include <linux/highmem.h>
+#include <drm/drm_fourcc.h>
+#include <drm/drm_rect.h>
+#include <drm/drm_modeset_lock.h>
+#include <drm/drm_property.h>
+#include <drm/drm_plane.h>
+
+
+/*
+ * Register indices of the bochs dispi (vbe) interface. They are used as
+ * indices into the per-device vbe[] array; VBE_DISPI_INDEX_COUNT is the
+ * number of registers and therefore the size of that array.
+ */
+#define VBE_DISPI_INDEX_ID 0x0
+#define VBE_DISPI_INDEX_XRES 0x1
+#define VBE_DISPI_INDEX_YRES 0x2
+#define VBE_DISPI_INDEX_BPP 0x3
+#define VBE_DISPI_INDEX_ENABLE 0x4
+#define VBE_DISPI_INDEX_BANK 0x5
+#define VBE_DISPI_INDEX_VIRT_WIDTH 0x6
+#define VBE_DISPI_INDEX_VIRT_HEIGHT 0x7
+#define VBE_DISPI_INDEX_X_OFFSET 0x8
+#define VBE_DISPI_INDEX_Y_OFFSET 0x9
+#define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa
+#define VBE_DISPI_INDEX_COUNT 0xb
+
+/* Values for the ID register; the device reports VBE_DISPI_ID5 after reset. */
+#define VBE_DISPI_ID0 0xB0C0
+#define VBE_DISPI_ID1 0xB0C1
+#define VBE_DISPI_ID2 0xB0C2
+#define VBE_DISPI_ID3 0xB0C3
+#define VBE_DISPI_ID4 0xB0C4
+#define VBE_DISPI_ID5 0xB0C5
+
+/*
+ * Bits of the ENABLE register. Only VBE_DISPI_ENABLED is tested by the code
+ * in this file (framebuffer validation).
+ */
+#define VBE_DISPI_DISABLED 0x00
+#define VBE_DISPI_ENABLED 0x01
+#define VBE_DISPI_GETCAPS 0x02
+#define VBE_DISPI_8BIT_DAC 0x20
+#define VBE_DISPI_LFB_ENABLED 0x40
+#define VBE_DISPI_NOCLEARMEM 0x80
+
+
+#define MBOCHS_NAME "mbochs"
+#define MBOCHS_CLASS_NAME "mbochs"
+
+/* The EDID region is an extra region appended after the standard PCI ones. */
+#define MBOCHS_EDID_REGION_INDEX VFIO_PCI_NUM_REGIONS
+#define MBOCHS_NUM_REGIONS (MBOCHS_EDID_REGION_INDEX+1)
+
+/*
+ * Layout of the device file offsets used by read/write/mmap:
+ *   [0, MBOCHS_CONFIG_SPACE_SIZE)      PCI config space
+ *   MBOCHS_MMIO_BAR_OFFSET             mmio register bar, one page
+ *   MBOCHS_EDID_OFFSET                 EDID region, one page
+ *   MBOCHS_MEMORY_BAR_OFFSET           video memory bar (memsize bytes)
+ */
+#define MBOCHS_CONFIG_SPACE_SIZE 0xff
+#define MBOCHS_MMIO_BAR_OFFSET PAGE_SIZE
+#define MBOCHS_MMIO_BAR_SIZE PAGE_SIZE
+#define MBOCHS_EDID_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \
+ MBOCHS_MMIO_BAR_SIZE)
+#define MBOCHS_EDID_SIZE PAGE_SIZE
+#define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_EDID_OFFSET + \
+ MBOCHS_EDID_SIZE)
+
+/*
+ * Within the EDID region the first half holds the control registers and the
+ * EDID blob starts at this offset.
+ */
+#define MBOCHS_EDID_BLOB_OFFSET (MBOCHS_EDID_SIZE/2)
+
+/*
+ * Store a 16-bit / 32-bit value at @addr.
+ *
+ * Despite the names these are plain native-endian stores; no byte swapping
+ * is performed here. Both arguments are parenthesized so that arbitrary
+ * expressions can be passed without precedence surprises.
+ */
+#define STORE_LE16(addr, val)	(*(u16 *)(addr) = (val))
+#define STORE_LE32(addr, val)	(*(u32 *)(addr) = (val))
+
+
+MODULE_DESCRIPTION("Mediated virtual PCI display host device driver");
+MODULE_LICENSE("GPL v2");
+
+/*
+ * Memory budget for mbochs devices, in megabytes. The parameter is exposed
+ * to user space under the name "count", so the description has to be
+ * attached to that same name for modinfo to show it.
+ */
+static int max_mbytes = 256;
+module_param_named(count, max_mbytes, int, 0444);
+MODULE_PARM_DESC(count, "megabytes available to " MBOCHS_NAME " devices");
+
+
+/* sysfs names of the three supported mdev types. */
+#define MBOCHS_TYPE_1 "small"
+#define MBOCHS_TYPE_2 "medium"
+#define MBOCHS_TYPE_3 "large"
+
+/*
+ * Per-type parameters. Each entry embeds the generic mdev_type so the entry
+ * can be recovered from an mdev_type pointer with container_of().
+ *
+ * @mbytes: video memory of a device of this type, in megabytes; also the
+ *          amount charged against the module-wide budget per device.
+ * @max_x:  copied into the EDID region's max_xres register.
+ * @max_y:  copied into the EDID region's max_yres register.
+ *
+ * NOTE(review): the "large" type uses 0 for max_x/max_y, presumably meaning
+ * "no limit" - confirm against the consumer of the EDID registers.
+ */
+static struct mbochs_type {
+ struct mdev_type type;
+ u32 mbytes;
+ u32 max_x;
+ u32 max_y;
+} mbochs_types[] = {
+ {
+ .type.sysfs_name = MBOCHS_TYPE_1,
+ .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1,
+ .mbytes = 4,
+ .max_x = 800,
+ .max_y = 600,
+ }, {
+ .type.sysfs_name = MBOCHS_TYPE_2,
+ .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2,
+ .mbytes = 16,
+ .max_x = 1920,
+ .max_y = 1440,
+ }, {
+ .type.sysfs_name = MBOCHS_TYPE_3,
+ .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3,
+ .mbytes = 64,
+ .max_x = 0,
+ .max_y = 0,
+ },
+};
+
+/* Pointer table over the embedded mdev_type of every entry above. */
+static struct mdev_type *mbochs_mdev_types[] = {
+ &mbochs_types[0].type,
+ &mbochs_types[1].type,
+ &mbochs_types[2].type,
+};
+
+/* Module-wide state. */
+static dev_t mbochs_devt;
+static const struct class mbochs_class = {
+ .name = MBOCHS_CLASS_NAME,
+};
+static struct cdev mbochs_cdev;
+static struct device mbochs_dev;
+static struct mdev_parent mbochs_parent;
+/*
+ * Megabytes still available for new devices: reduced when a device is
+ * initialised and given back when it is released.
+ */
+static atomic_t mbochs_avail_mbytes;
+/* Forward declaration; referenced by mbochs_probe() before its definition. */
+static const struct vfio_device_ops mbochs_dev_ops;
+
+/*
+ * Description of a framebuffer configuration as derived from the vbe
+ * registers. Modes are compared bytewise with memcmp(), which is why the
+ * padding is explicit and the structure is always zeroed before being filled.
+ */
+struct mbochs_mode {
+ u32 drm_format;
+ u32 bytepp;
+ u32 width;
+ u32 height;
+ u32 stride;
+ u32 __pad;
+ u64 offset;
+ u64 size;
+};
+
+/*
+ * A framebuffer snapshot that can be exported as a dma-buf.
+ *
+ * @mode:       framebuffer configuration this buffer was created for
+ * @id:         identifier handed out from mdev_state->next_id
+ * @pages:      array of @pagecount page pointers, each holding a reference
+ * @buf:        exported dma_buf, or NULL when not (or no longer) exported
+ * @mdev_state: owning device
+ * @next:       link in mdev_state->dmabufs
+ * @unlinked:   when set, the dma-buf release callback frees this structure
+ */
+struct mbochs_dmabuf {
+ struct mbochs_mode mode;
+ u32 id;
+ struct page **pages;
+ pgoff_t pagecount;
+ struct dma_buf *buf;
+ struct mdev_state *mdev_state;
+ struct list_head next;
+ bool unlinked;
+};
+
+/* State of each mdev device */
+struct mdev_state {
+ struct vfio_device vdev;
+ /* emulated PCI config space, MBOCHS_CONFIG_SPACE_SIZE bytes */
+ u8 *vconfig;
+ /* size masks applied when a BAR is probed with an all-ones write */
+ u64 bar_mask[3];
+ u32 memory_bar_mask;
+ /* serialises register/memory access and the dmabuf list */
+ struct mutex ops_lock;
+ struct mdev_device *mdev;
+
+ const struct mbochs_type *type;
+ /* bochs dispi registers, indexed by VBE_DISPI_INDEX_* */
+ u16 vbe[VBE_DISPI_INDEX_COUNT];
+ /* video memory size in bytes, and the same expressed in pages */
+ u64 memsize;
+ /* video memory pages; entries stay NULL until first use */
+ struct page **pages;
+ pgoff_t pagecount;
+ struct vfio_region_gfx_edid edid_regs;
+ u8 edid_blob[0x400];
+
+ /* list of struct mbochs_dmabuf, linked through ->next */
+ struct list_head dmabufs;
+ /* dmabuf id last reported for the primary plane */
+ u32 active_id;
+ /* id to assign to the next dmabuf; starts at 1 */
+ u32 next_id;
+};
+
+/* Human-readable register names for debug output, indexed by register. */
+static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = {
+	[VBE_DISPI_INDEX_ID]               = "id",
+	[VBE_DISPI_INDEX_XRES]             = "xres",
+	[VBE_DISPI_INDEX_YRES]             = "yres",
+	[VBE_DISPI_INDEX_BPP]              = "bpp",
+	[VBE_DISPI_INDEX_ENABLE]           = "enable",
+	[VBE_DISPI_INDEX_BANK]             = "bank",
+	[VBE_DISPI_INDEX_VIRT_WIDTH]       = "virt-width",
+	[VBE_DISPI_INDEX_VIRT_HEIGHT]      = "virt-height",
+	[VBE_DISPI_INDEX_X_OFFSET]         = "x-offset",
+	[VBE_DISPI_INDEX_Y_OFFSET]         = "y-offset",
+	[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = "video-mem",
+};
+
+/*
+ * Map a vbe register index to its name.
+ *
+ * Returns "(invalid)" for an index outside of vbe_name_list.
+ */
+static const char *vbe_name(u32 index)
+{
+	if (index >= ARRAY_SIZE(vbe_name_list))
+		return "(invalid)";
+
+	return vbe_name_list[index];
+}
+
+/*
+ * Forward declarations of the page helpers defined further down.
+ * __mbochs_get_page() expects ops_lock to be held by the caller, while
+ * mbochs_get_page() takes the lock itself.
+ */
+static struct page *__mbochs_get_page(struct mdev_state *mdev_state,
+ pgoff_t pgoff);
+static struct page *mbochs_get_page(struct mdev_state *mdev_state,
+ pgoff_t pgoff);
+
+/*
+ * Populate the emulated PCI config space header of a freshly allocated
+ * device: IDs, command register, class/revision and the two memory BARs.
+ * The BAR size masks are recorded in bar_mask[] for later BAR sizing writes.
+ */
+static void mbochs_create_config_space(struct mdev_state *mdev_state)
+{
+	u8 *cfg = mdev_state->vconfig;
+
+	/* Vendor/device and subsystem identification. */
+	STORE_LE16((u16 *) &cfg[PCI_VENDOR_ID], 0x1234);
+	STORE_LE16((u16 *) &cfg[PCI_DEVICE_ID], 0x1111);
+	STORE_LE16((u16 *) &cfg[PCI_SUBSYSTEM_VENDOR_ID],
+		   PCI_SUBVENDOR_ID_REDHAT_QUMRANET);
+	STORE_LE16((u16 *) &cfg[PCI_SUBSYSTEM_ID], PCI_SUBDEVICE_ID_QEMU);
+
+	/* Command register, device class and revision. */
+	STORE_LE16((u16 *) &cfg[PCI_COMMAND],
+		   PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+	STORE_LE16((u16 *) &cfg[PCI_CLASS_DEVICE], PCI_CLASS_DISPLAY_OTHER);
+	cfg[PCI_CLASS_REVISION] = 0x01;
+
+	/* BAR 0: prefetchable 32-bit memory bar covering the video memory. */
+	STORE_LE32((u32 *) &cfg[PCI_BASE_ADDRESS_0],
+		   PCI_BASE_ADDRESS_SPACE_MEMORY |
+		   PCI_BASE_ADDRESS_MEM_TYPE_32 |
+		   PCI_BASE_ADDRESS_MEM_PREFETCH);
+	mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1;
+
+	/* BAR 2: 32-bit memory bar for the mmio registers. */
+	STORE_LE32((u32 *) &cfg[PCI_BASE_ADDRESS_2],
+		   PCI_BASE_ADDRESS_SPACE_MEMORY |
+		   PCI_BASE_ADDRESS_MEM_TYPE_32);
+	mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1;
+}
+
+/*
+ * Derive the current framebuffer configuration from the vbe registers.
+ *
+ * Must be called with ops_lock held. On success @mode is filled in and 0 is
+ * returned. If the display is disabled, the bpp is not 32, the resolution is
+ * smaller than 64x64 or the framebuffer does not fit into video memory,
+ * @mode is zeroed and -EINVAL is returned.
+ */
+static int mbochs_check_framebuffer(struct mdev_state *mdev_state,
+				    struct mbochs_mode *mode)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	const u16 *regs = mdev_state->vbe;
+	u32 virt_width;
+	u64 x_off, y_off;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	if (!(regs[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED))
+		goto nofb;
+
+	/* Zero everything (padding included): modes are compared bytewise. */
+	memset(mode, 0, sizeof(*mode));
+
+	/* Only 32 bpp (XRGB8888) is supported. */
+	if (regs[VBE_DISPI_INDEX_BPP] != 32) {
+		dev_info_ratelimited(dev, "%s: bpp %d not supported\n",
+				     __func__, regs[VBE_DISPI_INDEX_BPP]);
+		goto nofb;
+	}
+	mode->drm_format = DRM_FORMAT_XRGB8888;
+	mode->bytepp = 4;
+
+	mode->width = regs[VBE_DISPI_INDEX_XRES];
+	mode->height = regs[VBE_DISPI_INDEX_YRES];
+
+	/* The virtual width is never allowed to be below the visible width. */
+	virt_width = regs[VBE_DISPI_INDEX_VIRT_WIDTH];
+	if (virt_width < mode->width)
+		virt_width = mode->width;
+
+	mode->stride = virt_width * mode->bytepp;
+	mode->size = (u64)mode->stride * mode->height;
+
+	x_off = (u64)regs[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp;
+	y_off = (u64)regs[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride;
+	mode->offset = x_off + y_off;
+
+	if (mode->width < 64 || mode->height < 64) {
+		dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n",
+				     __func__, mode->width, mode->height);
+		goto nofb;
+	}
+	if (mode->offset + mode->size > mdev_state->memsize) {
+		dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n",
+				     __func__);
+		goto nofb;
+	}
+
+	return 0;
+
+nofb:
+	memset(mode, 0, sizeof(*mode));
+	return -EINVAL;
+}
+
+/* Bytewise comparison of two modes; true when they are identical. */
+static bool mbochs_modes_equal(struct mbochs_mode *mode1,
+			       struct mbochs_mode *mode2)
+{
+	return !memcmp(mode1, mode2, sizeof(*mode1));
+}
+
+/*
+ * Handle a guest write to PCI config space.
+ *
+ * Only writes to BAR 0 and BAR 2 have an effect; everything else is ignored.
+ * An all-ones write is the BAR sizing probe and is answered with the size
+ * mask, any other value is stored as the new BAR address. The low flag bits
+ * of the BAR are always preserved.
+ *
+ * @buf holds @count bytes. The BAR value is read as a full 32-bit word, so
+ * accesses of any other width are ignored instead of reading past the end of
+ * the caller's buffer.
+ */
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+				 char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	int index = (offset - PCI_BASE_ADDRESS_0) / 0x04;
+	u32 cfg_addr;
+
+	switch (offset) {
+	case PCI_BASE_ADDRESS_0:
+	case PCI_BASE_ADDRESS_2:
+		if (count != sizeof(cfg_addr)) {
+			dev_dbg(dev, "%s: BAR #%d: ignoring %u byte write\n",
+				__func__, index, count);
+			break;
+		}
+		cfg_addr = *(u32 *)buf;
+
+		if (cfg_addr == 0xffffffff) {
+			/* BAR sizing: report the size mask. */
+			cfg_addr = (cfg_addr & mdev_state->bar_mask[index]);
+		} else {
+			cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
+			if (cfg_addr)
+				dev_info(dev, "BAR #%d @ 0x%x\n",
+					 index, cfg_addr);
+		}
+
+		/* Keep the read-only type/prefetch flag bits. */
+		cfg_addr |= (mdev_state->vconfig[offset] &
+			     ~PCI_BASE_ADDRESS_MEM_MASK);
+		STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
+		break;
+	}
+}
+
+/*
+ * Handle a write to the mmio register bar.
+ *
+ * Only 16-bit writes to the bochs dispi window (0x500..0x515) are acted
+ * upon; they update the corresponding vbe register. Every other access
+ * (remapped vga ioports, qemu extended registers, wrong width, ...) is only
+ * logged as unhandled.
+ */
+static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset,
+			      char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	bool in_dispi = offset >= 0x500 && offset <= 0x515;
+	int index;
+	u16 reg16;
+
+	if (!in_dispi || count != 2) {
+		dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
+			__func__, offset, count);
+		return;
+	}
+
+	index = (offset - 0x500) / 2;
+	reg16 = *(u16 *)buf;
+	if (index < ARRAY_SIZE(mdev_state->vbe))
+		mdev_state->vbe[index] = reg16;
+	dev_dbg(dev, "%s: vbe write %d = %d (%s)\n",
+		__func__, index, reg16, vbe_name(index));
+}
+
+/*
+ * Handle a read from the mmio register bar.
+ *
+ * 0x000..0x3ff returns EDID blob data while the link is up and the offset is
+ * below the current EDID size, zeroes otherwise. 16-bit reads from the bochs
+ * dispi window (0x500..0x515) return the vbe register. Anything else is
+ * logged as unhandled and reads as zeroes.
+ */
+static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset,
+			     char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+
+	if (offset <= 0x3ff) {
+		/* edid block */
+		struct vfio_region_gfx_edid *edid = &mdev_state->edid_regs;
+		bool visible =
+			edid->link_state == VFIO_DEVICE_GFX_LINK_STATE_UP &&
+			offset < edid->edid_size;
+
+		if (visible)
+			memcpy(buf, mdev_state->edid_blob + offset, count);
+		else
+			memset(buf, 0, count);
+		return;
+	}
+
+	if (offset >= 0x500 && offset <= 0x515 && count == 2) {
+		/* bochs dispi interface */
+		int index = (offset - 0x500) / 2;
+		u16 reg16 = 0;
+
+		if (index < ARRAY_SIZE(mdev_state->vbe))
+			reg16 = mdev_state->vbe[index];
+		dev_dbg(dev, "%s: vbe read %d = %d (%s)\n",
+			__func__, index, reg16, vbe_name(index));
+		*(u16 *)buf = reg16;
+		return;
+	}
+
+	dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
+		__func__, offset, count);
+	memset(buf, 0, count);
+}
+
+/*
+ * Access the EDID control registers (struct vfio_region_gfx_edid).
+ *
+ * Only naturally aligned 32-bit accesses that lie completely inside the
+ * register block are processed; anything else is silently ignored and leaves
+ * @buf untouched. Reads may target any register, writes only take effect on
+ * link_state and edid_size.
+ */
+static void handle_edid_regs(struct mdev_state *mdev_state, u16 offset,
+			     char *buf, u32 count, bool is_write)
+{
+	char *regs = (void *)&mdev_state->edid_regs;
+	bool writable;
+
+	if (offset + count > sizeof(mdev_state->edid_regs) ||
+	    count != 4 || offset % 4)
+		return;
+
+	if (!is_write) {
+		memcpy(buf, regs + offset, count);
+		return;
+	}
+
+	/* All other registers are read-only. */
+	writable = offset == offsetof(struct vfio_region_gfx_edid, link_state) ||
+		   offset == offsetof(struct vfio_region_gfx_edid, edid_size);
+	if (writable)
+		memcpy(regs + offset, buf, count);
+}
+
+/*
+ * Copy @count bytes between @buf and the EDID blob at @offset. Accesses that
+ * would go beyond edid_max_size are ignored and leave @buf untouched.
+ */
+static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset,
+			     char *buf, u32 count, bool is_write)
+{
+	u8 *blob;
+
+	if (offset + count > mdev_state->edid_regs.edid_max_size)
+		return;
+
+	blob = mdev_state->edid_blob + offset;
+	if (is_write)
+		memcpy(blob, buf, count);
+	else
+		memcpy(buf, blob, count);
+}
+
+/*
+ * Perform one read or write of @count bytes at device file offset @pos.
+ *
+ * The offset is decoded into one of the emulated regions (PCI config space,
+ * mmio register bar, EDID region, video memory bar) and dispatched to the
+ * matching handler while holding ops_lock.
+ *
+ * Returns @count on success and a negative value if the access does not fall
+ * into any region or the backing video memory page cannot be allocated.
+ * Callers only distinguish "> 0" from "<= 0".
+ */
+static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf,
+			   size_t count, loff_t pos, bool is_write)
+{
+	struct page *pg;
+	loff_t poff;
+	char *map;
+	int ret = 0;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	if (pos < MBOCHS_CONFIG_SPACE_SIZE) {
+		/*
+		 * NOTE(review): only the start of the access is checked
+		 * against the config space size here.
+		 */
+		if (is_write)
+			handle_pci_cfg_write(mdev_state, pos, buf, count);
+		else
+			memcpy(buf, (mdev_state->vconfig + pos), count);
+
+	} else if (pos >= MBOCHS_MMIO_BAR_OFFSET &&
+		   pos + count <= (MBOCHS_MMIO_BAR_OFFSET +
+				   MBOCHS_MMIO_BAR_SIZE)) {
+		pos -= MBOCHS_MMIO_BAR_OFFSET;
+		if (is_write)
+			handle_mmio_write(mdev_state, pos, buf, count);
+		else
+			handle_mmio_read(mdev_state, pos, buf, count);
+
+	} else if (pos >= MBOCHS_EDID_OFFSET &&
+		   pos + count <= (MBOCHS_EDID_OFFSET +
+				   MBOCHS_EDID_SIZE)) {
+		pos -= MBOCHS_EDID_OFFSET;
+		if (pos < MBOCHS_EDID_BLOB_OFFSET) {
+			handle_edid_regs(mdev_state, pos, buf, count, is_write);
+		} else {
+			pos -= MBOCHS_EDID_BLOB_OFFSET;
+			handle_edid_blob(mdev_state, pos, buf, count, is_write);
+		}
+
+	} else if (pos >= MBOCHS_MEMORY_BAR_OFFSET &&
+		   pos + count <=
+		   MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) {
+		/*
+		 * Translate to an offset inside video memory. This must be
+		 * relative to the start of the memory bar: using any other
+		 * base would address a different page than mmap() does and
+		 * could index past the end of pages[].
+		 */
+		pos -= MBOCHS_MEMORY_BAR_OFFSET;
+		poff = pos & ~PAGE_MASK;
+		pg = __mbochs_get_page(mdev_state, pos >> PAGE_SHIFT);
+		if (!pg) {
+			/* Backing page could not be allocated. */
+			ret = -ENOMEM;
+			goto accessfailed;
+		}
+		map = kmap(pg);
+		if (is_write)
+			memcpy(map + poff, buf, count);
+		else
+			memcpy(buf, map + poff, count);
+		kunmap(pg);
+		put_page(pg);
+
+	} else {
+		dev_dbg(mdev_state->vdev.dev, "%s: %s @0x%llx (unhandled)\n",
+			__func__, is_write ? "WR" : "RD", pos);
+		ret = -1;
+		goto accessfailed;
+	}
+
+	ret = count;
+
+
+accessfailed:
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return ret;
+}
+
+/*
+ * Put the vbe registers into their reset state: everything zero except the
+ * ID register (VBE_DISPI_ID5) and the video memory size in 64k units.
+ * Always returns 0.
+ */
+static int mbochs_reset(struct mdev_state *mdev_state)
+{
+	u32 size64k = mdev_state->memsize / (64 * 1024);
+
+	memset(mdev_state->vbe, 0, sizeof(mdev_state->vbe));
+	mdev_state->vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5;
+	mdev_state->vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = size64k;
+
+	return 0;
+}
+
+/*
+ * vfio_device init callback: set up the per-device state.
+ *
+ * The memory needed by the device type is first reserved from the
+ * module-wide budget, then the config space and the (initially empty) page
+ * array are allocated and the emulated registers are initialised.
+ *
+ * Returns 0 on success, -ENOSPC if the budget is exhausted and -ENOMEM if an
+ * allocation fails (in which case the reservation is given back).
+ */
+static int mbochs_init_dev(struct vfio_device *vdev)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	struct mdev_device *mdev = to_mdev_device(vdev->dev);
+	struct mbochs_type *type =
+		container_of(mdev->type, struct mbochs_type, type);
+	int avail = atomic_read(&mbochs_avail_mbytes);
+
+	/* Reserve type->mbytes; retry if the counter changed underneath us. */
+	for (;;) {
+		if (avail < type->mbytes)
+			return -ENOSPC;
+		if (atomic_try_cmpxchg(&mbochs_avail_mbytes, &avail,
+				       avail - type->mbytes))
+			break;
+	}
+
+	mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL);
+	if (!mdev_state->vconfig)
+		goto err_avail;
+
+	mdev_state->memsize = type->mbytes * 1024 * 1024;
+	mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT;
+	mdev_state->pages = kcalloc(mdev_state->pagecount,
+				    sizeof(struct page *),
+				    GFP_KERNEL);
+	if (!mdev_state->pages)
+		goto err_vconfig;
+
+	mutex_init(&mdev_state->ops_lock);
+	mdev_state->mdev = mdev;
+	INIT_LIST_HEAD(&mdev_state->dmabufs);
+	mdev_state->next_id = 1;
+
+	mdev_state->type = type;
+	mdev_state->edid_regs.max_xres = type->max_x;
+	mdev_state->edid_regs.max_yres = type->max_y;
+	mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET;
+	mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob);
+	mbochs_create_config_space(mdev_state);
+	mbochs_reset(mdev_state);
+
+	dev_info(vdev->dev, "%s: %s, %d MB, %ld pages\n", __func__,
+		 type->type.pretty_name, type->mbytes, mdev_state->pagecount);
+	return 0;
+
+err_vconfig:
+	kfree(mdev_state->vconfig);
+err_avail:
+	atomic_add(type->mbytes, &mbochs_avail_mbytes);
+	return -ENOMEM;
+}
+
+/*
+ * mdev probe callback: allocate the vfio device for @mdev, register it as an
+ * emulated-IOMMU device and remember it as the mdev's driver data.
+ *
+ * Returns 0 on success or a negative errno; on registration failure the
+ * reference obtained from vfio_alloc_device() is dropped again.
+ */
+static int mbochs_probe(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state;
+	int ret;
+
+	mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev,
+				       &mbochs_dev_ops);
+	if (IS_ERR(mdev_state))
+		return PTR_ERR(mdev_state);
+
+	ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev);
+	if (ret) {
+		vfio_put_device(&mdev_state->vdev);
+		return ret;
+	}
+
+	dev_set_drvdata(&mdev->dev, mdev_state);
+	return 0;
+}
+
+/*
+ * vfio_device release callback: hand the device's memory reservation back to
+ * the module-wide budget and free the page array and the config space.
+ */
+static void mbochs_release_dev(struct vfio_device *vdev)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	const struct mbochs_type *type = mdev_state->type;
+
+	atomic_add(type->mbytes, &mbochs_avail_mbytes);
+	kfree(mdev_state->pages);
+	kfree(mdev_state->vconfig);
+}
+
+/*
+ * mdev remove callback: unregister the vfio device stored as driver data by
+ * mbochs_probe() and drop the reference taken there.
+ */
+static void mbochs_remove(struct mdev_device *mdev)
+{
+	struct mdev_state *state = dev_get_drvdata(&mdev->dev);
+	struct vfio_device *vdev = &state->vdev;
+
+	vfio_unregister_group_dev(vdev);
+	vfio_put_device(vdev);
+}
+
+/*
+ * vfio_device read callback.
+ *
+ * The request is split into naturally aligned accesses of 4, 2 or 1 bytes,
+ * each of which is served by mdev_access() through a small bounce buffer and
+ * then copied to user space. *@ppos is advanced as data is transferred.
+ *
+ * Returns the number of bytes read, or -EFAULT if an access is rejected or
+ * the copy to user space fails.
+ */
+static ssize_t mbochs_read(struct vfio_device *vdev, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	unsigned int done = 0;
+	int ret;
+
+	while (count) {
+		size_t filled;
+		/*
+		 * Zero-initialised: some handlers return without touching
+		 * the buffer (e.g. rejected EDID accesses), and stale kernel
+		 * stack contents must never reach user space.
+		 */
+		u32 val = 0;
+
+		/* Pick the widest naturally aligned access that still fits. */
+		if (count >= 4 && !(*ppos % 4))
+			filled = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			filled = 2;
+		else
+			filled = 1;
+
+		ret = mdev_access(mdev_state, (char *)&val, filled,
+				  *ppos, false);
+		if (ret <= 0)
+			goto read_err;
+
+		if (copy_to_user(buf, &val, filled))
+			goto read_err;
+
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+
+read_err:
+	return -EFAULT;
+}
+
+/*
+ * vfio_device write callback.
+ *
+ * The request is split into naturally aligned accesses of 4, 2 or 1 bytes.
+ * Each chunk is copied from user space into a bounce buffer and handed to
+ * mdev_access(). *@ppos is advanced as data is transferred.
+ *
+ * Returns the number of bytes written, or -EFAULT if the copy from user
+ * space fails or an access is rejected.
+ */
+static ssize_t mbochs_write(struct vfio_device *vdev, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	unsigned int done = 0;
+	int ret;
+
+	while (count) {
+		size_t filled;
+		/*
+		 * Always a full, zeroed 32-bit word: the config space handler
+		 * dereferences the buffer as u32, so a narrower stack object
+		 * would be read past its end.
+		 */
+		u32 val = 0;
+
+		/* Pick the widest naturally aligned access that still fits. */
+		if (count >= 4 && !(*ppos % 4))
+			filled = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			filled = 2;
+		else
+			filled = 1;
+
+		if (copy_from_user(&val, buf, filled))
+			goto write_err;
+
+		ret = mdev_access(mdev_state, (char *)&val, filled,
+				  *ppos, true);
+		if (ret <= 0)
+			goto write_err;
+
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+write_err:
+	return -EFAULT;
+}
+
+/*
+ * Return video memory page @pgoff with an extra reference, allocating a
+ * zeroed page on first use. The caller must hold ops_lock and must have
+ * validated @pgoff. Returns NULL if the page cannot be allocated.
+ */
+static struct page *__mbochs_get_page(struct mdev_state *mdev_state,
+				      pgoff_t pgoff)
+{
+	struct page **slot = &mdev_state->pages[pgoff];
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	if (!*slot) {
+		*slot = alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0);
+		if (!*slot)
+			return NULL;
+	}
+
+	get_page(*slot);
+	return *slot;
+}
+
+/*
+ * Locked wrapper around __mbochs_get_page(): checks @pgoff against the page
+ * count (warning and returning NULL when out of range) and takes ops_lock
+ * around the lookup.
+ */
+static struct page *mbochs_get_page(struct mdev_state *mdev_state,
+				    pgoff_t pgoff)
+{
+	struct page *pg;
+
+	if (WARN_ON(pgoff >= mdev_state->pagecount))
+		return NULL;
+
+	mutex_lock(&mdev_state->ops_lock);
+	pg = __mbochs_get_page(mdev_state, pgoff);
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return pg;
+}
+
+/*
+ * Drop the device's own reference on every allocated video memory page and
+ * clear the page array. The caller must hold ops_lock.
+ */
+static void mbochs_put_pages(struct mdev_state *mdev_state)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	int released = 0;
+	int i;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	for (i = 0; i < mdev_state->pagecount; i++) {
+		struct page **slot = &mdev_state->pages[i];
+
+		if (*slot) {
+			put_page(*slot);
+			*slot = NULL;
+			released++;
+		}
+	}
+	dev_dbg(dev, "%s: %d pages released\n", __func__, released);
+}
+
+/*
+ * Fault handler for mmap()ed video memory: resolve the faulting address to
+ * a page index relative to the start of the mapping and hand back the
+ * (possibly freshly allocated) page. Out-of-range faults and allocation
+ * failures raise SIGBUS.
+ */
+static vm_fault_t mbochs_region_vm_fault(struct vm_fault *vmf)
+{
+	struct mdev_state *mdev_state = vmf->vma->vm_private_data;
+	pgoff_t idx = (vmf->address - vmf->vma->vm_start) >> PAGE_SHIFT;
+
+	if (idx >= mdev_state->pagecount)
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = mbochs_get_page(mdev_state, idx);
+
+	return vmf->page ? 0 : VM_FAULT_SIGBUS;
+}
+
+/* VMA operations installed by mbochs_mmap(). */
+static const struct vm_operations_struct mbochs_region_vm_ops = {
+	.fault = mbochs_region_vm_fault,
+};
+
+/*
+ * vfio_device mmap callback. Only shared mappings that start exactly at the
+ * memory bar offset and are no larger than the video memory are accepted;
+ * pages are then provided lazily by the fault handler.
+ *
+ * Returns 0 on success, -EINVAL for any unsupported mapping request.
+ */
+static int mbochs_mmap(struct vfio_device *vdev, struct vm_area_struct *vma)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+
+	if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT ||
+	    vma->vm_end < vma->vm_start ||
+	    vma->vm_end - vma->vm_start > mdev_state->memsize ||
+	    (vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+
+	vma->vm_private_data = mdev_state;
+	vma->vm_ops = &mbochs_region_vm_ops;
+	return 0;
+}
+
+/*
+ * Fault handler for mmap()ed dma-bufs: return the page recorded for the
+ * faulting page offset with an extra reference. An offset beyond the buffer
+ * triggers a warning and SIGBUS.
+ */
+static vm_fault_t mbochs_dmabuf_vm_fault(struct vm_fault *vmf)
+{
+	struct mbochs_dmabuf *dmabuf = vmf->vma->vm_private_data;
+	struct page *pg;
+
+	if (WARN_ON(vmf->pgoff >= dmabuf->pagecount))
+		return VM_FAULT_SIGBUS;
+
+	pg = dmabuf->pages[vmf->pgoff];
+	vmf->page = pg;
+	get_page(pg);
+	return 0;
+}
+
+/* VMA operations installed by mbochs_mmap_dmabuf(). */
+static const struct vm_operations_struct mbochs_dmabuf_vm_ops = {
+	.fault = mbochs_dmabuf_vm_fault,
+};
+
+/*
+ * dma-buf mmap callback. Only shared mappings are supported; pages are
+ * provided by mbochs_dmabuf_vm_fault(). Returns 0 or -EINVAL.
+ */
+static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma)
+{
+	struct mbochs_dmabuf *dmabuf = buf->priv;
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+	bool shared = (vma->vm_flags & VM_SHARED) != 0;
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	if (!shared)
+		return -EINVAL;
+
+	vma->vm_private_data = dmabuf;
+	vma->vm_ops = &mbochs_dmabuf_vm_ops;
+	return 0;
+}
+
+/*
+ * Debug helper: log id, fourcc, geometry and size of @dmabuf, prefixed with
+ * @prefix. A zero format is printed as "----".
+ */
+static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf,
+				const char *prefix)
+{
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+	const struct mbochs_mode *mode = &dmabuf->mode;
+	u32 fourcc = mode->drm_format;
+	u8 cc[4];
+	int i;
+
+	/* Split the fourcc into its four characters, lowest byte first. */
+	for (i = 0; i < 4; i++)
+		cc[i] = fourcc ? ((fourcc >> (8 * i)) & 0xff) : '-';
+
+	dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n",
+		prefix, dmabuf->id,
+		cc[0], cc[1], cc[2], cc[3],
+		mode->width, mode->height, mode->stride,
+		mode->offset, mode->size, dmabuf->pagecount);
+}
+
+/*
+ * dma-buf map callback: build a scatter/gather table over the buffer's pages
+ * and DMA-map it for the attaching device.
+ *
+ * Returns the mapped table, or ERR_PTR(-ENOMEM) if any step fails (everything
+ * set up so far is undone).
+ */
+static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at,
+					  enum dma_data_direction direction)
+{
+	struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
+	struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
+	struct sg_table *table;
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return ERR_PTR(-ENOMEM);
+
+	if (sg_alloc_table_from_pages(table, dmabuf->pages, dmabuf->pagecount,
+				      0, dmabuf->mode.size, GFP_KERNEL) < 0) {
+		kfree(table);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (dma_map_sgtable(at->dev, table, direction, 0)) {
+		sg_free_table(table);
+		kfree(table);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return table;
+}
+
+/*
+ * dma-buf unmap callback: undo mbochs_map_dmabuf() by unmapping the table,
+ * releasing its entries and freeing the table itself.
+ */
+static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at,
+				struct sg_table *sg,
+				enum dma_data_direction direction)
+{
+	struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
+	struct mdev_state *mdev_state = dmabuf->mdev_state;
+	struct device *dev = mdev_dev(mdev_state->mdev);
+
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+	dma_unmap_sgtable(at->dev, sg, direction, 0);
+	sg_free_table(sg);
+	kfree(sg);
+}
+
+/*
+ * dma-buf release callback, invoked when the exported buffer goes away.
+ *
+ * Drops the page references held by the mbochs_dmabuf and clears its ->buf
+ * pointer. If the structure has already been marked as unlinked it is freed
+ * here; otherwise it stays allocated for its remaining owner.
+ *
+ * NOTE(review): the dmabuf->pages array itself is not freed in this function;
+ * confirm where that allocation is released.
+ */
+static void mbochs_release_dmabuf(struct dma_buf *buf)
+{
+ struct mbochs_dmabuf *dmabuf = buf->priv;
+ struct mdev_state *mdev_state = dmabuf->mdev_state;
+ struct device *dev = mdev_dev(mdev_state->mdev);
+ pgoff_t pg;
+
+ dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+
+ /* Give back the references taken when the dmabuf was allocated. */
+ for (pg = 0; pg < dmabuf->pagecount; pg++)
+ put_page(dmabuf->pages[pg]);
+
+ /* ->buf and ->unlinked are accessed under ops_lock. */
+ mutex_lock(&mdev_state->ops_lock);
+ dmabuf->buf = NULL;
+ if (dmabuf->unlinked)
+ kfree(dmabuf);
+ mutex_unlock(&mdev_state->ops_lock);
+}
+
+/* dma-buf exporter operations used for every buffer exported by this driver. */
+static struct dma_buf_ops mbochs_dmabuf_ops = {
+ .map_dma_buf = mbochs_map_dmabuf,
+ .unmap_dma_buf = mbochs_unmap_dmabuf,
+ .release = mbochs_release_dmabuf,
+ .mmap = mbochs_mmap_dmabuf,
+};
+
+/*
+ * Create a dmabuf descriptor for @mode and link it into the device's list.
+ *
+ * A reference is taken on every video memory page covered by the
+ * framebuffer (pages are allocated on demand). The caller must hold
+ * ops_lock. Returns the new descriptor, or NULL if memory is exhausted, in
+ * which case all references taken so far are dropped again.
+ */
+static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state,
+						 struct mbochs_mode *mode)
+{
+	struct mbochs_dmabuf *dmabuf;
+	pgoff_t first, n;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL);
+	if (!dmabuf)
+		return NULL;
+
+	dmabuf->mode = *mode;
+	dmabuf->id = mdev_state->next_id++;
+	dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE);
+	dmabuf->pages = kcalloc(dmabuf->pagecount, sizeof(struct page *),
+				GFP_KERNEL);
+	if (!dmabuf->pages) {
+		kfree(dmabuf);
+		return NULL;
+	}
+
+	/* Grab the video memory pages backing the framebuffer. */
+	first = dmabuf->mode.offset >> PAGE_SHIFT;
+	for (n = 0; n < dmabuf->pagecount; n++) {
+		dmabuf->pages[n] = __mbochs_get_page(mdev_state, first + n);
+		if (!dmabuf->pages[n])
+			goto err_put_pages;
+	}
+
+	dmabuf->mdev_state = mdev_state;
+	list_add(&dmabuf->next, &mdev_state->dmabufs);
+
+	mbochs_print_dmabuf(dmabuf, __func__);
+	return dmabuf;
+
+err_put_pages:
+	/* Release the pages obtained before the failing one. */
+	while (n--)
+		put_page(dmabuf->pages[n]);
+	kfree(dmabuf->pages);
+	kfree(dmabuf);
+	return NULL;
+}
+
+/*
+ * Look up the first dmabuf on the device's list whose mode equals @mode.
+ * The caller must hold ops_lock. Returns NULL if there is none.
+ */
+static struct mbochs_dmabuf *
+mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state,
+			   struct mbochs_mode *mode)
+{
+	struct mbochs_dmabuf *cur;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	list_for_each_entry(cur, &mdev_state->dmabufs, next) {
+		if (mbochs_modes_equal(&cur->mode, mode))
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up the dmabuf with identifier @id on the device's list.
+ * The caller must hold ops_lock. Returns NULL if there is none.
+ */
+static struct mbochs_dmabuf *
+mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id)
+{
+	struct mbochs_dmabuf *cur;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	list_for_each_entry(cur, &mdev_state->dmabufs, next) {
+		if (cur->id == id)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Export @dmabuf as a dma-buf and remember the result in dmabuf->buf.
+ *
+ * The caller must hold ops_lock. Returns 0 on success, -EINVAL if the
+ * framebuffer does not start on a page boundary, or the error reported by
+ * dma_buf_export().
+ */
+static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf)
+{
+	struct mdev_state *mdev_state = dmabuf->mdev_state;
+	struct device *dev = mdev_state->vdev.dev;
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct dma_buf *exported;
+
+	WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
+
+	if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) {
+		dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n",
+				     __func__);
+		return -EINVAL;
+	}
+
+	exp_info.priv = dmabuf;
+	exp_info.size = dmabuf->mode.size;
+	exp_info.ops = &mbochs_dmabuf_ops;
+
+	exported = dma_buf_export(&exp_info);
+	if (IS_ERR(exported)) {
+		dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n",
+				     __func__, PTR_ERR(exported));
+		return PTR_ERR(exported);
+	}
+
+	dmabuf->buf = exported;
+	dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
+	return 0;
+}
+
+/*
+ * Fill in offset, size and flags for the region selected by
+ * region_info->index.
+ *
+ * The config space, BAR 0 (video memory, mmap capable), BAR 2 (mmio
+ * registers) and the EDID region are implemented; all other valid indices
+ * are reported as empty. For the EDID region a region-type capability is
+ * appended to @caps.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range index, or the result of
+ * vfio_info_add_capability() for the EDID region.
+ */
+static int mbochs_ioctl_get_region_info(struct vfio_device *vdev,
+					struct vfio_region_info *region_info,
+					struct vfio_info_cap *caps)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	const u32 rw_flags = VFIO_REGION_INFO_FLAG_READ |
+			     VFIO_REGION_INFO_FLAG_WRITE;
+
+	if (region_info->index >= MBOCHS_NUM_REGIONS)
+		return -EINVAL;
+
+	/* Unimplemented regions are reported as empty. */
+	region_info->size = 0;
+	region_info->offset = 0;
+	region_info->flags = 0;
+
+	switch (region_info->index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		region_info->offset = 0;
+		region_info->size = MBOCHS_CONFIG_SPACE_SIZE;
+		region_info->flags = rw_flags;
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX:
+		region_info->offset = MBOCHS_MEMORY_BAR_OFFSET;
+		region_info->size = mdev_state->memsize;
+		region_info->flags = rw_flags | VFIO_REGION_INFO_FLAG_MMAP;
+		break;
+	case VFIO_PCI_BAR2_REGION_INDEX:
+		region_info->offset = MBOCHS_MMIO_BAR_OFFSET;
+		region_info->size = MBOCHS_MMIO_BAR_SIZE;
+		region_info->flags = rw_flags;
+		break;
+	case MBOCHS_EDID_REGION_INDEX: {
+		struct vfio_region_info_cap_type cap_type = {
+			.header.id = VFIO_REGION_INFO_CAP_TYPE,
+			.header.version = 1,
+			.type = VFIO_REGION_TYPE_GFX,
+			.subtype = VFIO_REGION_SUBTYPE_GFX_EDID,
+		};
+
+		region_info->offset = MBOCHS_EDID_OFFSET;
+		region_info->size = MBOCHS_EDID_SIZE;
+		region_info->flags = rw_flags | VFIO_REGION_INFO_FLAG_CAPS;
+
+		return vfio_info_add_capability(caps, &cap_type.header,
+						sizeof(cap_type));
+	}
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* Report interrupt information: every index is reported with zero interrupts. */
+static int mbochs_get_irq_info(struct vfio_irq_info *irq_info)
+{
+ irq_info->count = 0;
+ return 0;
+}
+
+/*
+ * Report device information: a PCI-flavoured vfio device with
+ * MBOCHS_NUM_REGIONS regions (the standard PCI ones plus the EDID region)
+ * and the standard number of PCI interrupt indices. Always returns 0.
+ */
+static int mbochs_get_device_info(struct vfio_device_info *dev_info)
+{
+ dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+ dev_info->num_regions = MBOCHS_NUM_REGIONS;
+ dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+ return 0;
+}
+
+/*
+ * Handle VFIO_DEVICE_QUERY_GFX_PLANE for dma-buf backed planes.
+ *
+ * A probe request succeeds only for exactly PROBE | DMABUF.  A real query
+ * must carry exactly the DMABUF flag.  For the primary plane the current
+ * framebuffer mode is validated and a matching dma-buf descriptor is looked
+ * up, or allocated on first use; its geometry and id are reported back.
+ * When the framebuffer check fails, or a non-primary plane is queried, the
+ * plane fields are zeroed and 0 is still returned.
+ *
+ * Returns 0 on success, -EINVAL for unsupported flags and -ENOMEM when no
+ * dma-buf descriptor could be obtained.
+ */
+static int mbochs_query_gfx_plane(struct mdev_state *mdev_state,
+				  struct vfio_device_gfx_plane_info *plane)
+{
+	struct mbochs_dmabuf *dmabuf;
+	struct mbochs_mode mode;
+	int ret;
+
+	if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
+		if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
+				     VFIO_GFX_PLANE_TYPE_DMABUF))
+			return 0;
+		return -EINVAL;
+	}
+
+	if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF)
+		return -EINVAL;
+
+	plane->drm_format_mod = 0;
+	plane->x_pos = 0;
+	plane->y_pos = 0;
+	plane->x_hot = 0;
+	plane->y_hot = 0;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	/* only the primary plane can have a framebuffer */
+	ret = -EINVAL;
+	if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY)
+		ret = mbochs_check_framebuffer(mdev_state, &mode);
+	if (ret < 0) {
+		plane->drm_format = 0;
+		plane->width = 0;
+		plane->height = 0;
+		plane->stride = 0;
+		plane->size = 0;
+		plane->dmabuf_id = 0;
+		goto done;
+	}
+
+	dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode);
+	if (!dmabuf) {
+		/*
+		 * Keep the newly allocated descriptor.  The result used to
+		 * be dropped, so the first query for every new mode failed
+		 * with -ENOMEM although the allocation had succeeded.
+		 */
+		dmabuf = mbochs_dmabuf_alloc(mdev_state, &mode);
+	}
+	if (!dmabuf) {
+		mutex_unlock(&mdev_state->ops_lock);
+		return -ENOMEM;
+	}
+
+	plane->drm_format = dmabuf->mode.drm_format;
+	plane->width = dmabuf->mode.width;
+	plane->height = dmabuf->mode.height;
+	plane->stride = dmabuf->mode.stride;
+	plane->size = dmabuf->mode.size;
+	plane->dmabuf_id = dmabuf->id;
+
+done:
+	/* remember which dma-buf currently backs the primary plane */
+	if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY &&
+	    mdev_state->active_id != plane->dmabuf_id) {
+		dev_dbg(mdev_state->vdev.dev, "%s: primary: %d => %d\n",
+			__func__, mdev_state->active_id, plane->dmabuf_id);
+		mdev_state->active_id = plane->dmabuf_id;
+	}
+	mutex_unlock(&mdev_state->ops_lock);
+	return 0;
+}
+
+/*
+ * Handle VFIO_DEVICE_GET_GFX_DMABUF: look up the dma-buf descriptor with
+ * the given id, export it on first use and return the result of
+ * dma_buf_fd() for it.
+ *
+ * Returns -ENOENT when no descriptor has that id and -EINVAL when the
+ * descriptor has no exported buffer after the export attempt.
+ */
+static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id)
+{
+ struct mbochs_dmabuf *dmabuf;
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id);
+ if (!dmabuf) {
+ mutex_unlock(&mdev_state->ops_lock);
+ return -ENOENT;
+ }
+
+ /*
+ * The return value of the export is not looked at; a failure is
+ * detected below through dmabuf->buf still being NULL.
+ */
+ if (!dmabuf->buf)
+ mbochs_dmabuf_export(dmabuf);
+
+ mutex_unlock(&mdev_state->ops_lock);
+
+ /* NOTE(review): dmabuf is dereferenced after ops_lock was dropped */
+ if (!dmabuf->buf)
+ return -EINVAL;
+
+ return dma_buf_fd(dmabuf->buf, 0);
+}
+
+/*
+ * vfio ioctl callback.
+ *
+ * The info-style commands follow one pattern: copy the first minsz bytes of
+ * the user structure in, reject it with -EINVAL when argsz is smaller than
+ * minsz, fill it in and copy the same minsz bytes back.  Copy failures
+ * return -EFAULT.  Commands not listed here return -ENOTTY.
+ */
+static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+ int ret = 0;
+ unsigned long minsz;
+
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mbochs_get_device_info(&info);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ /* the index must be one of the VFIO_PCI_NUM_IRQS indexes */
+ if ((info.argsz < minsz) ||
+ (info.index >= VFIO_PCI_NUM_IRQS))
+ return -EINVAL;
+
+ ret = mbochs_get_irq_info(&info);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_QUERY_GFX_PLANE:
+ {
+ /* zero-initialized, only minsz bytes come from user space */
+ struct vfio_device_gfx_plane_info plane = {};
+
+ minsz = offsetofend(struct vfio_device_gfx_plane_info,
+ region_index);
+
+ if (copy_from_user(&plane, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (plane.argsz < minsz)
+ return -EINVAL;
+
+ ret = mbochs_query_gfx_plane(mdev_state, &plane);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &plane, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_GET_GFX_DMABUF:
+ {
+ u32 dmabuf_id;
+
+ /* the argument points at a single 32-bit dma-buf id */
+ if (get_user(dmabuf_id, (__u32 __user *)arg))
+ return -EFAULT;
+
+ return mbochs_get_gfx_dmabuf(mdev_state, dmabuf_id);
+ }
+
+ case VFIO_DEVICE_SET_IRQS:
+ /* always rejected; the device reports zero interrupts */
+ return -EINVAL;
+
+ case VFIO_DEVICE_RESET:
+ return mbochs_reset(mdev_state);
+ }
+ return -ENOTTY;
+}
+
+/*
+ * vfio close_device callback: under ops_lock, unlink every dma-buf
+ * descriptor from the device and drop the device's pages via
+ * mbochs_put_pages().
+ *
+ * Descriptors without an exported buffer are freed right here.  Exported
+ * ones are only marked as unlinked, leaving the actual free to the dma-buf
+ * release path named in the comment below.
+ */
+static void mbochs_close_device(struct vfio_device *vdev)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+ struct mbochs_dmabuf *dmabuf, *tmp;
+
+ mutex_lock(&mdev_state->ops_lock);
+
+ /* _safe iteration: entries are removed while walking the list */
+ list_for_each_entry_safe(dmabuf, tmp, &mdev_state->dmabufs, next) {
+ list_del(&dmabuf->next);
+ if (dmabuf->buf) {
+ /* free in mbochs_release_dmabuf() */
+ dmabuf->unlinked = true;
+ } else {
+ kfree(dmabuf);
+ }
+ }
+ mbochs_put_pages(mdev_state);
+
+ mutex_unlock(&mdev_state->ops_lock);
+}
+
+/*
+ * sysfs "memory" attribute: prints the megabyte count of the device's type
+ * as "<n> MB".  Uses sysfs_emit(), the bounded helper intended for sysfs
+ * show() callbacks, instead of an unbounded sprintf().
+ */
+static ssize_t
+memory_show(struct device *dev, struct device_attribute *attr,
+	    char *buf)
+{
+	struct mdev_state *mdev_state = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%d MB\n", mdev_state->type->mbytes);
+}
+static DEVICE_ATTR_RO(memory);
+
+/* Per-device attributes; exposed in a sysfs group named "vendor". */
+static struct attribute *mdev_dev_attrs[] = {
+ &dev_attr_memory.attr,
+ NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+ .name = "vendor",
+ .attrs = mdev_dev_attrs,
+};
+
+/* NULL-terminated group list referenced by the driver's dev_groups. */
+static const struct attribute_group *mdev_dev_groups[] = {
+ &mdev_dev_group,
+ NULL,
+};
+
+/*
+ * mdev show_description callback: prints a one-line description containing
+ * the video memory size of the type.
+ *
+ * container_of() is plain pointer arithmetic on mtype and never produces
+ * NULL here, so the former NULL test on its result was dead code and has
+ * been dropped.  Output goes through the bounded sysfs_emit() helper.
+ */
+static ssize_t mbochs_show_description(struct mdev_type *mtype, char *buf)
+{
+	struct mbochs_type *type =
+		container_of(mtype, struct mbochs_type, type);
+
+	return sysfs_emit(buf, "virtual display, %d MB video memory\n",
+			  type->mbytes);
+}
+
+/*
+ * mdev get_available callback: how many more instances of this type fit
+ * into the remaining megabyte budget (mbochs_avail_mbytes / type size).
+ */
+static unsigned int mbochs_get_available(struct mdev_type *mtype)
+{
+ struct mbochs_type *type =
+ container_of(mtype, struct mbochs_type, type);
+
+ return atomic_read(&mbochs_avail_mbytes) / type->mbytes;
+}
+
+/*
+ * vfio callbacks of an mbochs device.  The iommufd hooks are not
+ * implemented by this driver; they point at the vfio_iommufd_emulated_*
+ * helpers.
+ */
+static const struct vfio_device_ops mbochs_dev_ops = {
+ .close_device = mbochs_close_device,
+ .init = mbochs_init_dev,
+ .release = mbochs_release_dev,
+ .read = mbochs_read,
+ .write = mbochs_write,
+ .ioctl = mbochs_ioctl,
+ .get_region_info_caps = mbochs_ioctl_get_region_info,
+ .mmap = mbochs_mmap,
+ .bind_iommufd = vfio_iommufd_emulated_bind,
+ .unbind_iommufd = vfio_iommufd_emulated_unbind,
+ .attach_ioas = vfio_iommufd_emulated_attach_ioas,
+ .detach_ioas = vfio_iommufd_emulated_detach_ioas,
+};
+
+/*
+ * mdev driver description: advertises the vfio-pci device API and hooks up
+ * probe/remove plus the per-type availability and description callbacks.
+ */
+static struct mdev_driver mbochs_driver = {
+ .device_api = VFIO_DEVICE_API_PCI_STRING,
+ .driver = {
+ .name = "mbochs",
+ .owner = THIS_MODULE,
+ .mod_name = KBUILD_MODNAME,
+ .dev_groups = mdev_dev_groups,
+ },
+ .probe = mbochs_probe,
+ .remove = mbochs_remove,
+ .get_available = mbochs_get_available,
+ .show_description = mbochs_show_description,
+};
+
+/* File operations of the character device; only the owner is set. */
+static const struct file_operations vd_fops = {
+ .owner = THIS_MODULE,
+};
+
+/* Device release callback; intentionally empty, it frees nothing. */
+static void mbochs_device_release(struct device *dev)
+{
+ /* nothing */
+}
+
+/*
+ * Module init: reserve a char device region and add the cdev, register the
+ * mdev driver, the class and the parent device, then register that device
+ * as mdev parent with the supported types.
+ *
+ * Every step is undone in reverse order when a later one fails.  The result
+ * of cdev_add() is now checked as well; it used to be ignored, so a failed
+ * cdev registration went unnoticed.
+ *
+ * Returns 0 on success or the error code of the failing step.
+ */
+static int __init mbochs_dev_init(void)
+{
+	int ret;
+
+	atomic_set(&mbochs_avail_mbytes, max_mbytes);
+
+	ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME);
+	if (ret < 0) {
+		pr_err("Error: failed to register mbochs_dev, err: %d\n", ret);
+		return ret;
+	}
+	cdev_init(&mbochs_cdev, &vd_fops);
+	ret = cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK + 1);
+	if (ret)
+		goto err_region;
+	pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt));
+
+	ret = mdev_register_driver(&mbochs_driver);
+	if (ret)
+		goto err_cdev;
+
+	ret = class_register(&mbochs_class);
+	if (ret)
+		goto err_driver;
+	mbochs_dev.class = &mbochs_class;
+	mbochs_dev.release = mbochs_device_release;
+	dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME);
+
+	ret = device_register(&mbochs_dev);
+	if (ret)
+		goto err_put;
+
+	ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver,
+				   mbochs_mdev_types,
+				   ARRAY_SIZE(mbochs_mdev_types));
+	if (ret)
+		goto err_device;
+
+	return 0;
+
+err_device:
+	device_del(&mbochs_dev);
+err_put:
+	/* also the required cleanup after a failed device_register() */
+	put_device(&mbochs_dev);
+	class_unregister(&mbochs_class);
+err_driver:
+	mdev_unregister_driver(&mbochs_driver);
+err_cdev:
+	cdev_del(&mbochs_cdev);
+err_region:
+	unregister_chrdev_region(mbochs_devt, MINORMASK + 1);
+	return ret;
+}
+
+/*
+ * Module exit: unregister the mdev parent first, then the parent device,
+ * the mdev driver, the cdev with its device number region, and the class.
+ */
+static void __exit mbochs_dev_exit(void)
+{
+ mbochs_dev.bus = NULL;
+ mdev_unregister_parent(&mbochs_parent);
+
+ device_unregister(&mbochs_dev);
+ mdev_unregister_driver(&mbochs_driver);
+ cdev_del(&mbochs_cdev);
+ unregister_chrdev_region(mbochs_devt, MINORMASK + 1);
+ class_unregister(&mbochs_class);
+}
+
+MODULE_IMPORT_NS("DMA_BUF");
+module_init(mbochs_dev_init)
+module_exit(mbochs_dev_exit)
diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h
new file mode 100644
index 000000000000..961c55ec3ffd
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-defs.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Simple pci display device.
+ *
+ * Framebuffer memory is pci bar 0.
+ * Configuration (read-only) is in pci config space.
+ * Format field uses drm fourcc codes.
+ * ATM only DRM_FORMAT_XRGB8888 is supported.
+ */
+
+/* pci ids (vendor/device and subsystem vendor/device) of the display */
+#define MDPY_PCI_VENDOR_ID PCI_VENDOR_ID_REDHAT
+#define MDPY_PCI_DEVICE_ID 0x000f
+#define MDPY_PCI_SUBVENDOR_ID PCI_SUBVENDOR_ID_REDHAT_QUMRANET
+#define MDPY_PCI_SUBDEVICE_ID PCI_SUBDEVICE_ID_QEMU
+
+/*
+ * pci cfg space offsets for fb config (dword): a vendor capability of
+ * MDPY_VENDORCAP_SIZE bytes at MDPY_VENDORCAP_OFFSET, with the format,
+ * width and height fields at +0x04, +0x08 and +0x0c inside it.
+ */
+#define MDPY_VENDORCAP_OFFSET 0x40
+#define MDPY_VENDORCAP_SIZE 0x10
+#define MDPY_FORMAT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x04)
+#define MDPY_WIDTH_OFFSET (MDPY_VENDORCAP_OFFSET + 0x08)
+#define MDPY_HEIGHT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x0c)
diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c
new file mode 100644
index 000000000000..149af7f598f8
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-fb.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Framebuffer driver for mdpy (mediated virtual pci display device).
+ *
+ * See mdpy-defs.h for device specs
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * Using some code snippets from simplefb and cirrusfb.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/errno.h>
+#include <linux/fb.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <drm/drm_fourcc.h>
+#include "mdpy-defs.h"
+
+/*
+ * Template for the fixed screen info: packed-pixel truecolor without
+ * acceleration.  The probe function copies it and fills in the memory
+ * range and line length.
+ */
+static const struct fb_fix_screeninfo mdpy_fb_fix = {
+ .id = "mdpy-fb",
+ .type = FB_TYPE_PACKED_PIXELS,
+ .visual = FB_VISUAL_TRUECOLOR,
+ .accel = FB_ACCEL_NONE,
+};
+
+/*
+ * Template for the variable screen info: 32 bits per pixel with 8-bit
+ * channels, blue at bit 0, green at 8, red at 16 and transp at 24.  The
+ * probe function copies it and fills in the resolution.
+ */
+static const struct fb_var_screeninfo mdpy_fb_var = {
+ /* NOTE(review): -1 presumably means "size unknown"; confirm */
+ .height = -1,
+ .width = -1,
+ .activate = FB_ACTIVATE_NOW,
+ .vmode = FB_VMODE_NONINTERLACED,
+
+ .bits_per_pixel = 32,
+ .transp.offset = 24,
+ .red.offset = 16,
+ .green.offset = 8,
+ .blue.offset = 0,
+ .transp.length = 8,
+ .red.length = 8,
+ .green.length = 8,
+ .blue.length = 8,
+};
+
+/* number of entries in the pseudo palette */
+#define PSEUDO_PALETTE_SIZE 16
+
+/* Driver private data of a framebuffer: storage for the pseudo palette. */
+struct mdpy_fb_par {
+ u32 palette[PSEUDO_PALETTE_SIZE];
+};
+
+/*
+ * fb_setcolreg callback: store one pseudo palette entry.
+ *
+ * Each 16-bit colour component is reduced to the channel width given in
+ * info->var and shifted to the channel's bit offset.  When the format has
+ * a transp channel, all of its bits are set; the transp argument itself is
+ * not used.
+ *
+ * Returns 0, or -EINVAL when regno is outside the pseudo palette.
+ */
+static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
+			     u_int transp, struct fb_info *info)
+{
+	const struct fb_var_screeninfo *var = &info->var;
+	u32 *palette = info->pseudo_palette;
+	u32 pixel;
+
+	if (regno >= PSEUDO_PALETTE_SIZE)
+		return -EINVAL;
+
+	pixel = (u32)(red >> (16 - var->red.length)) << var->red.offset;
+	pixel |= (u32)(green >> (16 - var->green.length)) << var->green.offset;
+	pixel |= (u32)(blue >> (16 - var->blue.length)) << var->blue.offset;
+
+	if (var->transp.length > 0) {
+		u32 opaque = (1 << var->transp.length) - 1;
+
+		pixel |= opaque << var->transp.offset;
+	}
+
+	palette[regno] = pixel;
+	return 0;
+}
+
+/* fb_destroy callback: unmap the framebuffer mapping if there is one. */
+static void mdpy_fb_destroy(struct fb_info *info)
+{
+ if (info->screen_base)
+ iounmap(info->screen_base);
+}
+
+/*
+ * Framebuffer operations: the default I/O memory ops plus the driver's own
+ * destroy and palette callbacks.
+ */
+static const struct fb_ops mdpy_fb_ops = {
+ .owner = THIS_MODULE,
+ FB_DEFAULT_IOMEM_OPS,
+ .fb_destroy = mdpy_fb_destroy,
+ .fb_setcolreg = mdpy_fb_setcolreg,
+};
+
+/*
+ * PCI probe: enable the device, claim its regions, read format, width and
+ * height from config space, validate them, then allocate, map and register
+ * a framebuffer backed by BAR 0.
+ *
+ * Returns 0 on success.  On failure everything acquired so far is released
+ * in reverse order and a negative error code is returned: -EINVAL for an
+ * unsupported format or out-of-range geometry, -ENOMEM when the fb_info
+ * allocation fails, -EIO when the BAR cannot be mapped, or the error of the
+ * failing PCI / framebuffer call.
+ */
+static int mdpy_fb_probe(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct fb_info *info;
+ struct mdpy_fb_par *par;
+ u32 format, width, height;
+ int ret;
+
+ ret = pci_enable_device(pdev);
+ if (ret < 0)
+ return ret;
+
+ ret = pci_request_regions(pdev, "mdpy-fb");
+ if (ret < 0)
+ goto err_disable_dev;
+
+ /* the return values of the config space reads are not checked */
+ pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format);
+ pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width);
+ pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height);
+ if (format != DRM_FORMAT_XRGB8888) {
+ pci_err(pdev, "format mismatch (0x%x != 0x%x)\n",
+ format, DRM_FORMAT_XRGB8888);
+ ret = -EINVAL;
+ goto err_release_regions;
+ }
+ if (width < 100 || width > 10000) {
+ pci_err(pdev, "width (%d) out of range\n", width);
+ ret = -EINVAL;
+ goto err_release_regions;
+ }
+ if (height < 100 || height > 10000) {
+ pci_err(pdev, "height (%d) out of range\n", height);
+ ret = -EINVAL;
+ goto err_release_regions;
+ }
+ pci_info(pdev, "mdpy found: %dx%d framebuffer\n",
+ width, height);
+
+ info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev);
+ if (!info) {
+ ret = -ENOMEM;
+ goto err_release_regions;
+ }
+ pci_set_drvdata(pdev, info);
+ par = info->par;
+
+ /* the framebuffer memory is BAR 0; 4 bytes per pixel */
+ info->fix = mdpy_fb_fix;
+ info->fix.smem_start = pci_resource_start(pdev, 0);
+ info->fix.smem_len = pci_resource_len(pdev, 0);
+ info->fix.line_length = width * 4;
+
+ info->var = mdpy_fb_var;
+ info->var.xres = width;
+ info->var.yres = height;
+ info->var.xres_virtual = width;
+ info->var.yres_virtual = height;
+
+ info->screen_size = info->fix.smem_len;
+ info->screen_base = ioremap(info->fix.smem_start,
+ info->screen_size);
+ if (!info->screen_base) {
+ pci_err(pdev, "ioremap(pcibar) failed\n");
+ ret = -EIO;
+ goto err_release_fb;
+ }
+
+ info->fbops = &mdpy_fb_ops;
+ info->pseudo_palette = par->palette;
+
+ ret = register_framebuffer(info);
+ if (ret < 0) {
+ pci_err(pdev, "mdpy-fb device register failed: %d\n", ret);
+ goto err_unmap;
+ }
+
+ pci_info(pdev, "fb%d registered\n", info->node);
+ return 0;
+
+ /* unwind in reverse order of acquisition */
+err_unmap:
+ iounmap(info->screen_base);
+
+err_release_fb:
+ framebuffer_release(info);
+
+err_release_regions:
+ pci_release_regions(pdev);
+
+err_disable_dev:
+ pci_disable_device(pdev);
+
+ return ret;
+}
+
+/*
+ * PCI remove: unregister the framebuffer and release, in reverse order,
+ * what the probe function acquired.
+ *
+ * NOTE(review): mdpy_fb_destroy() also unmaps info->screen_base.  If
+ * unregister_framebuffer() ends up invoking the fb_destroy callback, the
+ * iounmap() below would unmap the same mapping a second time - confirm.
+ */
+static void mdpy_fb_remove(struct pci_dev *pdev)
+{
+ struct fb_info *info = pci_get_drvdata(pdev);
+
+ unregister_framebuffer(info);
+ iounmap(info->screen_base);
+ framebuffer_release(info);
+ pci_release_regions(pdev);
+ pci_disable_device(pdev);
+}
+
+/*
+ * PCI ids this driver binds to: the single mdpy display device, matched on
+ * vendor, device and both subsystem ids.  The table is never modified, so
+ * it is const.
+ */
+static const struct pci_device_id mdpy_fb_pci_table[] = {
+	{
+		.vendor    = MDPY_PCI_VENDOR_ID,
+		.device    = MDPY_PCI_DEVICE_ID,
+		.subvendor = MDPY_PCI_SUBVENDOR_ID,
+		.subdevice = MDPY_PCI_SUBDEVICE_ID,
+	}, {
+		/* end of list */
+	}
+};
+
+/* PCI driver glue: id table plus the probe and remove callbacks. */
+static struct pci_driver mdpy_fb_pci_driver = {
+ .name = "mdpy-fb",
+ .id_table = mdpy_fb_pci_table,
+ .probe = mdpy_fb_probe,
+ .remove = mdpy_fb_remove,
+};
+
+/* Module init: register the PCI driver and hand its result straight back. */
+static int __init mdpy_fb_init(void)
+{
+	return pci_register_driver(&mdpy_fb_pci_driver);
+}
+
+module_init(mdpy_fb_init);
+
+MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table);
+MODULE_DESCRIPTION("Framebuffer driver for mdpy (mediated virtual pci display device)");
+MODULE_LICENSE("GPL v2");
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
new file mode 100644
index 000000000000..0759bd68edca
--- /dev/null
+++ b/samples/vfio-mdev/mdpy.c
@@ -0,0 +1,743 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Mediated virtual PCI display host device driver
+ *
+ * See mdpy-defs.h for device specs
+ *
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * based on mtty driver which is:
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/cdev.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <drm/drm_fourcc.h>
+#include "mdpy-defs.h"
+
+#define MDPY_NAME "mdpy"
+#define MDPY_CLASS_NAME "mdpy"
+
+#define MDPY_CONFIG_SPACE_SIZE 0xff
+#define MDPY_MEMORY_BAR_OFFSET PAGE_SIZE
+#define MDPY_DISPLAY_REGION 16
+
+/*
+ * Store a 16/32-bit value at addr.  Arguments are parenthesized so that
+ * expressions can be passed safely.  Despite the names no byte swapping is
+ * done; the value is stored in host byte order.
+ */
+#define STORE_LE16(addr, val) (*(u16 *)(addr) = (val))
+#define STORE_LE32(addr, val) (*(u32 *)(addr) = (val))
+
+
+MODULE_DESCRIPTION("Mediated virtual PCI display host device driver");
+MODULE_LICENSE("GPL v2");
+
+#define MDPY_TYPE_1 "vga"
+#define MDPY_TYPE_2 "xga"
+#define MDPY_TYPE_3 "hd"
+
+/*
+ * Supported device types.  Each one is an XRGB8888 display (4 bytes per
+ * pixel) with a fixed resolution: 640x480, 1024x768 or 1920x1080.
+ */
+static struct mdpy_type {
+ struct mdev_type type;
+ u32 format;
+ u32 bytepp;
+ u32 width;
+ u32 height;
+} mdpy_types[] = {
+ {
+ .type.sysfs_name = MDPY_TYPE_1,
+ .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_1,
+ .format = DRM_FORMAT_XRGB8888,
+ .bytepp = 4,
+ .width = 640,
+ .height = 480,
+ }, {
+ .type.sysfs_name = MDPY_TYPE_2,
+ .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_2,
+ .format = DRM_FORMAT_XRGB8888,
+ .bytepp = 4,
+ .width = 1024,
+ .height = 768,
+ }, {
+ .type.sysfs_name = MDPY_TYPE_3,
+ .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_3,
+ .format = DRM_FORMAT_XRGB8888,
+ .bytepp = 4,
+ .width = 1920,
+ .height = 1080,
+ },
+};
+
+/* Pointers to the embedded mdev_type of every entry in mdpy_types[]. */
+static struct mdev_type *mdpy_mdev_types[] = {
+ &mdpy_types[0].type,
+ &mdpy_types[1].type,
+ &mdpy_types[2].type,
+};
+
+/* Module-wide objects set up in mdpy_dev_init(). */
+static dev_t mdpy_devt;
+static const struct class mdpy_class = {
+ .name = MDPY_CLASS_NAME,
+};
+static struct cdev mdpy_cdev;
+static struct device mdpy_dev;
+static struct mdev_parent mdpy_parent;
+/* forward declaration; the definition follows the callbacks */
+static const struct vfio_device_ops mdpy_dev_ops;
+
+/*
+ * State of each mdev device.
+ *
+ * vconfig holds the emulated PCI config space and memblk the framebuffer
+ * memory of memsize bytes.  ops_lock is taken around every access made
+ * through mdev_access().  dev_info caches the structure built by the last
+ * VFIO_DEVICE_GET_INFO ioctl.
+ */
+struct mdev_state {
+ struct vfio_device vdev;
+ u8 *vconfig;
+ u32 bar_mask;
+ struct mutex ops_lock;
+ struct mdev_device *mdev;
+ struct vfio_device_info dev_info;
+
+ const struct mdpy_type *type;
+ u32 memsize;
+ void *memblk;
+};
+
+/*
+ * Populate the emulated PCI config space: ids, command/status, class code,
+ * a 32-bit prefetchable memory BAR 0, and the vendor specific capability
+ * that carries format, width and height of the framebuffer.  Also computes
+ * bar_mask from memsize, so memsize and type must be set beforehand.
+ */
+static void mdpy_create_config_space(struct mdev_state *mdev_state)
+{
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
+ MDPY_PCI_VENDOR_ID);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
+ MDPY_PCI_DEVICE_ID);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
+ MDPY_PCI_SUBVENDOR_ID);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
+ MDPY_PCI_SUBDEVICE_ID);
+
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
+ PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_STATUS],
+ PCI_STATUS_CAP_LIST);
+ STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
+ PCI_CLASS_DISPLAY_OTHER);
+ mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01;
+
+ STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_32 |
+ PCI_BASE_ADDRESS_MEM_PREFETCH);
+ /* two's complement of memsize: the value a BAR sizing probe reads back */
+ mdev_state->bar_mask = ~(mdev_state->memsize) + 1;
+
+ /* vendor specific capability for the config registers */
+ mdev_state->vconfig[PCI_CAPABILITY_LIST] = MDPY_VENDORCAP_OFFSET;
+ mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 0] = 0x09; /* vendor cap */
+ mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 1] = 0x00; /* next ptr */
+ mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 2] = MDPY_VENDORCAP_SIZE;
+ STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_FORMAT_OFFSET],
+ mdev_state->type->format);
+ STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_WIDTH_OFFSET],
+ mdev_state->type->width);
+ STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_HEIGHT_OFFSET],
+ mdev_state->type->height);
+}
+
+/*
+ * Emulate a write to PCI config space.  Only BAR 0 is writable; writes to
+ * any other offset are silently ignored.
+ *
+ * Writing 0xffffffff is the BAR sizing probe and stores the size mask.
+ * Any other value stores the new address.  In both cases the low flag bits
+ * of the register are kept.
+ *
+ * @buf holds @count bytes.  The BAR is emulated for full dword writes
+ * only; narrower writes are ignored, because reading a u32 from a 1- or
+ * 2-byte buffer would access memory beyond what the caller passed in.
+ */
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+				 char *buf, u32 count)
+{
+	struct device *dev = mdev_dev(mdev_state->mdev);
+	u32 cfg_addr;
+
+	switch (offset) {
+	case PCI_BASE_ADDRESS_0:
+		if (count != sizeof(cfg_addr))
+			break;
+
+		cfg_addr = *(u32 *)buf;
+
+		if (cfg_addr == 0xffffffff) {
+			cfg_addr = (cfg_addr & mdev_state->bar_mask);
+		} else {
+			cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
+			if (cfg_addr)
+				dev_info(dev, "BAR0 @ 0x%x\n", cfg_addr);
+		}
+
+		/* preserve the read-only type/prefetch bits of the BAR */
+		cfg_addr |= (mdev_state->vconfig[offset] &
+			     ~PCI_BASE_ADDRESS_MEM_MASK);
+		STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
+		break;
+	}
+}
+
+/*
+ * Perform one read or write of @count bytes at region offset @pos, with
+ * ops_lock held for the duration of the access.
+ *
+ * Offsets below MDPY_CONFIG_SPACE_SIZE address the emulated PCI config
+ * space.  Offsets inside [MDPY_MEMORY_BAR_OFFSET, MDPY_MEMORY_BAR_OFFSET +
+ * memsize) address the framebuffer memory.  Anything else is logged and
+ * rejected.
+ *
+ * Two defects are fixed here:
+ *  - framebuffer accesses now honour the offset inside the BAR; they used
+ *    to read and write the start of the memory block whatever @pos was;
+ *  - config space reads are clamped to the size of vconfig, bytes beyond
+ *    its end read back as zero instead of running past the allocation.
+ *
+ * Returns @count on success and -1 for an unhandled offset.
+ */
+static ssize_t mdev_access(struct mdev_state *mdev_state, char *buf,
+			   size_t count, loff_t pos, bool is_write)
+{
+	int ret = 0;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	if (pos >= 0 && pos < MDPY_CONFIG_SPACE_SIZE) {
+		if (is_write) {
+			handle_pci_cfg_write(mdev_state, pos, buf, count);
+		} else {
+			size_t avail = min_t(size_t, count,
+					     MDPY_CONFIG_SPACE_SIZE - pos);
+
+			memset(buf, 0, count);
+			memcpy(buf, mdev_state->vconfig + pos, avail);
+		}
+
+	} else if ((pos >= MDPY_MEMORY_BAR_OFFSET) &&
+		   (pos + count <=
+		    MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize)) {
+		/* translate the region offset into an offset inside the BAR */
+		pos -= MDPY_MEMORY_BAR_OFFSET;
+		if (is_write)
+			memcpy(mdev_state->memblk + pos, buf, count);
+		else
+			memcpy(buf, mdev_state->memblk + pos, count);
+
+	} else {
+		dev_info(mdev_state->vdev.dev,
+			 "%s: %s @0x%llx (unhandled)\n", __func__,
+			 is_write ? "WR" : "RD", pos);
+		ret = -1;
+		goto accessfailed;
+	}
+
+	ret = count;
+
+
+accessfailed:
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return ret;
+}
+
+/*
+ * Reset the device by repainting the framebuffer: each scanline is filled
+ * with one byte value that grows with the line number, which gives a gray
+ * gradient from top to bottom.  Always returns 0.
+ */
+static int mdpy_reset(struct mdev_state *mdev_state)
+{
+	const struct mdpy_type *type = mdev_state->type;
+	u32 stride = type->width * type->bytepp;
+	u32 row;
+
+	/* initialize with gray gradient */
+	for (row = 0; row < type->height; row++) {
+		void *line = mdev_state->memblk + row * stride;
+
+		memset(line, row * 255 / type->height, stride);
+	}
+
+	return 0;
+}
+
+/*
+ * vfio init callback: allocate the emulated config space and the
+ * framebuffer memory for the device's type, then initialize both.
+ *
+ * The framebuffer size is width * height * bytes-per-pixel rounded up to a
+ * power of two, which is also the size bar_mask is derived from.
+ *
+ * Returns 0 on success or -ENOMEM when an allocation fails.
+ */
+static int mdpy_init_dev(struct vfio_device *vdev)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+ struct mdev_device *mdev = to_mdev_device(vdev->dev);
+ const struct mdpy_type *type =
+ container_of(mdev->type, struct mdpy_type, type);
+ u32 fbsize;
+ int ret = -ENOMEM;
+
+ mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL);
+ if (!mdev_state->vconfig)
+ return ret;
+
+ fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp);
+
+ /* vmalloc_user(): this memory is later mapped in mdpy_mmap() */
+ mdev_state->memblk = vmalloc_user(fbsize);
+ if (!mdev_state->memblk)
+ goto out_vconfig;
+
+ mutex_init(&mdev_state->ops_lock);
+ mdev_state->mdev = mdev;
+ mdev_state->type = type;
+ mdev_state->memsize = fbsize;
+ mdpy_create_config_space(mdev_state);
+ mdpy_reset(mdev_state);
+
+ dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->type.pretty_name,
+ type->width, type->height);
+ return 0;
+
+out_vconfig:
+ kfree(mdev_state->vconfig);
+ return ret;
+}
+
+/*
+ * mdev probe: allocate the vfio device that embeds the mdev_state,
+ * register it as an emulated-IOMMU vfio device and store the state as
+ * driver data of the mdev.
+ *
+ * Returns 0 on success or the error of the failing step; on a registration
+ * failure the device reference is dropped again.
+ */
+static int mdpy_probe(struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state;
+ int ret;
+
+ mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev,
+ &mdpy_dev_ops);
+ if (IS_ERR(mdev_state))
+ return PTR_ERR(mdev_state);
+
+ ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev);
+ if (ret)
+ goto err_put_vdev;
+ dev_set_drvdata(&mdev->dev, mdev_state);
+ return 0;
+
+err_put_vdev:
+ vfio_put_device(&mdev_state->vdev);
+ return ret;
+}
+
+/*
+ * vfio release callback: free the framebuffer memory and the config space
+ * that mdpy_init_dev() allocated.
+ */
+static void mdpy_release_dev(struct vfio_device *vdev)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+
+ vfree(mdev_state->memblk);
+ kfree(mdev_state->vconfig);
+}
+
+/*
+ * mdev remove: unregister the vfio device and drop the reference that was
+ * taken when the device was allocated in mdpy_probe().
+ */
+static void mdpy_remove(struct mdev_device *mdev)
+{
+ struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev);
+
+ dev_info(&mdev->dev, "%s\n", __func__);
+
+ vfio_unregister_group_dev(&mdev_state->vdev);
+ vfio_put_device(&mdev_state->vdev);
+}
+
+/*
+ * vfio read callback: copy @count bytes starting at *@ppos to user space.
+ *
+ * The request is split into accesses of 4, 2 or 1 bytes; each step uses
+ * the largest size that still fits into the remaining count and is
+ * naturally aligned at the current position.  *@ppos advances by the
+ * number of bytes transferred.
+ *
+ * Returns the number of bytes read, or -EFAULT when an access is rejected
+ * or the copy to user space fails.
+ */
+static ssize_t mdpy_read(struct vfio_device *vdev, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	unsigned int done = 0;
+
+	while (count) {
+		size_t chunk;
+		u32 val;
+
+		if (count >= 4 && !(*ppos % 4))
+			chunk = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			chunk = 2;
+		else
+			chunk = 1;
+
+		/* only the first chunk bytes of val are filled and copied */
+		if (mdev_access(mdev_state, (char *)&val, chunk, *ppos,
+				false) <= 0)
+			return -EFAULT;
+
+		if (copy_to_user(buf, &val, chunk))
+			return -EFAULT;
+
+		count -= chunk;
+		done += chunk;
+		*ppos += chunk;
+		buf += chunk;
+	}
+
+	return done;
+}
+
+/*
+ * vfio write callback: copy @count bytes from user space to the device,
+ * starting at *@ppos.
+ *
+ * The request is split into accesses of 4, 2 or 1 bytes; each step uses
+ * the largest size that still fits into the remaining count and is
+ * naturally aligned at the current position.  *@ppos advances by the
+ * number of bytes transferred.
+ *
+ * Every chunk goes through one zeroed u32 bounce buffer.  Narrow chunks
+ * used to live in a u8/u16 stack variable, although the config space write
+ * handler may read its buffer as a full u32; that read went beyond the
+ * variable.
+ *
+ * Returns the number of bytes written, or -EFAULT when the copy from user
+ * space fails or an access is rejected.
+ */
+static ssize_t mdpy_write(struct vfio_device *vdev, const char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	unsigned int done = 0;
+
+	while (count) {
+		size_t chunk;
+		u32 val = 0;
+
+		if (count >= 4 && !(*ppos % 4))
+			chunk = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			chunk = 2;
+		else
+			chunk = 1;
+
+		if (copy_from_user(&val, buf, chunk))
+			return -EFAULT;
+
+		if (mdev_access(mdev_state, (char *)&val, chunk, *ppos,
+				true) <= 0)
+			return -EFAULT;
+
+		count -= chunk;
+		done += chunk;
+		*ppos += chunk;
+		buf += chunk;
+	}
+
+	return done;
+}
+
+/*
+ * vfio mmap callback: map the framebuffer memory into user space.
+ *
+ * The mapping has to start at the page offset of the memory BAR, must not
+ * be larger than the framebuffer memory and has to be shared; everything
+ * else is rejected with -EINVAL.  Otherwise the result of
+ * remap_vmalloc_range() is returned.
+ */
+static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	const unsigned long bar_pgoff = MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT;
+
+	if (vma->vm_pgoff != bar_pgoff || vma->vm_end < vma->vm_start)
+		return -EINVAL;
+
+	if (vma->vm_end - vma->vm_start > mdev_state->memsize ||
+	    !(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	return remap_vmalloc_range(vma, mdev_state->memblk, 0);
+}
+
+/*
+ * Fill in offset, size and flags for the region selected by
+ * region_info->index.
+ *
+ * Accepted are the standard vfio-pci indexes plus MDPY_DISPLAY_REGION; any
+ * other index returns -EINVAL.  BAR 0 and the display region describe the
+ * same mmap-able framebuffer memory.  Accepted indexes without a dedicated
+ * case are reported as empty regions.  Returns 0 otherwise.
+ */
+static int mdpy_ioctl_get_region_info(struct vfio_device *vdev,
+ struct vfio_region_info *region_info,
+ struct vfio_info_cap *caps)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+
+ if (region_info->index >= VFIO_PCI_NUM_REGIONS &&
+ region_info->index != MDPY_DISPLAY_REGION)
+ return -EINVAL;
+
+ switch (region_info->index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ region_info->offset = 0;
+ region_info->size = MDPY_CONFIG_SPACE_SIZE;
+ region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE);
+ break;
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ case MDPY_DISPLAY_REGION:
+ /* both indexes alias the framebuffer memory */
+ region_info->offset = MDPY_MEMORY_BAR_OFFSET;
+ region_info->size = mdev_state->memsize;
+ region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP);
+ break;
+ default:
+ region_info->size = 0;
+ region_info->offset = 0;
+ region_info->flags = 0;
+ }
+
+ return 0;
+}
+
+/* Report zero interrupts for the queried index; always returns 0. */
+static int mdpy_get_irq_info(struct vfio_irq_info *irq_info)
+{
+ irq_info->count = 0;
+ return 0;
+}
+
+/*
+ * Describe the device as a PCI device with VFIO_PCI_NUM_REGIONS regions
+ * and VFIO_PCI_NUM_IRQS interrupt indexes; always returns 0.
+ */
+static int mdpy_get_device_info(struct vfio_device_info *dev_info)
+{
+ dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+ dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+ dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+ return 0;
+}
+
+/*
+ * Handle VFIO_DEVICE_QUERY_GFX_PLANE for region backed planes.
+ *
+ * A probe request succeeds only for exactly PROBE | REGION.  A real query
+ * must carry exactly the REGION flag and is answered with the fixed
+ * geometry of the device type and MDPY_DISPLAY_REGION as region index.
+ *
+ * Returns 0 on success and -EINVAL for unsupported flags.
+ */
+static int mdpy_query_gfx_plane(struct mdev_state *mdev_state,
+ struct vfio_device_gfx_plane_info *plane)
+{
+ if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
+ if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
+ VFIO_GFX_PLANE_TYPE_REGION))
+ return 0;
+ return -EINVAL;
+ }
+
+ if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION)
+ return -EINVAL;
+
+ plane->drm_format = mdev_state->type->format;
+ plane->width = mdev_state->type->width;
+ plane->height = mdev_state->type->height;
+ plane->stride = (mdev_state->type->width *
+ mdev_state->type->bytepp);
+ plane->size = mdev_state->memsize;
+ plane->region_index = MDPY_DISPLAY_REGION;
+
+ /* unused */
+ plane->drm_format_mod = 0;
+ plane->x_pos = 0;
+ plane->y_pos = 0;
+ plane->x_hot = 0;
+ plane->y_hot = 0;
+
+ return 0;
+}
+
+/*
+ * vfio ioctl callback.
+ *
+ * The info-style commands follow one pattern: copy the first minsz bytes of
+ * the user structure in, reject it with -EINVAL when argsz is smaller than
+ * minsz, fill it in and copy the same minsz bytes back.  Copy failures
+ * return -EFAULT.  Commands not listed here return -ENOTTY.
+ *
+ * Two defects are fixed here:
+ *  - VFIO_DEVICE_GET_IRQ_INFO validated the index against the num_irqs
+ *    value cached by a previous VFIO_DEVICE_GET_INFO call.  Without such a
+ *    call the cached value is zero and every index was rejected.  The index
+ *    is now checked against VFIO_PCI_NUM_IRQS, the value that
+ *    mdpy_get_device_info() reports.
+ *  - the vfio_device_info on the stack is zero-initialized, so the copy
+ *    cached in the device state no longer contains stack garbage in the
+ *    fields beyond minsz.
+ */
+static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd,
+		       unsigned long arg)
+{
+	int ret = 0;
+	unsigned long minsz;
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+	{
+		struct vfio_device_info info = {};
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		ret = mdpy_get_device_info(&info);
+		if (ret)
+			return ret;
+
+		/* keep a copy of what was reported to user space */
+		memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if ((info.argsz < minsz) ||
+		    (info.index >= VFIO_PCI_NUM_IRQS))
+			return -EINVAL;
+
+		ret = mdpy_get_irq_info(&info);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_QUERY_GFX_PLANE:
+	{
+		struct vfio_device_gfx_plane_info plane = {};
+
+		minsz = offsetofend(struct vfio_device_gfx_plane_info,
+				    region_index);
+
+		if (copy_from_user(&plane, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (plane.argsz < minsz)
+			return -EINVAL;
+
+		ret = mdpy_query_gfx_plane(mdev_state, &plane);
+		if (ret)
+			return ret;
+
+		if (copy_to_user((void __user *)arg, &plane, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case VFIO_DEVICE_SET_IRQS:
+		/* always rejected; the device reports zero interrupts */
+		return -EINVAL;
+
+	case VFIO_DEVICE_RESET:
+		return mdpy_reset(mdev_state);
+	}
+	return -ENOTTY;
+}
+
+/*
+ * sysfs "resolution" attribute: prints the resolution of the device's type
+ * as "<width>x<height>".  Uses sysfs_emit(), the bounded helper intended
+ * for sysfs show() callbacks, instead of an unbounded sprintf().
+ */
+static ssize_t
+resolution_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct mdev_state *mdev_state = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%dx%d\n",
+			  mdev_state->type->width,
+			  mdev_state->type->height);
+}
+static DEVICE_ATTR_RO(resolution);
+
+/* Per-device attributes; exposed in a sysfs group named "vendor". */
+static struct attribute *mdev_dev_attrs[] = {
+ &dev_attr_resolution.attr,
+ NULL,
+};
+
+static const struct attribute_group mdev_dev_group = {
+ .name = "vendor",
+ .attrs = mdev_dev_attrs,
+};
+
+/* NULL-terminated group list referenced by the driver's dev_groups. */
+static const struct attribute_group *mdev_dev_groups[] = {
+ &mdev_dev_group,
+ NULL,
+};
+
+/*
+ * mdev show_description callback: prints a one-line description containing
+ * the framebuffer resolution of the type.  Output goes through the bounded
+ * sysfs_emit() helper instead of sprintf().
+ */
+static ssize_t mdpy_show_description(struct mdev_type *mtype, char *buf)
+{
+	struct mdpy_type *type = container_of(mtype, struct mdpy_type, type);
+
+	return sysfs_emit(buf, "virtual display, %dx%d framebuffer\n",
+			  type->width, type->height);
+}
+
+/*
+ * vfio callbacks of an mdpy device.  The iommufd hooks are not implemented
+ * by this driver; they point at the vfio_iommufd_emulated_* helpers.
+ */
+static const struct vfio_device_ops mdpy_dev_ops = {
+ .init = mdpy_init_dev,
+ .release = mdpy_release_dev,
+ .read = mdpy_read,
+ .write = mdpy_write,
+ .ioctl = mdpy_ioctl,
+ .get_region_info_caps = mdpy_ioctl_get_region_info,
+ .mmap = mdpy_mmap,
+ .bind_iommufd = vfio_iommufd_emulated_bind,
+ .unbind_iommufd = vfio_iommufd_emulated_unbind,
+ .attach_ioas = vfio_iommufd_emulated_attach_ioas,
+ .detach_ioas = vfio_iommufd_emulated_detach_ioas,
+};
+
+/*
+ * mdev driver description: advertises the vfio-pci device API and hooks up
+ * probe/remove and the description callback.  max_instances defaults to 4
+ * and is exposed as the module parameter "count".
+ */
+static struct mdev_driver mdpy_driver = {
+ .device_api = VFIO_DEVICE_API_PCI_STRING,
+ .max_instances = 4,
+ .driver = {
+ .name = "mdpy",
+ .owner = THIS_MODULE,
+ .mod_name = KBUILD_MODNAME,
+ .dev_groups = mdev_dev_groups,
+ },
+ .probe = mdpy_probe,
+ .remove = mdpy_remove,
+ .show_description = mdpy_show_description,
+};
+
+/* File operations of the character device; only the owner is set. */
+static const struct file_operations vd_fops = {
+ .owner = THIS_MODULE,
+};
+
+/* Device release callback; intentionally empty, it frees nothing. */
+static void mdpy_device_release(struct device *dev)
+{
+ /* nothing */
+}
+
+/*
+ * Module init: reserve a char device region and add the cdev, register the
+ * mdev driver, the class and the parent device, then register that device
+ * as mdev parent with the supported types.
+ *
+ * Every step is undone in reverse order when a later one fails.  The result
+ * of cdev_add() is now checked as well; it used to be ignored, so a failed
+ * cdev registration went unnoticed.
+ *
+ * Returns 0 on success or the error code of the failing step.
+ */
+static int __init mdpy_dev_init(void)
+{
+	int ret;
+
+	ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK + 1, MDPY_NAME);
+	if (ret < 0) {
+		pr_err("Error: failed to register mdpy_dev, err: %d\n", ret);
+		return ret;
+	}
+	cdev_init(&mdpy_cdev, &vd_fops);
+	ret = cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK + 1);
+	if (ret)
+		goto err_region;
+	pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt));
+
+	ret = mdev_register_driver(&mdpy_driver);
+	if (ret)
+		goto err_cdev;
+
+	ret = class_register(&mdpy_class);
+	if (ret)
+		goto err_driver;
+	mdpy_dev.class = &mdpy_class;
+	mdpy_dev.release = mdpy_device_release;
+	dev_set_name(&mdpy_dev, "%s", MDPY_NAME);
+
+	ret = device_register(&mdpy_dev);
+	if (ret)
+		goto err_put;
+
+	ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver,
+				   mdpy_mdev_types,
+				   ARRAY_SIZE(mdpy_mdev_types));
+	if (ret)
+		goto err_device;
+
+	return 0;
+
+err_device:
+	device_del(&mdpy_dev);
+err_put:
+	/* also the required cleanup after a failed device_register() */
+	put_device(&mdpy_dev);
+	class_unregister(&mdpy_class);
+err_driver:
+	mdev_unregister_driver(&mdpy_driver);
+err_cdev:
+	cdev_del(&mdpy_cdev);
+err_region:
+	unregister_chrdev_region(mdpy_devt, MINORMASK + 1);
+	return ret;
+}
+
+/*
+ * Module exit: unregister the mdev parent first, then the parent device,
+ * the mdev driver, the cdev with its device number region, and the class.
+ */
+static void __exit mdpy_dev_exit(void)
+{
+ mdpy_dev.bus = NULL;
+ mdev_unregister_parent(&mdpy_parent);
+
+ device_unregister(&mdpy_dev);
+ mdev_unregister_driver(&mdpy_driver);
+ cdev_del(&mdpy_cdev);
+ unregister_chrdev_region(mdpy_devt, MINORMASK + 1);
+ class_unregister(&mdpy_class);
+}
+
+module_param_named(count, mdpy_driver.max_instances, int, 0444);
+MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices");
+
+module_init(mdpy_dev_init)
+module_exit(mdpy_dev_exit)
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
new file mode 100644
index 000000000000..bd92c38379b8
--- /dev/null
+++ b/samples/vfio-mdev/mtty.c
@@ -0,0 +1,2040 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Mediated virtual PCI serial host device driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ * Author: Neo Jia <cjia@nvidia.com>
+ * Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * Sample driver that creates an mdev device which simulates a serial port
+ * on a PCI card.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/cdev.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/mdev.h>
+#include <linux/pci.h>
+#include <linux/serial.h>
+#include <uapi/linux/serial_reg.h>
+#include <linux/eventfd.h>
+#include <linux/anon_inodes.h>
+
+/*
+ * #defines
+ */
+
+#define VERSION_STRING "0.1"
+#define DRIVER_AUTHOR "NVIDIA Corporation"
+
+#define MTTY_CLASS_NAME "mtty"
+
+#define MTTY_NAME "mtty"
+
+#define MTTY_STRING_LEN 16
+
+#define MTTY_CONFIG_SPACE_SIZE 0xff
+#define MTTY_IO_BAR_SIZE 0x8
+#define MTTY_MMIO_BAR_SIZE 0x100000
+
+/*
+ * Store @val through @addr as a 16-/32-bit value.  These are plain
+ * native-order stores: no byte swapping is done despite the "LE" in the
+ * names.  Arguments are parenthesized so that expression arguments expand
+ * safely.
+ */
+#define STORE_LE16(addr, val) (*(u16 *)(addr) = (val))
+#define STORE_LE32(addr, val) (*(u32 *)(addr) = (val))
+
+#define MAX_FIFO_SIZE 16
+
+/*
+ * Advance the FIFO index @idx by one with wrap-around.  The mask form is
+ * only correct while MAX_FIFO_SIZE is a power of two.  @idx is evaluated
+ * twice, so it must be free of side effects.
+ */
+#define CIRCULAR_BUF_INC_IDX(idx) \
+	((idx) = ((idx) + 1) & (MAX_FIFO_SIZE - 1))
+
+#define MTTY_VFIO_PCI_OFFSET_SHIFT 40
+
+/* Extract the region index encoded in the upper bits of a file offset. */
+#define MTTY_VFIO_PCI_OFFSET_TO_INDEX(off) \
+	((off) >> MTTY_VFIO_PCI_OFFSET_SHIFT)
+#define MTTY_VFIO_PCI_INDEX_TO_OFFSET(index) \
+ ((u64)(index) << MTTY_VFIO_PCI_OFFSET_SHIFT)
+#define MTTY_VFIO_PCI_OFFSET_MASK \
+ (((u64)(1) << MTTY_VFIO_PCI_OFFSET_SHIFT) - 1)
+#define MAX_MTTYS 24
+
+/*
+ * Global Structures
+ */
+
+/* Module-wide driver state; a single static instance named mtty_dev. */
+static struct mtty_dev {
+ dev_t vd_devt;
+ struct class *vd_class;
+ struct cdev vd_cdev;
+ struct idr vd_idr;
+ struct device dev;
+ struct mdev_parent parent;
+} mtty_dev;
+
+/* Per-region bookkeeping, one entry per VFIO PCI region in struct mdev_state. */
+struct mdev_region_info {
+ u64 start; /* BAR base address decoded from vconfig by mdev_read_base() */
+ u64 phys_start;
+ u32 size; /* zero-size regions are skipped by mdev_read_base() */
+ u64 vfio_offset;
+};
+
+/*
+ * Register names used only in the DEBUG_REGS trace messages of
+ * mdev_access(), indexed by the register offset within the BAR.
+ * wr_reg[] names the register for a write, rd_reg[] for a read.
+ */
+#if defined(DEBUG_REGS)
+static const char *wr_reg[] = {
+ "TX",
+ "IER",
+ "FCR",
+ "LCR",
+ "MCR",
+ "LSR",
+ "MSR",
+ "SCR"
+};
+
+static const char *rd_reg[] = {
+ "RX",
+ "IER",
+ "IIR",
+ "LCR",
+ "MCR",
+ "LSR",
+ "MSR",
+ "SCR"
+};
+#endif
+
+/*
+ * Loop back buffer: bytes written to UART_TX are stored here and handed
+ * back on UART_RX reads.
+ */
+struct rxtx {
+ u8 fifo[MAX_FIFO_SIZE];
+ u8 head, tail; /* head: next write slot, tail: next read slot */
+ u8 count; /* number of bytes currently queued */
+};
+
+/*
+ * Emulated state of one serial port.  This struct is copied as-is into
+ * and out of the migration data by mtty_save_state()/mtty_load_state().
+ */
+struct serial_port {
+ u8 uart_reg[8]; /* 8 registers */
+ struct rxtx rxtx; /* loop back buffer */
+ bool dlab; /* set/cleared by UART_LCR writes (UART_LCR_DLAB bit) */
+ bool overrun; /* set when a TX write finds the FIFO full */
+ u16 divisor;
+ u8 fcr; /* FIFO control register */
+ u8 max_fifo_size;
+ u8 intr_trigger_level; /* interrupt trigger level */
+};
+
+/*
+ * Migration data image: a header (everything before ports[]) followed by
+ * one struct serial_port per port.  The header fields are checked by
+ * mtty_resume_write() once enough bytes have arrived.
+ */
+struct mtty_data {
+ u64 magic;
+#define MTTY_MAGIC 0x7e9d09898c3e2c4e /* Nothing clever, just random */
+ u32 major_ver;
+#define MTTY_MAJOR_VER 1
+ u32 minor_ver;
+#define MTTY_MINOR_VER 0
+ u32 nr_ports;
+ u32 flags; /* must be zero to pass validation in mtty_resume_write() */
+ struct serial_port ports[2];
+};
+
+struct mdev_state;
+
+/* State behind one migration file (anonymous inode) used for save or resume. */
+struct mtty_migration_file {
+ struct file *filp;
+ struct mutex lock; /* taken around accesses to data, filled_size and disabled */
+ struct mdev_state *mdev_state;
+ struct mtty_data data;
+ ssize_t filled_size; /* number of valid bytes in data */
+ u8 disabled:1; /* set by mtty_disable_file(); read/write then fail with -ENODEV */
+};
+
+/* State of each mdev device */
+struct mdev_state {
+ struct vfio_device vdev;
+ struct eventfd_ctx *intx_evtfd;
+ struct eventfd_ctx *msi_evtfd;
+ int irq_index; /* initialised to -1 in mtty_init_dev() */
+ u8 *vconfig; /* MTTY_CONFIG_SPACE_SIZE bytes, allocated in mtty_init_dev() */
+ struct mutex ops_lock; /* held by mdev_access() for the whole access */
+ struct mdev_device *mdev;
+ struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];
+ u32 bar_mask[VFIO_PCI_NUM_REGIONS];
+ struct list_head next;
+ struct serial_port s[2];
+ struct mutex rxtx_lock; /* taken around loop back FIFO accesses */
+ struct vfio_device_info dev_info;
+ int nr_ports;
+ enum vfio_device_mig_state state;
+ struct mutex state_mutex;
+ struct mutex reset_mutex;
+ struct mtty_migration_file *saving_migf;
+ struct mtty_migration_file *resuming_migf;
+ u8 deferred_reset:1; /* consumed by mtty_state_mutex_unlock() */
+ u8 intx_mask:1; /* set after an INTx eventfd has been signalled */
+};
+
+/* The two supported mdev types: a single-port and a dual-port serial device. */
+static struct mtty_type {
+ struct mdev_type type;
+ int nr_ports;
+} mtty_types[2] = {
+ { .nr_ports = 1, .type.sysfs_name = "1",
+ .type.pretty_name = "Single port serial" },
+ { .nr_ports = 2, .type.sysfs_name = "2",
+ .type.pretty_name = "Dual port serial" },
+};
+
+/* Pointers to the embedded mdev_type of each entry in mtty_types[]. */
+static struct mdev_type *mtty_mdev_types[] = {
+ &mtty_types[0].type,
+ &mtty_types[1].type,
+};
+
+/*
+ * Ports still available across all devices; mtty_init_dev() subtracts a
+ * device's port count and mtty_release_dev() adds it back.
+ */
+static atomic_t mdev_avail_ports = ATOMIC_INIT(MAX_MTTYS);
+
+/* File operations table that sets only .owner. */
+static const struct file_operations vd_fops = {
+ .owner = THIS_MODULE,
+};
+
+static const struct vfio_device_ops mtty_dev_ops;
+
+/* Helper functions */
+
+/*
+ * Debug helper.  With DEBUG defined, log @count bytes of @buf in hex and
+ * emit a newline after every 16th byte; otherwise the body is empty.
+ */
+static void dump_buffer(u8 *buf, uint32_t count)
+{
+#if defined(DEBUG)
+	int pos = 0;
+
+	pr_info("Buffer:\n");
+	while (pos < count) {
+		pr_info("%2x ", buf[pos]);
+		pos++;
+		if (pos % 16 == 0)
+			pr_info("\n");
+	}
+#endif
+}
+
+/* True when the device's selected IRQ index is the INTx index. */
+static bool is_intx(struct mdev_state *mdev_state)
+{
+	const int cur_index = mdev_state->irq_index;
+
+	return cur_index == VFIO_PCI_INTX_IRQ_INDEX;
+}
+
+/* True when the device's selected IRQ index is the MSI index. */
+static bool is_msi(struct mdev_state *mdev_state)
+{
+	const int cur_index = mdev_state->irq_index;
+
+	return cur_index == VFIO_PCI_MSI_IRQ_INDEX;
+}
+
+/* True when neither INTx nor MSI is the selected IRQ index. */
+static bool is_noirq(struct mdev_state *mdev_state)
+{
+	return !(is_intx(mdev_state) || is_msi(mdev_state));
+}
+
+/*
+ * Signal the eventfd of the currently selected interrupt type.
+ *
+ * Must be called with ops_lock held.  In MSI mode the MSI eventfd is
+ * signalled if one is set.  In INTx mode the INTx eventfd is signalled
+ * only when one is set and intx_mask is clear, and intx_mask is then set.
+ * With any other IRQ index nothing happens.
+ */
+static void mtty_trigger_interrupt(struct mdev_state *mdev_state)
+{
+	lockdep_assert_held(&mdev_state->ops_lock);
+
+	if (is_msi(mdev_state)) {
+		struct eventfd_ctx *msi = mdev_state->msi_evtfd;
+
+		if (msi)
+			eventfd_signal(msi);
+		return;
+	}
+
+	if (!is_intx(mdev_state))
+		return;
+
+	/* INTx: nothing to do without an eventfd or while masked */
+	if (!mdev_state->intx_evtfd || mdev_state->intx_mask)
+		return;
+
+	eventfd_signal(mdev_state->intx_evtfd);
+	mdev_state->intx_mask = true;
+}
+
+/*
+ * Fill in the initial contents of the emulated PCI config space
+ * (mdev_state->vconfig) and the BAR size masks for the I/O BARs.
+ * BAR1 is only set up for two-port devices.
+ */
+static void mtty_create_config_space(struct mdev_state *mdev_state)
+{
+	/* bytes 0x60..0x6e carry this text, without a terminating NUL */
+	static const char id_text[] = "PCI Serial/UART";
+	const u32 io_bar_mask = ~(MTTY_IO_BAR_SIZE) + 1;
+	u8 *cfg = mdev_state->vconfig;
+
+	/* PCI dev ID */
+	STORE_LE32(&cfg[0x0], 0x32534348);
+
+	/* Control: I/O+, Mem-, BusMaster- */
+	STORE_LE16(&cfg[0x4], 0x0001);
+
+	/* Status: capabilities list absent */
+	STORE_LE16(&cfg[0x6], 0x0200);
+
+	cfg[0x8] = 0x10;	/* Rev ID */
+	cfg[0x9] = 0x02;	/* prog-if: 16550-compatible serial controller */
+	cfg[0xa] = 0x00;	/* Sub class : 00 */
+	cfg[0xb] = 0x07;	/* Base class : Simple Communication controllers */
+
+	/* BAR0: IO space */
+	STORE_LE32(&cfg[0x10], 0x000001);
+	mdev_state->bar_mask[0] = io_bar_mask;
+
+	/* BAR1: IO space, second port only */
+	if (mdev_state->nr_ports == 2) {
+		STORE_LE32(&cfg[0x14], 0x000001);
+		mdev_state->bar_mask[1] = io_bar_mask;
+	}
+
+	/* Subsystem ID */
+	STORE_LE32(&cfg[0x2c], 0x32534348);
+
+	cfg[0x34] = 0x00;	/* Cap Ptr */
+	cfg[0x3d] = 0x01;	/* interrupt pin (INTA#) */
+
+	/* Vendor specific data */
+	cfg[0x40] = 0x23;
+	cfg[0x43] = 0x80;
+	cfg[0x44] = 0x23;
+	cfg[0x48] = 0x23;
+	cfg[0x4c] = 0x23;
+
+	memcpy(&cfg[0x60], id_text, sizeof(id_text) - 1);
+}
+
+/*
+ * Apply a guest write to the emulated PCI config space.
+ *
+ * @offset: config space offset being written
+ * @buf:    data written by the guest
+ * @count:  number of bytes written (only used in the log message)
+ *
+ * Command, status and interrupt pin writes are ignored.  The interrupt
+ * line byte is stored.  BAR0/BAR1 writes update the BAR, with a write of
+ * 0xffffffff answered by the BAR's size mask.  BAR2..BAR4 always read
+ * back as 0.  Anything else is logged and dropped.
+ */
+static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
+				 u8 *buf, u32 count)
+{
+	u8 *cfg = mdev_state->vconfig;
+	u32 new_val, bar;
+
+	switch (offset) {
+	case 0x04: /* device control */
+	case 0x06: /* device status */
+	case 0x3d: /* interrupt pin: hardwired to INTA, write protected */
+		break;
+	case 0x3c: /* interrupt line */
+		cfg[0x3c] = buf[0];
+		break;
+	case 0x10: /* BAR0 */
+	case 0x14: /* BAR1 */
+		bar = (offset == 0x14) ? 1 : 0;
+
+		/* a single-port device has no BAR1 */
+		if (bar == 1 && mdev_state->nr_ports == 1) {
+			STORE_LE32(&cfg[offset], 0);
+			break;
+		}
+
+		new_val = *(u32 *)buf;
+		pr_info("BAR%d addr 0x%x\n", bar, new_val);
+
+		/* an all-ones write is answered with the BAR mask */
+		if (new_val == 0xffffffff)
+			new_val &= mdev_state->bar_mask[bar];
+
+		/* carry over the two low bits currently stored in the BAR */
+		new_val |= (cfg[offset] & 0x3ul);
+		STORE_LE32(&cfg[offset], new_val);
+		break;
+	case 0x18: /* BAR2 */
+	case 0x1c: /* BAR3 */
+	case 0x20: /* BAR4 */
+		STORE_LE32(&cfg[offset], 0);
+		break;
+	default:
+		pr_info("PCI config write @0x%x of %d bytes not handled\n",
+			offset, count);
+		break;
+	}
+}
+
+/*
+ * Emulate a guest write of one byte to a UART register.
+ *
+ * @index:  port number, used to index mdev_state->s[]
+ * @offset: register offset within the BAR (UART_TX .. UART_SCR)
+ * @buf:    only the first byte is used
+ * @count:  unused
+ *
+ * Writes to UART_TX feed the loop back FIFO that UART_RX reads drain.
+ * The FIFO is accessed under rxtx_lock.  Interrupts are raised through
+ * mtty_trigger_interrupt().
+ */
+static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state,
+			     u16 offset, u8 *buf, u32 count)
+{
+	struct serial_port *port = &mdev_state->s[index];
+	u8 data = *buf;
+
+	/* Handle data written by guest */
+	switch (offset) {
+	case UART_TX:
+		/* if DLAB set, data is LSB of divisor */
+		if (port->dlab) {
+			port->divisor |= data;
+			break;
+		}
+
+		mutex_lock(&mdev_state->rxtx_lock);
+
+		if (port->rxtx.count < port->max_fifo_size) {
+			/* room left: queue the byte */
+			port->rxtx.fifo[port->rxtx.head] = data;
+			port->rxtx.count++;
+			CIRCULAR_BUF_INC_IDX(port->rxtx.head);
+			port->overrun = false;
+
+			/*
+			 * Raise an interrupt when the receive data interrupt
+			 * is enabled and the fifo hits the trigger level.
+			 */
+			if ((port->uart_reg[UART_IER] & UART_IER_RDI) &&
+			    (port->rxtx.count == port->intr_trigger_level)) {
+#if defined(DEBUG_INTR)
+				pr_err("Serial port %d: Fifo level trigger\n",
+				       index);
+#endif
+				mtty_trigger_interrupt(mdev_state);
+			}
+		} else {
+#if defined(DEBUG_INTR)
+			pr_err("Serial port %d: Buffer Overflow\n", index);
+#endif
+			port->overrun = true;
+
+			/*
+			 * Raise an interrupt when the receiver line status
+			 * interrupt is enabled.
+			 */
+			if (port->uart_reg[UART_IER] & UART_IER_RLSI)
+				mtty_trigger_interrupt(mdev_state);
+		}
+		mutex_unlock(&mdev_state->rxtx_lock);
+		break;
+
+	case UART_IER:
+		/* if DLAB set, data is MSB of divisor */
+		if (port->dlab) {
+			port->divisor |= (u16)data << 8;
+			break;
+		}
+
+		port->uart_reg[offset] = data;
+		mutex_lock(&mdev_state->rxtx_lock);
+		if ((data & UART_IER_THRI) &&
+		    (port->rxtx.head == port->rxtx.tail)) {
+#if defined(DEBUG_INTR)
+			pr_err("Serial port %d: IER_THRI write\n",
+			       index);
+#endif
+			mtty_trigger_interrupt(mdev_state);
+		}
+		mutex_unlock(&mdev_state->rxtx_lock);
+		break;
+
+	case UART_FCR:
+		port->fcr = data;
+
+		mutex_lock(&mdev_state->rxtx_lock);
+		if (data & (UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT)) {
+			/* clear loop back FIFO */
+			port->rxtx.count = 0;
+			port->rxtx.head = 0;
+			port->rxtx.tail = 0;
+		}
+		mutex_unlock(&mdev_state->rxtx_lock);
+
+		switch (data & UART_FCR_TRIGGER_MASK) {
+		case UART_FCR_TRIGGER_1:
+			port->intr_trigger_level = 1;
+			break;
+		case UART_FCR_TRIGGER_4:
+			port->intr_trigger_level = 4;
+			break;
+		case UART_FCR_TRIGGER_8:
+			port->intr_trigger_level = 8;
+			break;
+		case UART_FCR_TRIGGER_14:
+			port->intr_trigger_level = 14;
+			break;
+		}
+
+		/*
+		 * Set trigger level to 1 otherwise or implement timer with
+		 * timeout of 4 characters and on expiring that timer set
+		 * Receive data timeout in IIR register
+		 */
+		port->intr_trigger_level = 1;
+		port->max_fifo_size =
+			(data & UART_FCR_ENABLE_FIFO) ? MAX_FIFO_SIZE : 1;
+		break;
+
+	case UART_LCR:
+		if (data & UART_LCR_DLAB) {
+			port->dlab = true;
+			port->divisor = 0;
+		} else {
+			port->dlab = false;
+		}
+
+		port->uart_reg[offset] = data;
+		break;
+
+	case UART_MCR:
+		port->uart_reg[offset] = data;
+
+		if ((port->uart_reg[UART_IER] & UART_IER_MSI) &&
+		    (data & UART_MCR_OUT2)) {
+#if defined(DEBUG_INTR)
+			pr_err("Serial port %d: MCR_OUT2 write\n", index);
+#endif
+			mtty_trigger_interrupt(mdev_state);
+		}
+
+		if ((port->uart_reg[UART_IER] & UART_IER_MSI) &&
+		    (data & (UART_MCR_RTS | UART_MCR_DTR))) {
+#if defined(DEBUG_INTR)
+			pr_err("Serial port %d: MCR RTS/DTR write\n", index);
+#endif
+			mtty_trigger_interrupt(mdev_state);
+		}
+		break;
+
+	case UART_SCR:
+		port->uart_reg[offset] = data;
+		break;
+
+	case UART_LSR:
+	case UART_MSR:
+		/* writes to these registers are ignored */
+	default:
+		break;
+	}
+}
+
+/*
+ * Emulate a guest read of one byte from a UART register.
+ *
+ * @index:  port number, used to index mdev_state->s[]
+ * @offset: register offset within the BAR (UART_RX .. UART_SCR)
+ * @buf:    receives the register value in its first byte
+ * @count:  unused
+ *
+ * For offsets that are not handled, and for UART_RX with an empty FIFO,
+ * *buf is left untouched.
+ */
+static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state,
+			    u16 offset, u8 *buf, u32 count)
+{
+	struct serial_port *port = &mdev_state->s[index];
+
+	/* Handle read requests by guest */
+	switch (offset) {
+	case UART_RX:
+		/* if DLAB set, data is LSB of divisor */
+		if (port->dlab) {
+			*buf = (u8)port->divisor;
+			break;
+		}
+
+		mutex_lock(&mdev_state->rxtx_lock);
+		/* hand back the oldest queued byte, if any */
+		if (port->rxtx.head != port->rxtx.tail) {
+			*buf = port->rxtx.fifo[port->rxtx.tail];
+			port->rxtx.count--;
+			CIRCULAR_BUF_INC_IDX(port->rxtx.tail);
+		}
+
+		if (port->rxtx.head == port->rxtx.tail) {
+			/*
+			 * Raise an interrupt when the tx buffer empty
+			 * interrupt is enabled and the fifo is empty.
+			 */
+#if defined(DEBUG_INTR)
+			pr_err("Serial port %d: Buffer Empty\n", index);
+#endif
+			if (port->uart_reg[UART_IER] & UART_IER_THRI)
+				mtty_trigger_interrupt(mdev_state);
+		}
+		mutex_unlock(&mdev_state->rxtx_lock);
+		break;
+
+	case UART_IER:
+		if (port->dlab) {
+			/* if DLAB set, data is MSB of divisor */
+			*buf = (u8)(port->divisor >> 8);
+			break;
+		}
+		*buf = port->uart_reg[offset] & 0x0f;
+		break;
+
+	case UART_IIR:
+	{
+		u8 ier = port->uart_reg[UART_IER];
+		u8 iir = 0;
+
+		mutex_lock(&mdev_state->rxtx_lock);
+		/* Interrupt priority 1: Parity, overrun, framing or break */
+		if ((ier & UART_IER_RLSI) && port->overrun)
+			iir |= UART_IIR_RLSI;
+
+		/* Interrupt priority 2: Fifo trigger level reached */
+		if ((ier & UART_IER_RDI) &&
+		    (port->rxtx.count >= port->intr_trigger_level))
+			iir |= UART_IIR_RDI;
+
+		/* Interrupt priority 3: transmitter holding register empty */
+		if ((ier & UART_IER_THRI) &&
+		    (port->rxtx.head == port->rxtx.tail))
+			iir |= UART_IIR_THRI;
+
+		/* Interrupt priority 4: Modem status: CTS, DSR, RI or DCD */
+		if ((ier & UART_IER_MSI) &&
+		    (port->uart_reg[UART_MCR] &
+		     (UART_MCR_RTS | UART_MCR_DTR)))
+			iir |= UART_IIR_MSI;
+
+		/* bit0: 0=> interrupt pending, 1=> no interrupt is pending */
+		if (iir == 0)
+			iir = UART_IIR_NO_INT;
+
+		/* set bit 6 & 7 to be 16550 compatible */
+		iir |= 0xC0;
+		*buf = iir;
+		mutex_unlock(&mdev_state->rxtx_lock);
+	}
+	break;
+
+	case UART_LCR:
+	case UART_MCR:
+	case UART_SCR:
+		*buf = port->uart_reg[offset];
+		break;
+
+	case UART_LSR:
+	{
+		bool fifo_empty;
+		u8 lsr;
+
+		mutex_lock(&mdev_state->rxtx_lock);
+		fifo_empty = port->rxtx.head == port->rxtx.tail;
+
+		/*
+		 * Empty: transmit FIFO empty and transmitter empty.
+		 * Otherwise: at least one char in FIFO.
+		 */
+		lsr = fifo_empty ? (UART_LSR_TEMT | UART_LSR_THRE) :
+				   UART_LSR_DR;
+
+		/* if FIFO overrun */
+		if (port->overrun)
+			lsr |= UART_LSR_OE;
+
+		mutex_unlock(&mdev_state->rxtx_lock);
+		*buf = lsr;
+		break;
+	}
+
+	case UART_MSR:
+	{
+		u8 msr = UART_MSR_DSR | UART_MSR_DDSR | UART_MSR_DCD;
+
+		mutex_lock(&mdev_state->rxtx_lock);
+		/* CTS is set unless AFE is 1 and the FIFO has no space */
+		if (!(port->uart_reg[UART_MCR] & UART_MCR_AFE) ||
+		    port->rxtx.count < port->max_fifo_size)
+			msr |= UART_MSR_CTS | UART_MSR_DCTS;
+		mutex_unlock(&mdev_state->rxtx_lock);
+
+		*buf = msr;
+		break;
+	}
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Decode the BAR base addresses stored in vconfig into
+ * region_info[].start for every BAR region with a non-zero size.
+ *
+ * The config offset starts at PCI_BASE_ADDRESS_0 and only advances for
+ * regions that are actually decoded: 4 bytes per BAR, plus 4 more when
+ * the BAR's memory type is 64-bit, in which case the following dword
+ * supplies the upper half of the address.
+ */
+static void mdev_read_base(struct mdev_state *mdev_state)
+{
+	int bar, cfg_off = PCI_BASE_ADDRESS_0;
+
+	for (bar = 0; bar <= VFIO_PCI_BAR5_REGION_INDEX; bar++) {
+		struct mdev_region_info *region = &mdev_state->region_info[bar];
+		u32 raw, addr_lo, addr_hi = 0;
+
+		if (!region->size)
+			continue;
+
+		raw = *(u32 *)(mdev_state->vconfig + cfg_off);
+		addr_lo = raw & PCI_BASE_ADDRESS_MEM_MASK;
+
+		/* every non-64-bit type is treated as a 32-bit BAR */
+		if ((raw & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+		    PCI_BASE_ADDRESS_MEM_TYPE_64) {
+			addr_hi = *(u32 *)(mdev_state->vconfig + cfg_off + 4);
+			cfg_off += 4;
+		}
+		cfg_off += 4;
+
+		region->start = ((u64)addr_hi << 32) | addr_lo;
+	}
+}
+
+/*
+ * Perform one emulated device access on behalf of the user.
+ *
+ * @buf:      kernel buffer holding the data to write or receiving the data
+ * @count:    number of bytes to access
+ * @pos:      file offset; the upper bits select the VFIO region, the lower
+ *            MTTY_VFIO_PCI_OFFSET_SHIFT bits are the offset in that region
+ * @is_write: true for a write, false for a read
+ *
+ * The whole access runs under ops_lock.
+ *
+ * Returns @count on success, -EINVAL for a NULL @buf, for a config space
+ * access outside the MTTY_CONFIG_SPACE_SIZE bytes of vconfig, or for a
+ * BAR index with no serial port behind it, and -1 for any other region.
+ */
+static ssize_t mdev_access(struct mdev_state *mdev_state, u8 *buf, size_t count,
+			   loff_t pos, bool is_write)
+{
+	unsigned int index;
+	loff_t offset;
+	int ret = 0;
+
+	if (!buf)
+		return -EINVAL;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	index = MTTY_VFIO_PCI_OFFSET_TO_INDEX(pos);
+	offset = pos & MTTY_VFIO_PCI_OFFSET_MASK;
+	switch (index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		/*
+		 * @offset and @count come from the user.  vconfig holds only
+		 * MTTY_CONFIG_SPACE_SIZE bytes and the write handler takes a
+		 * 16-bit offset, so reject anything outside the buffer.
+		 */
+		if (offset >= MTTY_CONFIG_SPACE_SIZE ||
+		    count > MTTY_CONFIG_SPACE_SIZE - offset) {
+			ret = -EINVAL;
+			goto accessfailed;
+		}
+
+#if defined(DEBUG)
+		pr_info("%s: PCI config space %s at offset 0x%llx\n",
+			__func__, is_write ? "write" : "read", offset);
+#endif
+		if (is_write) {
+			dump_buffer(buf, count);
+			handle_pci_cfg_write(mdev_state, offset, buf, count);
+		} else {
+			memcpy(buf, (mdev_state->vconfig + offset), count);
+			dump_buffer(buf, count);
+		}
+
+		break;
+
+	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+		/* the BAR handlers index s[] with @index: keep it in bounds */
+		if (index >= ARRAY_SIZE(mdev_state->s)) {
+			ret = -EINVAL;
+			goto accessfailed;
+		}
+
+		if (!mdev_state->region_info[index].start)
+			mdev_read_base(mdev_state);
+
+		if (is_write) {
+			dump_buffer(buf, count);
+
+#if defined(DEBUG_REGS)
+			pr_info("%s: BAR%d WR @0x%llx %s val:0x%02x dlab:%d\n",
+				__func__, index, offset, wr_reg[offset],
+				*buf, mdev_state->s[index].dlab);
+#endif
+			handle_bar_write(index, mdev_state, offset, buf, count);
+		} else {
+			handle_bar_read(index, mdev_state, offset, buf, count);
+			dump_buffer(buf, count);
+
+#if defined(DEBUG_REGS)
+			pr_info("%s: BAR%d RD @0x%llx %s val:0x%02x dlab:%d\n",
+				__func__, index, offset, rd_reg[offset],
+				*buf, mdev_state->s[index].dlab);
+#endif
+		}
+		break;
+
+	default:
+		ret = -1;
+		goto accessfailed;
+	}
+
+	ret = count;
+
+accessfailed:
+	mutex_unlock(&mdev_state->ops_lock);
+
+	return ret;
+}
+
+/*
+ * Size in bytes of the migration data for this device: the mtty_data
+ * header plus one struct serial_port per configured port.
+ */
+static size_t mtty_data_size(struct mdev_state *mdev_state)
+{
+	const size_t hdr_len = offsetof(struct mtty_data, ports);
+	const size_t ports_len =
+		mdev_state->nr_ports * sizeof(struct serial_port);
+
+	return hdr_len + ports_len;
+}
+
+/*
+ * Mark a migration file disabled and reset its fill level and file
+ * position, all under the file's lock.  Later reads and writes on the
+ * file see ->disabled and fail with -ENODEV.
+ */
+static void mtty_disable_file(struct mtty_migration_file *migf)
+{
+	mutex_lock(&migf->lock);
+	migf->filp->f_pos = 0;
+	migf->filled_size = 0;
+	migf->disabled = true;
+	mutex_unlock(&migf->lock);
+}
+
+/*
+ * Disable the device's saving and resuming migration files, if present,
+ * drop the file reference held through each one and clear the pointers.
+ */
+static void mtty_disable_files(struct mdev_state *mdev_state)
+{
+	struct mtty_migration_file *migf;
+
+	migf = mdev_state->saving_migf;
+	if (migf) {
+		mtty_disable_file(migf);
+		/* migf must not be touched after this fput() */
+		fput(migf->filp);
+		mdev_state->saving_migf = NULL;
+	}
+
+	migf = mdev_state->resuming_migf;
+	if (migf) {
+		mtty_disable_file(migf);
+		/* migf must not be touched after this fput() */
+		fput(migf->filp);
+		mdev_state->resuming_migf = NULL;
+	}
+}
+
+/*
+ * Release state_mutex, first handling any reset that was deferred while
+ * it was held.
+ *
+ * While deferred_reset is set: clear it, drop reset_mutex, put the device
+ * back into RUNNING and disable the migration files, then check again.
+ * state_mutex is only released once no deferred reset is pending, with
+ * reset_mutex still held at that point.
+ */
+static void mtty_state_mutex_unlock(struct mdev_state *mdev_state)
+{
+	for (;;) {
+		mutex_lock(&mdev_state->reset_mutex);
+		if (!mdev_state->deferred_reset)
+			break;
+
+		mdev_state->deferred_reset = false;
+		mutex_unlock(&mdev_state->reset_mutex);
+		mdev_state->state = VFIO_DEVICE_STATE_RUNNING;
+		mtty_disable_files(mdev_state);
+	}
+
+	mutex_unlock(&mdev_state->state_mutex);
+	mutex_unlock(&mdev_state->reset_mutex);
+}
+
+/*
+ * ->release handler of the migration files: disable the file, destroy
+ * its lock and free the mtty_migration_file.  Always returns 0.
+ */
+static int mtty_release_migf(struct inode *inode, struct file *filp)
+{
+	struct mtty_migration_file *mig_file = filp->private_data;
+
+	mtty_disable_file(mig_file);
+	mutex_destroy(&mig_file->lock);
+	kfree(mig_file);
+
+	return 0;
+}
+
+/*
+ * ioctl handler of the saving migration file.  Only
+ * VFIO_MIG_GET_PRECOPY_INFO is supported.
+ *
+ * Reports the bytes not yet read from the file as initial_bytes, and 0
+ * dirty_bytes.
+ *
+ * Returns 0 on success, -ENOTTY for other commands, -EFAULT when the
+ * user buffer cannot be copied, -EINVAL for a too small argsz, a device
+ * that is not in PRE_COPY/PRE_COPY_P2P or a file position past the
+ * filled data, and -ENODEV once the file has been disabled.
+ */
+static long mtty_precopy_ioctl(struct file *filp, unsigned int cmd,
+			       unsigned long arg)
+{
+	struct mtty_migration_file *migf = filp->private_data;
+	struct mdev_state *mdev_state = migf->mdev_state;
+	loff_t *pos = &filp->f_pos;
+	struct vfio_precopy_info info = {};
+	unsigned long minsz;
+	int ret;
+
+	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+		return -ENOTTY;
+
+	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	mutex_lock(&mdev_state->state_mutex);
+	if (mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY &&
+	    mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	/* sample the file state under its own lock */
+	mutex_lock(&migf->lock);
+	if (migf->disabled) {
+		ret = -ENODEV;
+	} else if (*pos > migf->filled_size) {
+		ret = -EINVAL;
+	} else {
+		info.dirty_bytes = 0;
+		info.initial_bytes = migf->filled_size - *pos;
+		ret = 0;
+	}
+	mutex_unlock(&migf->lock);
+
+	if (!ret && copy_to_user((void __user *)arg, &info, minsz))
+		ret = -EFAULT;
+unlock:
+	mtty_state_mutex_unlock(mdev_state);
+	return ret;
+}
+
+/*
+ * ->read handler of the saving migration file: copy out up to @len bytes
+ * of the migration data, starting at the file's own position.
+ *
+ * A non-NULL @pos is refused with -ESPIPE; the position used is always
+ * filp->f_pos.
+ *
+ * Returns the number of bytes copied (0 when nothing is left), -ENODEV
+ * when the file has been disabled, -EINVAL when the file position lies
+ * beyond the filled data and -EFAULT when the user buffer cannot be
+ * written.
+ */
+static ssize_t mtty_save_read(struct file *filp, char __user *buf,
+			      size_t len, loff_t *pos)
+{
+	struct mtty_migration_file *migf = filp->private_data;
+	ssize_t ret = 0;
+
+	if (pos)
+		return -ESPIPE;
+
+	pos = &filp->f_pos;
+
+	mutex_lock(&migf->lock);
+
+	dev_dbg(migf->mdev_state->vdev.dev, "%s ask %zu\n", __func__, len);
+
+	if (migf->disabled) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	if (*pos > migf->filled_size) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* never hand out more than what has been filled so far */
+	len = min_t(size_t, migf->filled_size - *pos, len);
+	if (len) {
+		if (copy_to_user(buf, (void *)&migf->data + *pos, len)) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+		*pos += len;
+		ret = len;
+	}
+out_unlock:
+	/* ret is signed and may hold a negative errno here: use %zd */
+	dev_dbg(migf->mdev_state->vdev.dev, "%s read %zd\n", __func__, ret);
+	mutex_unlock(&migf->lock);
+	return ret;
+}
+
+/*
+ * File operations of the saving migration file created by
+ * mtty_save_device_data(): read-only data plus the precopy info ioctl.
+ */
+static const struct file_operations mtty_save_fops = {
+ .owner = THIS_MODULE,
+ .read = mtty_save_read,
+ .unlocked_ioctl = mtty_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = mtty_release_migf,
+};
+
+/*
+ * Append the state of every configured port to the saving migration
+ * file's data and grow filled_size accordingly, under the file's lock.
+ * mdev_state->saving_migf must be set.
+ */
+static void mtty_save_state(struct mdev_state *mdev_state)
+{
+	struct mtty_migration_file *migf = mdev_state->saving_migf;
+	const size_t port_len = sizeof(struct serial_port);
+	int port;
+
+	mutex_lock(&migf->lock);
+	for (port = 0; port < mdev_state->nr_ports; port++) {
+		memcpy(&migf->data.ports[port], &mdev_state->s[port],
+		       port_len);
+		migf->filled_size += port_len;
+	}
+	dev_dbg(mdev_state->vdev.dev,
+		"%s filled to %zu\n", __func__, migf->filled_size);
+	mutex_unlock(&migf->lock);
+}
+
+/*
+ * Copy the port state received through the resuming migration file into
+ * the device, under the file's lock.  mdev_state->resuming_migf must be
+ * set.
+ *
+ * Returns 0 on success or -EINVAL when fewer bytes than
+ * mtty_data_size() have been received.
+ */
+static int mtty_load_state(struct mdev_state *mdev_state)
+{
+	struct mtty_migration_file *migf = mdev_state->resuming_migf;
+	int port, ret = 0;
+
+	mutex_lock(&migf->lock);
+	/* magic and version already tested by resume write fn */
+	if (migf->filled_size < mtty_data_size(mdev_state)) {
+		dev_dbg(mdev_state->vdev.dev, "%s expected %zu, got %zu\n",
+			__func__, mtty_data_size(mdev_state),
+			migf->filled_size);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	for (port = 0; port < mdev_state->nr_ports; port++)
+		memcpy(&mdev_state->s[port], &migf->data.ports[port],
+		       sizeof(struct serial_port));
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return ret;
+}
+
+/*
+ * Create the saving migration file for the device, or reuse the existing
+ * one.
+ *
+ * When a saving file already exists nothing is created and NULL is
+ * returned; if @state is STOP_COPY the port state is appended to that
+ * file first.  Otherwise a new file is allocated, its header is filled
+ * in, it is stored in mdev_state->saving_migf and, for STOP_COPY, the
+ * port state is appended right away.
+ *
+ * Returns the newly created file, NULL when an existing file was reused,
+ * or an ERR_PTR() on allocation failure.
+ */
+static struct mtty_migration_file *
+mtty_save_device_data(struct mdev_state *mdev_state,
+		      enum vfio_device_mig_state state)
+{
+	const bool stop_copy = (state == VFIO_DEVICE_STATE_STOP_COPY);
+	struct mtty_migration_file *migf = mdev_state->saving_migf;
+
+	if (migf) {
+		if (stop_copy)
+			mtty_save_state(mdev_state);
+		return NULL;
+	}
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("mtty_mig", &mtty_save_fops,
+					migf, O_RDONLY);
+	if (IS_ERR(migf->filp)) {
+		int rc = PTR_ERR(migf->filp);
+
+		kfree(migf);
+		return ERR_PTR(rc);
+	}
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+	migf->mdev_state = mdev_state;
+
+	/* header first; port data follows on STOP_COPY */
+	migf->data.magic = MTTY_MAGIC;
+	migf->data.major_ver = MTTY_MAJOR_VER;
+	migf->data.minor_ver = MTTY_MINOR_VER;
+	migf->data.nr_ports = mdev_state->nr_ports;
+
+	migf->filled_size = offsetof(struct mtty_data, ports);
+
+	dev_dbg(mdev_state->vdev.dev, "%s filled header to %zu\n",
+		__func__, migf->filled_size);
+
+	mdev_state->saving_migf = migf;
+
+	if (stop_copy)
+		mtty_save_state(mdev_state);
+
+	return migf;
+}
+
+/*
+ * ->write handler of the resuming migration file: append @len bytes from
+ * user space to the migration data at the file's own position.
+ *
+ * A non-NULL @pos is refused with -ESPIPE.  Once the write that completes
+ * the header has been copied in, the header (magic, flags, versions and
+ * port count) is validated against this device.
+ *
+ * Returns @len on success, -EINVAL for a negative or overflowing
+ * position, -ENOMEM when the data would exceed mtty_data_size(), -ENODEV
+ * when the file has been disabled, and -EFAULT when the user buffer
+ * cannot be read or the header fails validation.  In the validation
+ * failure case the file position and filled_size have still been
+ * advanced.
+ */
+static ssize_t mtty_resume_write(struct file *filp, const char __user *buf,
+				 size_t len, loff_t *pos)
+{
+	const size_t hdr_len = offsetof(struct mtty_data, ports);
+	struct mtty_migration_file *migf = filp->private_data;
+	struct mdev_state *mdev_state = migf->mdev_state;
+	loff_t end;
+	ssize_t ret = 0;
+
+	if (pos)
+		return -ESPIPE;
+
+	pos = &filp->f_pos;
+
+	if (*pos < 0 || check_add_overflow((loff_t)len, *pos, &end))
+		return -EINVAL;
+
+	if (end > mtty_data_size(mdev_state))
+		return -ENOMEM;
+
+	mutex_lock(&migf->lock);
+
+	if (migf->disabled) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	if (copy_from_user((void *)&migf->data + *pos, buf, len)) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	*pos += len;
+	ret = len;
+
+	dev_dbg(migf->mdev_state->vdev.dev, "%s received %zu, total %zu\n",
+		__func__, len, migf->filled_size + len);
+
+	/* did this write complete the header? then validate it */
+	if (migf->filled_size < hdr_len &&
+	    migf->filled_size + len >= hdr_len) {
+		const struct mtty_data *hdr = &migf->data;
+		bool header_ok = hdr->magic == MTTY_MAGIC &&
+				 !hdr->flags &&
+				 hdr->major_ver == MTTY_MAJOR_VER &&
+				 hdr->minor_ver == MTTY_MINOR_VER &&
+				 hdr->nr_ports == mdev_state->nr_ports;
+
+		if (header_ok) {
+			dev_dbg(migf->mdev_state->vdev.dev,
+				"%s header validated\n", __func__);
+		} else {
+			dev_dbg(migf->mdev_state->vdev.dev,
+				"%s failed validation\n", __func__);
+			ret = -EFAULT;
+		}
+	}
+
+	migf->filled_size += len;
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return ret;
+}
+
+/*
+ * File operations of the resuming migration file created by
+ * mtty_resume_device_data(): write-only.
+ */
+static const struct file_operations mtty_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = mtty_resume_write,
+ .release = mtty_release_migf,
+};
+
+/*
+ * Allocate the resuming migration file (a write-only anonymous inode),
+ * attach it to the device as mdev_state->resuming_migf and return it.
+ *
+ * Returns the new file or an ERR_PTR() when the allocation or the file
+ * creation fails.
+ */
+static struct mtty_migration_file *
+mtty_resume_device_data(struct mdev_state *mdev_state)
+{
+	struct mtty_migration_file *new_migf;
+	struct file *filp;
+
+	new_migf = kzalloc(sizeof(*new_migf), GFP_KERNEL_ACCOUNT);
+	if (!new_migf)
+		return ERR_PTR(-ENOMEM);
+
+	filp = anon_inode_getfile("mtty_mig", &mtty_resume_fops,
+				  new_migf, O_WRONLY);
+	new_migf->filp = filp;
+	if (IS_ERR(filp)) {
+		int ret = PTR_ERR(filp);
+
+		kfree(new_migf);
+		return ERR_PTR(ret);
+	}
+
+	stream_open(filp->f_inode, filp);
+	mutex_init(&new_migf->lock);
+	new_migf->mdev_state = mdev_state;
+
+	mdev_state->resuming_migf = new_migf;
+
+	return new_migf;
+}
+
+/*
+ * Perform a single migration state transition from mdev_state->state to
+ * @new.  The caller (mtty_set_state()) updates mdev_state->state after a
+ * successful return.
+ *
+ * Returns NULL for transitions that produce no file, a migration file
+ * with an extra reference taken for transitions that do, or an ERR_PTR()
+ * on failure.  An arc that is not handled below triggers a WARN and
+ * yields ERR_PTR(-EINVAL).
+ */
+static struct file *mtty_step_state(struct mdev_state *mdev_state,
+ enum vfio_device_mig_state new)
+{
+ enum vfio_device_mig_state cur = mdev_state->state;
+
+ dev_dbg(mdev_state->vdev.dev, "%s: %d -> %d\n", __func__, cur, new);
+
+ /*
+ * The following state transitions are no-op considering
+ * mtty does not do DMA nor require any explicit start/stop.
+ *
+ * RUNNING -> RUNNING_P2P
+ * RUNNING_P2P -> RUNNING
+ * RUNNING_P2P -> STOP
+ * PRE_COPY -> PRE_COPY_P2P
+ * PRE_COPY_P2P -> PRE_COPY
+ * STOP -> RUNNING_P2P
+ */
+ if ((cur == VFIO_DEVICE_STATE_RUNNING &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ (new == VFIO_DEVICE_STATE_RUNNING ||
+ new == VFIO_DEVICE_STATE_STOP)) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_STOP &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P))
+ return NULL;
+
+ /*
+ * The following state transitions simply close migration files,
+ * with the exception of RESUMING -> STOP, which needs to load
+ * the state first.
+ *
+ * RESUMING -> STOP
+ * PRE_COPY -> RUNNING
+ * PRE_COPY_P2P -> RUNNING_P2P
+ * STOP_COPY -> STOP
+ */
+ if (cur == VFIO_DEVICE_STATE_RESUMING &&
+ new == VFIO_DEVICE_STATE_STOP) {
+ int ret;
+
+ ret = mtty_load_state(mdev_state);
+ if (ret)
+ return ERR_PTR(ret);
+ mtty_disable_files(mdev_state);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_PRE_COPY &&
+ new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_STOP_COPY &&
+ new == VFIO_DEVICE_STATE_STOP)) {
+ mtty_disable_files(mdev_state);
+ return NULL;
+ }
+
+ /*
+ * The following state transitions return migration files.
+ *
+ * RUNNING -> PRE_COPY
+ * RUNNING_P2P -> PRE_COPY_P2P
+ * STOP -> STOP_COPY
+ * STOP -> RESUMING
+ * PRE_COPY_P2P -> STOP_COPY
+ */
+ if ((cur == VFIO_DEVICE_STATE_RUNNING &&
+ new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P) ||
+ (cur == VFIO_DEVICE_STATE_STOP &&
+ new == VFIO_DEVICE_STATE_STOP_COPY) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_STOP_COPY)) {
+ struct mtty_migration_file *migf;
+
+ migf = mtty_save_device_data(mdev_state, new);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+
+ /* NULL means an existing saving file was reused: no new file */
+ if (migf) {
+ get_file(migf->filp);
+
+ return migf->filp;
+ }
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP &&
+ new == VFIO_DEVICE_STATE_RESUMING) {
+ struct mtty_migration_file *migf;
+
+ migf = mtty_resume_device_data(mdev_state);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+
+ get_file(migf->filp);
+
+ return migf->filp;
+ }
+
+ /* vfio_mig_get_next_state() does not use arcs other than the above */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * migration_set_state callback: walk the device from its current
+ * migration state to @new_state, one arc at a time as chosen by
+ * vfio_mig_get_next_state(), under state_mutex.
+ *
+ * Returns the file produced by the final step (possibly NULL), or an
+ * ERR_PTR() if choosing or performing a step fails.  A file produced by
+ * an intermediate step is unexpected: it is released and -EINVAL is
+ * returned.
+ */
+static struct file *mtty_set_state(struct vfio_device *vdev,
+				   enum vfio_device_mig_state new_state)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	struct file *filp = NULL;
+
+	dev_dbg(vdev->dev, "%s -> %d\n", __func__, new_state);
+
+	mutex_lock(&mdev_state->state_mutex);
+	while (mdev_state->state != new_state) {
+		enum vfio_device_mig_state step;
+		int err;
+
+		err = vfio_mig_get_next_state(vdev, mdev_state->state,
+					      new_state, &step);
+		if (err) {
+			filp = ERR_PTR(err);
+			break;
+		}
+
+		filp = mtty_step_state(mdev_state, step);
+		if (IS_ERR(filp))
+			break;
+
+		mdev_state->state = step;
+
+		/* only the final arc may hand back a file */
+		if (WARN_ON(filp && new_state != step)) {
+			fput(filp);
+			filp = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	mtty_state_mutex_unlock(mdev_state);
+	return filp;
+}
+
+/*
+ * migration_get_state callback: report the device's current migration
+ * state, sampled under state_mutex.  Always returns 0.
+ */
+static int mtty_get_state(struct vfio_device *vdev,
+			  enum vfio_device_mig_state *current_state)
+{
+	struct mdev_state *state = container_of(vdev, struct mdev_state, vdev);
+
+	mutex_lock(&state->state_mutex);
+	*current_state = state->state;
+	mtty_state_mutex_unlock(state);
+
+	return 0;
+}
+
+/*
+ * migration_get_data_size callback: report mtty_data_size() for this
+ * device through @stop_copy_length.  Always returns 0.
+ */
+static int mtty_get_data_size(struct vfio_device *vdev,
+			      unsigned long *stop_copy_length)
+{
+	struct mdev_state *state = container_of(vdev, struct mdev_state, vdev);
+
+	*stop_copy_length = mtty_data_size(state);
+
+	return 0;
+}
+
+/* Migration callbacks; installed as vdev->mig_ops by mtty_init_dev(). */
+static const struct vfio_migration_ops mtty_migration_ops = {
+ .migration_set_state = mtty_set_state,
+ .migration_get_state = mtty_get_state,
+ .migration_get_data_size = mtty_get_data_size,
+};
+
+/* log_start stub: ignores its arguments and reports success. */
+static int mtty_log_start(struct vfio_device *vdev,
+ struct rb_root_cached *ranges,
+ u32 nnodes, u64 *page_size)
+{
+ return 0;
+}
+
+/* log_stop stub: does nothing and reports success. */
+static int mtty_log_stop(struct vfio_device *vdev)
+{
+ return 0;
+}
+
+/* log_read_and_clear stub: leaves @dirty untouched and reports success. */
+static int mtty_log_read_and_clear(struct vfio_device *vdev,
+ unsigned long iova, unsigned long length,
+ struct iova_bitmap *dirty)
+{
+ return 0;
+}
+
+/* Logging callbacks (all stubs); installed as vdev->log_ops by mtty_init_dev(). */
+static const struct vfio_log_ops mtty_log_ops = {
+ .log_start = mtty_log_start,
+ .log_stop = mtty_log_stop,
+ .log_read_and_clear = mtty_log_read_and_clear,
+};
+
+/*
+ * Initialise a newly allocated mdev_state.
+ *
+ * Reserves the type's port count from the global mdev_avail_ports pool,
+ * sets up the per-device defaults and locks, allocates and fills the
+ * emulated PCI config space and installs the migration and logging ops.
+ * The device starts in the RUNNING migration state.
+ *
+ * Returns 0 on success, -ENOSPC when not enough ports are left, or
+ * -ENOMEM when the config space cannot be allocated (the reserved ports
+ * are then handed back).
+ */
+static int mtty_init_dev(struct vfio_device *vdev)
+{
+	struct mdev_state *state = container_of(vdev, struct mdev_state, vdev);
+	struct mdev_device *mdev = to_mdev_device(vdev->dev);
+	struct mtty_type *type =
+		container_of(mdev->type, struct mtty_type, type);
+	const int wanted = type->nr_ports;
+	int avail = atomic_read(&mdev_avail_ports);
+
+	/* claim the ports; retry while other updaters race with us */
+	for (;;) {
+		if (avail < wanted)
+			return -ENOSPC;
+		if (atomic_try_cmpxchg(&mdev_avail_ports, &avail,
+				       avail - wanted))
+			break;
+	}
+
+	state->nr_ports = wanted;
+	state->irq_index = -1;
+	state->s[0].max_fifo_size = MAX_FIFO_SIZE;
+	state->s[1].max_fifo_size = MAX_FIFO_SIZE;
+	mutex_init(&state->rxtx_lock);
+
+	state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL);
+	if (!state->vconfig) {
+		/* give the claimed ports back */
+		atomic_add(wanted, &mdev_avail_ports);
+		return -ENOMEM;
+	}
+
+	mutex_init(&state->ops_lock);
+	state->mdev = mdev;
+	mtty_create_config_space(state);
+
+	mutex_init(&state->state_mutex);
+	mutex_init(&state->reset_mutex);
+	vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
+				VFIO_MIGRATION_P2P |
+				VFIO_MIGRATION_PRE_COPY;
+	vdev->mig_ops = &mtty_migration_ops;
+	vdev->log_ops = &mtty_log_ops;
+	state->state = VFIO_DEVICE_STATE_RUNNING;
+
+	return 0;
+}
+
+/*
+ * mdev_driver.probe callback.
+ *
+ * Allocates the vfio device embedding struct mdev_state, registers it as an
+ * emulated-IOMMU device and stores it as the mdev's driver data.
+ *
+ * Returns 0 on success or a negative errno; on registration failure the
+ * reference obtained from vfio_alloc_device() is dropped.
+ */
+static int mtty_probe(struct mdev_device *mdev)
+{
+	struct mdev_state *mdev_state;
+	int ret;
+
+	mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev,
+				       &mtty_dev_ops);
+	if (IS_ERR(mdev_state))
+		return PTR_ERR(mdev_state);
+
+	ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev);
+	if (ret) {
+		vfio_put_device(&mdev_state->vdev);
+		return ret;
+	}
+
+	dev_set_drvdata(&mdev->dev, mdev_state);
+	return 0;
+}
+
+/*
+ * vfio_device_ops.release callback.
+ *
+ * Destroys the reset and state mutexes, returns this device's ports to the
+ * global mdev_avail_ports pool and frees the virtual config space.
+ */
+static void mtty_release_dev(struct vfio_device *vdev)
+{
+	struct mdev_state *mstate = container_of(vdev, struct mdev_state, vdev);
+
+	mutex_destroy(&mstate->reset_mutex);
+	mutex_destroy(&mstate->state_mutex);
+
+	atomic_add(mstate->nr_ports, &mdev_avail_ports);
+
+	kfree(mstate->vconfig);
+}
+
+/*
+ * mdev_driver.remove callback: unregister the vfio device stored as driver
+ * data by mtty_probe() and drop the reference taken at allocation.
+ */
+static void mtty_remove(struct mdev_device *mdev)
+{
+	struct mdev_state *mstate = dev_get_drvdata(&mdev->dev);
+	struct vfio_device *vdev = &mstate->vdev;
+
+	vfio_unregister_group_dev(vdev);
+	vfio_put_device(vdev);
+}
+
+/*
+ * Handle VFIO_DEVICE_RESET.
+ *
+ * Flags a deferred reset under reset_mutex, then tries to take state_mutex
+ * without blocking.  If the trylock succeeds the mutex is dropped again
+ * through mtty_state_mutex_unlock(); if it fails nothing more is done here
+ * (presumably the current state_mutex holder notices deferred_reset when it
+ * unlocks - that helper is not visible in this part of the file).
+ *
+ * Always returns 0.
+ */
+static int mtty_reset(struct mdev_state *mdev_state)
+{
+	bool have_state_lock;
+
+	pr_info("%s: called\n", __func__);
+
+	mutex_lock(&mdev_state->reset_mutex);
+	mdev_state->deferred_reset = true;
+	have_state_lock = mutex_trylock(&mdev_state->state_mutex);
+	mutex_unlock(&mdev_state->reset_mutex);
+
+	if (have_state_lock)
+		mtty_state_mutex_unlock(mdev_state);
+
+	return 0;
+}
+
+/*
+ * vfio_device_ops.read callback.
+ *
+ * Splits the request into 4-, 2- or 1-byte accesses (the widest one that is
+ * aligned to *ppos and still fits in the remaining count), performs each
+ * through mdev_access() and copies the result to user space.
+ *
+ * Returns the number of bytes read, or -EFAULT if any access or user copy
+ * fails (bytes already transferred are not reported in that case).
+ */
+static ssize_t mtty_read(struct vfio_device *vdev, char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	unsigned int done = 0;
+	int ret;
+
+	while (count) {
+		size_t filled;
+		u32 val;
+
+		if (count >= 4 && !(*ppos % 4))
+			filled = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			filled = 2;
+		else
+			filled = 1;
+
+		/* Only the first @filled bytes of val are produced and copied. */
+		ret = mdev_access(mdev_state, (u8 *)&val, filled, *ppos, false);
+		if (ret <= 0)
+			return -EFAULT;
+
+		if (copy_to_user(buf, &val, filled))
+			return -EFAULT;
+
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+}
+
+/*
+ * vfio_device_ops.write callback.
+ *
+ * Mirror image of mtty_read(): the request is split into 4-, 2- or 1-byte
+ * chunks (widest aligned chunk that fits), each chunk is copied in from user
+ * space and then handed to mdev_access() as a write.
+ *
+ * Returns the number of bytes written, or -EFAULT if any user copy or
+ * access fails.
+ */
+static ssize_t mtty_write(struct vfio_device *vdev, const char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	unsigned int done = 0;
+	int ret;
+
+	while (count) {
+		size_t filled;
+		u32 val;
+
+		if (count >= 4 && !(*ppos % 4))
+			filled = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			filled = 2;
+		else
+			filled = 1;
+
+		/* Only the first @filled bytes of val are filled and consumed. */
+		if (copy_from_user(&val, buf, filled))
+			return -EFAULT;
+
+		ret = mdev_access(mdev_state, (u8 *)&val, filled, *ppos, true);
+		if (ret <= 0)
+			return -EFAULT;
+
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	return done;
+}
+
+/*
+ * Tear down INTx signalling: drop the eventfd context, clear the mask and
+ * mark the device as having no active IRQ index.  Does nothing when no INTx
+ * eventfd is installed.
+ */
+static void mtty_disable_intx(struct mdev_state *mdev_state)
+{
+	if (!mdev_state->intx_evtfd)
+		return;
+
+	eventfd_ctx_put(mdev_state->intx_evtfd);
+	mdev_state->intx_evtfd = NULL;
+	mdev_state->intx_mask = false;
+	mdev_state->irq_index = -1;
+}
+
+/*
+ * Tear down MSI signalling: drop the eventfd context and mark the device as
+ * having no active IRQ index.  Does nothing when no MSI eventfd is installed.
+ */
+static void mtty_disable_msi(struct mdev_state *mdev_state)
+{
+	if (!mdev_state->msi_evtfd)
+		return;
+
+	eventfd_ctx_put(mdev_state->msi_evtfd);
+	mdev_state->msi_evtfd = NULL;
+	mdev_state->irq_index = -1;
+}
+
+/*
+ * Apply a VFIO_DEVICE_SET_IRQS request to the device, under ops_lock.
+ *
+ * @flags:  VFIO_IRQ_SET_* action and data-type bits from the request header
+ * @index:  VFIO_PCI_*_IRQ_INDEX being configured
+ * @start:  first sub-index; must be 0 for every accepted request except the
+ *          count == 0 teardown form
+ * @count:  number of sub-indexes; 1, or 0 together with DATA_NONE on a
+ *          TRIGGER action to tear the interrupt down
+ * @data:   payload matching the data type in @flags (bool byte or eventfd
+ *          fd); not dereferenced for DATA_NONE
+ *
+ * Only INTx and MSI are handled.  MSI-X, ERR and REQ indexes, as well as
+ * MASK/UNMASK on MSI, return -ENOTTY.  An index matching none of the cases
+ * falls through the outer switch and returns 0.
+ *
+ * Returns 0 on success, -EINVAL for requests that do not fit the current
+ * interrupt mode or range, -ENOTTY for unsupported operations, or the error
+ * from eventfd_ctx_fdget().
+ */
+static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
+ unsigned int index, unsigned int start,
+ unsigned int count, void *data)
+{
+ int ret = 0;
+
+ mutex_lock(&mdev_state->ops_lock);
+ switch (index) {
+ case VFIO_PCI_INTX_IRQ_INDEX:
+ switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_MASK:
+ if (!is_intx(mdev_state) || start != 0 || count != 1) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ mdev_state->intx_mask = true;
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t mask = *(uint8_t *)data;
+
+ if (mask)
+ mdev_state->intx_mask = true;
+ } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ ret = -ENOTTY; /* No support for mask fd */
+ }
+ break;
+ case VFIO_IRQ_SET_ACTION_UNMASK:
+ if (!is_intx(mdev_state) || start != 0 || count != 1) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ mdev_state->intx_mask = false;
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t mask = *(uint8_t *)data;
+
+ if (mask)
+ mdev_state->intx_mask = false;
+ } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ ret = -ENOTTY; /* No support for unmask fd */
+ }
+ break;
+ case VFIO_IRQ_SET_ACTION_TRIGGER:
+ /* count == 0 with DATA_NONE tears INTx down. */
+ if (is_intx(mdev_state) && !count &&
+ (flags & VFIO_IRQ_SET_DATA_NONE)) {
+ mtty_disable_intx(mdev_state);
+ break;
+ }
+
+ if (!(is_intx(mdev_state) || is_noirq(mdev_state)) ||
+ start != 0 || count != 1) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ int fd = *(int *)data;
+ struct eventfd_ctx *evt;
+
+ /* Any previously installed eventfd is dropped first. */
+ mtty_disable_intx(mdev_state);
+
+ /* A negative fd only removes the old eventfd. */
+ if (fd < 0)
+ break;
+
+ evt = eventfd_ctx_fdget(fd);
+ if (IS_ERR(evt)) {
+ ret = PTR_ERR(evt);
+ break;
+ }
+ mdev_state->intx_evtfd = evt;
+ mdev_state->irq_index = index;
+ break;
+ }
+
+ /* Manual triggering needs INTx to be set up already. */
+ if (!is_intx(mdev_state)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ mtty_trigger_interrupt(mdev_state);
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t trigger = *(uint8_t *)data;
+
+ if (trigger)
+ mtty_trigger_interrupt(mdev_state);
+ }
+ break;
+ }
+ break;
+ case VFIO_PCI_MSI_IRQ_INDEX:
+ switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_MASK:
+ case VFIO_IRQ_SET_ACTION_UNMASK:
+ ret = -ENOTTY;
+ break;
+ case VFIO_IRQ_SET_ACTION_TRIGGER:
+ /* count == 0 with DATA_NONE tears MSI down. */
+ if (is_msi(mdev_state) && !count &&
+ (flags & VFIO_IRQ_SET_DATA_NONE)) {
+ mtty_disable_msi(mdev_state);
+ break;
+ }
+
+ if (!(is_msi(mdev_state) || is_noirq(mdev_state)) ||
+ start != 0 || count != 1) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ int fd = *(int *)data;
+ struct eventfd_ctx *evt;
+
+ /* Any previously installed eventfd is dropped first. */
+ mtty_disable_msi(mdev_state);
+
+ /* A negative fd only removes the old eventfd. */
+ if (fd < 0)
+ break;
+
+ evt = eventfd_ctx_fdget(fd);
+ if (IS_ERR(evt)) {
+ ret = PTR_ERR(evt);
+ break;
+ }
+ mdev_state->msi_evtfd = evt;
+ mdev_state->irq_index = index;
+ break;
+ }
+
+ /* Manual triggering needs MSI to be set up already. */
+ if (!is_msi(mdev_state)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ mtty_trigger_interrupt(mdev_state);
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t trigger = *(uint8_t *)data;
+
+ if (trigger)
+ mtty_trigger_interrupt(mdev_state);
+ }
+ break;
+ }
+ break;
+ case VFIO_PCI_MSIX_IRQ_INDEX:
+ dev_dbg(mdev_state->vdev.dev, "%s: MSIX_IRQ\n", __func__);
+ ret = -ENOTTY;
+ break;
+ case VFIO_PCI_ERR_IRQ_INDEX:
+ dev_dbg(mdev_state->vdev.dev, "%s: ERR_IRQ\n", __func__);
+ ret = -ENOTTY;
+ break;
+ case VFIO_PCI_REQ_IRQ_INDEX:
+ dev_dbg(mdev_state->vdev.dev, "%s: REQ_IRQ\n", __func__);
+ ret = -ENOTTY;
+ break;
+ }
+
+ mutex_unlock(&mdev_state->ops_lock);
+ return ret;
+}
+
+/*
+ * vfio_device_ops.get_region_info_caps callback.
+ *
+ * Fills @region_info for the requested region index: the config space and
+ * BAR0 are always exposed, BAR1 only when the device has two ports, and all
+ * other regions report size 0.  The size and offset are also cached in
+ * mdev_state->region_info[].  @caps is not used.
+ *
+ * Returns 0, or -EINVAL when the index is not below VFIO_PCI_NUM_REGIONS.
+ */
+static int mtty_ioctl_get_region_info(struct vfio_device *vdev,
+				      struct vfio_region_info *region_info,
+				      struct vfio_info_cap *caps)
+{
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+	u32 bar_index = region_info->index;
+	unsigned int size;
+
+	if (bar_index >= VFIO_PCI_NUM_REGIONS)
+		return -EINVAL;
+
+	mutex_lock(&mdev_state->ops_lock);
+
+	if (bar_index == VFIO_PCI_CONFIG_REGION_INDEX)
+		size = MTTY_CONFIG_SPACE_SIZE;
+	else if (bar_index == VFIO_PCI_BAR0_REGION_INDEX)
+		size = MTTY_IO_BAR_SIZE;
+	else if (bar_index == VFIO_PCI_BAR1_REGION_INDEX &&
+		 mdev_state->nr_ports == 2)
+		size = MTTY_IO_BAR_SIZE;
+	else
+		size = 0;
+
+	mdev_state->region_info[bar_index].size = size;
+	mdev_state->region_info[bar_index].vfio_offset =
+		MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
+
+	region_info->size = size;
+	region_info->offset = MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
+	region_info->flags = VFIO_REGION_INFO_FLAG_READ |
+			     VFIO_REGION_INFO_FLAG_WRITE;
+
+	mutex_unlock(&mdev_state->ops_lock);
+	return 0;
+}
+
+/*
+ * Describe one interrupt index for VFIO_DEVICE_GET_IRQ_INFO.
+ *
+ * Only INTx and MSI are reported, each with a single eventfd-capable
+ * vector: INTx is additionally maskable and auto-masked, MSI is marked
+ * NORESIZE.  Any other index yields -EINVAL and leaves @irq_info untouched.
+ */
+static int mtty_get_irq_info(struct vfio_irq_info *irq_info)
+{
+	switch (irq_info->index) {
+	case VFIO_PCI_INTX_IRQ_INDEX:
+		irq_info->flags = VFIO_IRQ_INFO_EVENTFD |
+				  VFIO_IRQ_INFO_MASKABLE |
+				  VFIO_IRQ_INFO_AUTOMASKED;
+		break;
+	case VFIO_PCI_MSI_IRQ_INDEX:
+		irq_info->flags = VFIO_IRQ_INFO_EVENTFD |
+				  VFIO_IRQ_INFO_NORESIZE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	irq_info->count = 1;
+	return 0;
+}
+
+/*
+ * Fill @dev_info for VFIO_DEVICE_GET_INFO: the device is reported as a PCI
+ * device with VFIO_PCI_NUM_REGIONS regions and VFIO_PCI_NUM_IRQS interrupt
+ * indexes.  Always returns 0.
+ */
+static int mtty_get_device_info(struct vfio_device_info *dev_info)
+{
+ dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
+ dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
+ dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
+
+ return 0;
+}
+
+/*
+ * vfio_device_ops.ioctl callback.
+ *
+ * Handles VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_IRQ_INFO,
+ * VFIO_DEVICE_SET_IRQS and VFIO_DEVICE_RESET; every other command returns
+ * -ENOTTY.  @arg is the user-space pointer to the command's argument struct.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for an
+ * undersized argsz or out-of-range IRQ index, or the error of the helper
+ * that served the request.
+ *
+ * NOTE(review): GET_IRQ_INFO and SET_IRQS validate against
+ * mdev_state->dev_info.num_irqs, which in this function is only written by
+ * the GET_INFO branch.  Unless it is initialised elsewhere, a caller must
+ * issue GET_INFO first - confirm.
+ */
+static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+ int ret = 0;
+ unsigned long minsz;
+
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ ret = mtty_get_device_info(&info);
+ if (ret)
+ return ret;
+
+ /* Cache the reply; later IRQ ioctls read num_irqs from it. */
+ memcpy(&mdev_state->dev_info, &info, sizeof(info));
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if ((info.argsz < minsz) ||
+ (info.index >= mdev_state->dev_info.num_irqs))
+ return -EINVAL;
+
+ ret = mtty_get_irq_info(&info);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+
+ return 0;
+ }
+ case VFIO_DEVICE_SET_IRQS:
+ {
+ struct vfio_irq_set hdr;
+ u8 *data = NULL, *ptr = NULL;
+ size_t data_size = 0;
+
+ minsz = offsetofend(struct vfio_irq_set, count);
+
+ if (copy_from_user(&hdr, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ ret = vfio_set_irqs_validate_and_prepare(&hdr,
+ mdev_state->dev_info.num_irqs,
+ VFIO_PCI_NUM_IRQS,
+ &data_size);
+ if (ret)
+ return ret;
+
+ /* The payload, if any, directly follows the fixed header. */
+ if (data_size) {
+ ptr = data = memdup_user((void __user *)(arg + minsz),
+ data_size);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ }
+
+ ret = mtty_set_irqs(mdev_state, hdr.flags, hdr.index, hdr.start,
+ hdr.count, data);
+
+ /* ptr is NULL when there was no payload; kfree(NULL) is a no-op. */
+ kfree(ptr);
+ return ret;
+ }
+ case VFIO_DEVICE_RESET:
+ return mtty_reset(mdev_state);
+ }
+ return -ENOTTY;
+}
+
+/*
+ * sysfs show handler for the "sample_mdev_dev" attribute: writes a fixed
+ * sentence containing the device name into @buf and returns its length.
+ *
+ * NOTE(review): kernel sysfs guidance prefers sysfs_emit() over sprintf()
+ * in show() callbacks - consider converting.
+ */
+static ssize_t
+sample_mdev_dev_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "This is MDEV %s\n", dev_name(dev));
+}
+
+/* Read-only attribute backed by sample_mdev_dev_show(). */
+static DEVICE_ATTR_RO(sample_mdev_dev);
+
+/* NULL-terminated list of attributes placed in the "vendor" group. */
+static struct attribute *mdev_dev_attrs[] = {
+ &dev_attr_sample_mdev_dev.attr,
+ NULL,
+};
+
+/* Attribute group named "vendor". */
+static const struct attribute_group mdev_dev_group = {
+ .name = "vendor",
+ .attrs = mdev_dev_attrs,
+};
+
+/* NULL-terminated group list; used as mtty_driver.driver.dev_groups. */
+static const struct attribute_group *mdev_dev_groups[] = {
+ &mdev_dev_group,
+ NULL,
+};
+
+/*
+ * mdev_driver.get_available callback: number of further devices of this
+ * type that fit into the ports currently left in the global pool.
+ */
+static unsigned int mtty_get_available(struct mdev_type *mtype)
+{
+	struct mtty_type *mtty = container_of(mtype, struct mtty_type, type);
+
+	return atomic_read(&mdev_avail_ports) / mtty->nr_ports;
+}
+
+/*
+ * vfio_device_ops.close_device callback: calls mtty_disable_files() and
+ * then releases any INTx and MSI eventfds still installed.
+ */
+static void mtty_close(struct vfio_device *vdev)
+{
+ struct mdev_state *mdev_state =
+ container_of(vdev, struct mdev_state, vdev);
+
+ mtty_disable_files(mdev_state);
+ mtty_disable_intx(mdev_state);
+ mtty_disable_msi(mdev_state);
+}
+
+/*
+ * VFIO device callbacks, passed to vfio_alloc_device() in mtty_probe().
+ * The iommufd hooks use the vfio_iommufd_emulated_* helpers.
+ */
+static const struct vfio_device_ops mtty_dev_ops = {
+ .name = "vfio-mtty",
+ .init = mtty_init_dev,
+ .release = mtty_release_dev,
+ .read = mtty_read,
+ .write = mtty_write,
+ .ioctl = mtty_ioctl,
+ .get_region_info_caps = mtty_ioctl_get_region_info,
+ .bind_iommufd = vfio_iommufd_emulated_bind,
+ .unbind_iommufd = vfio_iommufd_emulated_unbind,
+ .attach_ioas = vfio_iommufd_emulated_attach_ioas,
+ .detach_ioas = vfio_iommufd_emulated_detach_ioas,
+ .close_device = mtty_close,
+};
+
+/*
+ * The mdev driver; registered by mtty_dev_init() and also passed to
+ * mdev_register_parent() there.
+ */
+static struct mdev_driver mtty_driver = {
+ .device_api = VFIO_DEVICE_API_PCI_STRING,
+ .driver = {
+ .name = "mtty",
+ .owner = THIS_MODULE,
+ .mod_name = KBUILD_MODNAME,
+ .dev_groups = mdev_dev_groups,
+ },
+ .probe = mtty_probe,
+ .remove = mtty_remove,
+ .get_available = mtty_get_available,
+};
+
+/*
+ * release() hook of the parent device (set in mtty_dev_init()); it only
+ * emits a debug message and frees nothing.
+ */
+static void mtty_device_release(struct device *dev)
+{
+ dev_dbg(dev, "mtty: released\n");
+}
+
+/*
+ * Module init.
+ *
+ * Sets up, in order: the IDR, a char-device region with its cdev, the mdev
+ * driver, the device class, the parent device, and finally the mdev parent
+ * with its supported types.  Each failure unwinds everything set up before
+ * it through the goto ladder at the bottom.
+ *
+ * Returns 0 on success or the negative errno of the step that failed.
+ */
+static int __init mtty_dev_init(void)
+{
+	int ret = 0;
+
+	pr_info("mtty_dev: %s\n", __func__);
+
+	memset(&mtty_dev, 0, sizeof(mtty_dev));
+
+	idr_init(&mtty_dev.vd_idr);
+
+	ret = alloc_chrdev_region(&mtty_dev.vd_devt, 0, MINORMASK + 1,
+				  MTTY_NAME);
+
+	if (ret < 0) {
+		pr_err("Error: failed to register mtty_dev, err:%d\n", ret);
+		return ret;
+	}
+
+	cdev_init(&mtty_dev.vd_cdev, &vd_fops);
+	/*
+	 * cdev_add() can fail; do not carry on with a cdev that never went
+	 * live.  Only the chrdev region has to be released in that case.
+	 */
+	ret = cdev_add(&mtty_dev.vd_cdev, mtty_dev.vd_devt, MINORMASK + 1);
+	if (ret) {
+		pr_err("Error: failed to add mtty_dev cdev, err:%d\n", ret);
+		goto err_chrdev;
+	}
+
+	pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt));
+
+	ret = mdev_register_driver(&mtty_driver);
+	if (ret)
+		goto err_cdev;
+
+	mtty_dev.vd_class = class_create(MTTY_CLASS_NAME);
+
+	if (IS_ERR(mtty_dev.vd_class)) {
+		pr_err("Error: failed to register mtty_dev class\n");
+		ret = PTR_ERR(mtty_dev.vd_class);
+		goto err_driver;
+	}
+
+	mtty_dev.dev.class = mtty_dev.vd_class;
+	mtty_dev.dev.release = mtty_device_release;
+	dev_set_name(&mtty_dev.dev, "%s", MTTY_NAME);
+
+	ret = device_register(&mtty_dev.dev);
+	if (ret)
+		goto err_put;
+
+	ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev,
+				   &mtty_driver, mtty_mdev_types,
+				   ARRAY_SIZE(mtty_mdev_types));
+	if (ret)
+		goto err_device;
+	return 0;
+
+err_device:
+	device_del(&mtty_dev.dev);
+err_put:
+	/* Also the required cleanup after a failed device_register(). */
+	put_device(&mtty_dev.dev);
+	class_destroy(mtty_dev.vd_class);
+err_driver:
+	mdev_unregister_driver(&mtty_driver);
+err_cdev:
+	cdev_del(&mtty_dev.vd_cdev);
+err_chrdev:
+	unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1);
+	return ret;
+}
+
+/*
+ * Module exit: unregisters the mdev parent and the parent device, destroys
+ * the IDR, then unregisters the driver, the cdev with its chrdev region and
+ * the class that mtty_dev_init() created.
+ */
+static void __exit mtty_dev_exit(void)
+{
+ mtty_dev.dev.bus = NULL;
+ mdev_unregister_parent(&mtty_dev.parent);
+
+ device_unregister(&mtty_dev.dev);
+ idr_destroy(&mtty_dev.vd_idr);
+ mdev_unregister_driver(&mtty_driver);
+ cdev_del(&mtty_dev.vd_cdev);
+ unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1);
+ class_destroy(mtty_dev.vd_class);
+ mtty_dev.vd_class = NULL;
+ pr_info("mtty_dev: Unloaded!\n");
+}
+
+module_init(mtty_dev_init)
+module_exit(mtty_dev_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Test driver that simulate serial port over PCI");
+MODULE_VERSION(VERSION_STRING);
+MODULE_AUTHOR(DRIVER_AUTHOR);
diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore
new file mode 100644
index 000000000000..8708341bc082
--- /dev/null
+++ b/samples/vfs/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/test-fsmount
+/test-list-all-mounts
+/test-statx
+/mountinfo
diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile
new file mode 100644
index 000000000000..9256ca5d762b
--- /dev/null
+++ b/samples/vfs/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+userprogs-always-y += test-fsmount test-statx mountinfo test-list-all-mounts
+
+userccflags += -I $(srctree)/tools/testing/selftests/
+userccflags += -I usr/include
diff --git a/samples/vfs/mountinfo.c b/samples/vfs/mountinfo.c
new file mode 100644
index 000000000000..bc78275cac69
--- /dev/null
+++ b/samples/vfs/mountinfo.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Use pidfds, nsfds, listmount() and statmount() to mimic the
+ * contents of /proc/self/mountinfo.
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <alloca.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "samples-vfs.h"
+
+/* max mounts per listmount call */
+#define MAXMOUNTS 1024
+
+/* size of struct statmount (including trailing string buffer) */
+#define STATMOUNT_BUFSIZE 4096
+
+static bool ext_format;
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open -1
+#endif
+
+/*
+ * There are no bindings in glibc for listmount() and statmount() (yet),
+ * make our own here.
+ */
+/*
+ * Raw statmount(2) wrapper.
+ *
+ * Builds the mnt_id_req for @mnt_id with @mask as the request parameter.
+ * A non-zero @mnt_ns_id selects the larger VER1 request layout and is
+ * passed along; zero keeps the VER0 layout.  Returns the syscall() result.
+ */
+static int statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask,
+		     struct statmount *buf, size_t bufsize,
+		     unsigned int flags)
+{
+	struct mnt_id_req req = {
+		.size = mnt_ns_id ? MNT_ID_REQ_SIZE_VER1 : MNT_ID_REQ_SIZE_VER0,
+		.mnt_id = mnt_id,
+		.param = mask,
+		.mnt_ns_id = mnt_ns_id,
+	};
+
+	return syscall(__NR_statmount, &req, buf, bufsize, flags);
+}
+
+/*
+ * Raw listmount(2) wrapper.
+ *
+ * Builds the mnt_id_req for @mnt_id with @last_mnt_id as the request
+ * parameter.  A non-zero @mnt_ns_id selects the larger VER1 request layout
+ * and is passed along; zero keeps the VER0 layout.  Up to @num ids are
+ * written to @list.  Returns the syscall() result.
+ */
+static ssize_t listmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 last_mnt_id,
+			 __u64 list[], size_t num, unsigned int flags)
+{
+	struct mnt_id_req req = {
+		.size = mnt_ns_id ? MNT_ID_REQ_SIZE_VER1 : MNT_ID_REQ_SIZE_VER0,
+		.mnt_id = mnt_id,
+		.param = last_mnt_id,
+		.mnt_ns_id = mnt_ns_id,
+	};
+
+	return syscall(__NR_listmount, &req, list, num, flags);
+}
+
+/*
+ * Print the per-mount options encoded in @flags (MOUNT_ATTR_* bits) as a
+ * comma-separated list starting with "ro" or "rw".  Strict atime prints
+ * nothing.
+ */
+static void show_mnt_attrs(__u64 flags)
+{
+	const __u64 atime = flags & MOUNT_ATTR__ATIME;
+
+	if (flags & MOUNT_ATTR_RDONLY)
+		printf("%s", "ro");
+	else
+		printf("%s", "rw");
+
+	if (flags & MOUNT_ATTR_NOSUID)
+		printf(",nosuid");
+	if (flags & MOUNT_ATTR_NODEV)
+		printf(",nodev");
+	if (flags & MOUNT_ATTR_NOEXEC)
+		printf(",noexec");
+
+	/* The atime field is a multi-bit value, not independent flags. */
+	if (atime == MOUNT_ATTR_RELATIME)
+		printf(",relatime");
+	else if (atime == MOUNT_ATTR_NOATIME)
+		printf(",noatime");
+
+	if (flags & MOUNT_ATTR_NODIRATIME)
+		printf(",nodiratime");
+	if (flags & MOUNT_ATTR_NOSYMFOLLOW)
+		printf(",nosymfollow");
+	if (flags & MOUNT_ATTR_IDMAP)
+		printf(",idmapped");
+}
+
+/*
+ * Print the propagation fields for @sm: " shared:N", " master:N" (plus
+ * " propagate_from:N" when that id is non-zero and differs from the
+ * master) and " unbindable", each only when the matching MS_* bit is set.
+ */
+static void show_propagation(struct statmount *sm)
+{
+	const __u64 prop = sm->mnt_propagation;
+
+	if (prop & MS_SHARED)
+		printf(" shared:%llu", sm->mnt_peer_group);
+
+	if (prop & MS_SLAVE) {
+		const __u64 from = sm->propagate_from;
+
+		printf(" master:%llu", sm->mnt_master);
+		if (from != 0 && from != sm->mnt_master)
+			printf(" propagate_from:%llu", sm->propagate_from);
+	}
+
+	if (prop & MS_UNBINDABLE)
+		printf(" unbindable");
+}
+
+/*
+ * Print the superblock options encoded in @flags (MS_* bits) as a
+ * comma-separated list starting with "ro" or "rw".
+ */
+static void show_sb_flags(__u64 flags)
+{
+	/* Output order is fixed by the table order. */
+	static const struct {
+		__u64 bit;
+		const char *text;
+	} opts[] = {
+		{ MS_SYNCHRONOUS, ",sync" },
+		{ MS_DIRSYNC, ",dirsync" },
+		{ MS_MANDLOCK, ",mand" },
+		{ MS_LAZYTIME, ",lazytime" },
+	};
+	size_t k;
+
+	printf("%s", flags & MS_RDONLY ? "ro" : "rw");
+
+	for (k = 0; k < sizeof(opts) / sizeof(opts[0]); k++) {
+		if (flags & opts[k].bit)
+			printf("%s", opts[k].text);
+	}
+}
+
+/*
+ * Query one mount with statmount() and print it as a single line laid out
+ * like a /proc/self/mountinfo entry.  With ext_format set, the namespace
+ * id, the 64-bit mount id and the parent's 64-bit id are printed first.
+ *
+ * Returns 0 on success, 1 if statmount() fails (after perror()).
+ *
+ * NOTE(review): fs_type, mnt_root and mnt_point are printed without
+ * checking the matching bits in buf->mask, unlike fs_subtype, sb_source and
+ * mnt_opts below.
+ */
+static int dump_mountinfo(__u64 mnt_id, __u64 mnt_ns_id)
+{
+ int ret;
+ /* Stack buffer holding struct statmount plus its trailing strings. */
+ struct statmount *buf = alloca(STATMOUNT_BUFSIZE);
+ const __u64 mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC |
+ STATMOUNT_PROPAGATE_FROM | STATMOUNT_FS_TYPE |
+ STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT |
+ STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE |
+ STATMOUNT_SB_SOURCE;
+
+ ret = statmount(mnt_id, mnt_ns_id, mask, buf, STATMOUNT_BUFSIZE, 0);
+ if (ret < 0) {
+ perror("statmount");
+ return 1;
+ }
+
+ if (ext_format)
+ printf("0x%llx 0x%llx 0x%llx ", mnt_ns_id, mnt_id, buf->mnt_parent_id);
+
+ /* String fields are offsets into the trailing buf->str[] area. */
+ printf("%u %u %u:%u %s %s ", buf->mnt_id_old, buf->mnt_parent_id_old,
+ buf->sb_dev_major, buf->sb_dev_minor,
+ &buf->str[buf->mnt_root],
+ &buf->str[buf->mnt_point]);
+ show_mnt_attrs(buf->mnt_attr);
+ show_propagation(buf);
+
+ printf(" - %s", &buf->str[buf->fs_type]);
+ if (buf->mask & STATMOUNT_FS_SUBTYPE)
+ printf(".%s", &buf->str[buf->fs_subtype]);
+ if (buf->mask & STATMOUNT_SB_SOURCE)
+ printf(" %s ", &buf->str[buf->sb_source]);
+ else
+ printf(" :none ");
+
+ show_sb_flags(buf->sb_flags);
+ if (buf->mask & STATMOUNT_MNT_OPTS)
+ printf(",%s", &buf->str[buf->mnt_opts]);
+ printf("\n");
+ return 0;
+}
+
+/*
+ * Print every mount in the mount namespace @mnt_ns_id.
+ *
+ * The ids are fetched with listmount() in batches of MAXMOUNTS, each batch
+ * resuming after the last id of the previous one, and every id is printed
+ * through dump_mountinfo().
+ *
+ * Returns 0 on success, 1 if listmount() fails or reports more entries than
+ * were asked for, or the non-zero result of a failing dump_mountinfo().
+ */
+static int dump_mounts(__u64 mnt_ns_id)
+{
+	__u64 mntid[MAXMOUNTS];
+	__u64 last_mnt_id = 0;
+	ssize_t count;
+	int i;
+
+	/*
+	 * Get a list of all mntids in mnt_ns_id. If it returns MAXMOUNTS
+	 * mounts, then go again until we get everything.
+	 */
+	do {
+		count = listmount(LSMT_ROOT, mnt_ns_id, last_mnt_id, mntid, MAXMOUNTS, 0);
+		if (count < 0 || count > MAXMOUNTS) {
+			errno = count < 0 ? errno : count;
+			perror("listmount");
+			return 1;
+		}
+
+		/* Walk the returned mntids and print info about each */
+		for (i = 0; i < count; ++i) {
+			int ret = dump_mountinfo(mntid[i], mnt_ns_id);
+
+			if (ret != 0)
+				return ret;
+		}
+
+		/*
+		 * Set up last_mnt_id to pick up where we left off.  An empty
+		 * batch has no last element: indexing mntid[count - 1] with
+		 * count == 0 would read before the start of the array.
+		 */
+		if (count > 0)
+			last_mnt_id = mntid[count - 1];
+	} while (count == MAXMOUNTS);
+	return 0;
+}
+
+/* Print the command-line synopsis for @prog and its options to stdout. */
+static void usage(const char * const prog)
+{
+ printf("Usage:\n");
+ printf("%s [-e] [-p pid] [-r] [-h]\n", prog);
+ printf(" -e: extended format\n");
+ printf(" -h: print usage message\n");
+ printf(" -p: get mount namespace from given pid\n");
+ printf(" -r: recursively print all mounts in all child namespaces\n");
+}
+
+/*
+ * Entry point.
+ *
+ * Options: -e extended output, -h usage, -p PID to take the mount namespace
+ * from (default: this process), -r to continue with every namespace that
+ * NS_MNT_GET_NEXT returns after the first one.
+ *
+ * Exit status is 0 on success and non-zero on a bad command line or when a
+ * syscall/ioctl needed to reach or dump a namespace fails.
+ */
+int main(int argc, char * const *argv)
+{
+	struct mnt_ns_info mni = { .size = MNT_NS_INFO_SIZE_VER0 };
+	int pidfd, mntns, ret, opt;
+	pid_t pid = getpid();
+	bool recursive = false;
+
+	while ((opt = getopt(argc, argv, "ehp:r")) != -1) {
+		switch (opt) {
+		case 'e':
+			ext_format = true;
+			break;
+		case 'h':
+			usage(argv[0]);
+			return 0;
+		case 'p': {
+			char *end;
+			long val;
+
+			/* strtol() instead of atoi(): reject junk and overflow. */
+			errno = 0;
+			val = strtol(optarg, &end, 10);
+			if (errno || end == optarg || *end != '\0' ||
+			    val <= 0 || (long)(pid_t)val != val) {
+				fprintf(stderr, "%s: invalid pid '%s'\n",
+					argv[0], optarg);
+				return 1;
+			}
+			pid = (pid_t)val;
+			break;
+		}
+		case 'r':
+			recursive = true;
+			break;
+		default:
+			/* getopt() already reported the bad option. */
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	/* Get a pidfd for pid */
+	pidfd = syscall(__NR_pidfd_open, pid, 0);
+	if (pidfd < 0) {
+		perror("pidfd_open");
+		return 1;
+	}
+
+	/* Get the mnt namespace for pidfd */
+	mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, NULL);
+	if (mntns < 0) {
+		perror("PIDFD_GET_MNT_NAMESPACE");
+		close(pidfd);
+		return 1;
+	}
+	close(pidfd);
+
+	/* get info about mntns. In particular, the mnt_ns_id */
+	ret = ioctl(mntns, NS_MNT_GET_INFO, &mni);
+	if (ret < 0) {
+		perror("NS_MNT_GET_INFO");
+		close(mntns);
+		return 1;
+	}
+
+	do {
+		int next;
+
+		ret = dump_mounts(mni.mnt_ns_id);
+		if (ret || !recursive)
+			break;
+
+		/* get the next mntns (and overwrite the old mount ns info) */
+		next = ioctl(mntns, NS_MNT_GET_NEXT, &mni);
+		close(mntns);
+		mntns = next;
+	} while (mntns >= 0);
+
+	/* Still open unless the walk ended on a failed NS_MNT_GET_NEXT. */
+	if (mntns >= 0)
+		close(mntns);
+
+	return ret;
+}
diff --git a/samples/vfs/samples-vfs.h b/samples/vfs/samples-vfs.h
new file mode 100644
index 000000000000..498baf581b56
--- /dev/null
+++ b/samples/vfs/samples-vfs.h
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __SAMPLES_VFS_H
+#define __SAMPLES_VFS_H
+
+#include <errno.h>
+#include <linux/types.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+/*
+ * Print the current errno text (via the "%m" conversion), the source
+ * location and a printf-style message to stderr, then terminate with
+ * EXIT_FAILURE.
+ *
+ * The expansion uses fprintf() and exit(), which the headers included above
+ * do not declare; the including file has to pull in <stdio.h> and
+ * <stdlib.h> itself.
+ */
+#define die_errno(format, ...) \
+ do { \
+ fprintf(stderr, "%m | %s: %d: %s: " format "\n", __FILE__, \
+ __LINE__, __func__, ##__VA_ARGS__); \
+ exit(EXIT_FAILURE); \
+ } while (0)
+
+/*
+ * Local copy of the statmount(2) result layout.
+ *
+ * Fields marked [str] are offsets into the trailing str[] area, not
+ * pointers.  mask reports which groups of fields were actually written.
+ */
+struct statmount {
+ __u32 size; /* Total size, including strings */
+ __u32 mnt_opts; /* [str] Options (comma separated, escaped) */
+ __u64 mask; /* What results were written */
+ __u32 sb_dev_major; /* Device ID */
+ __u32 sb_dev_minor;
+ __u64 sb_magic; /* ..._SUPER_MAGIC */
+ __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
+ __u32 fs_type; /* [str] Filesystem type */
+ __u64 mnt_id; /* Unique ID of mount */
+ __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */
+ __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */
+ __u32 mnt_parent_id_old;
+ __u64 mnt_attr; /* MOUNT_ATTR_... */
+ __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
+ __u64 mnt_peer_group; /* ID of shared peer group */
+ __u64 mnt_master; /* Mount receives propagation from this ID */
+ __u64 propagate_from; /* Propagation from in current namespace */
+ __u32 mnt_root; /* [str] Root of mount relative to root of fs */
+ __u32 mnt_point; /* [str] Mountpoint relative to current root */
+ __u64 mnt_ns_id; /* ID of the mount namespace */
+ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */
+ __u32 sb_source; /* [str] Source string of the mount */
+ __u32 opt_num; /* Number of fs options */
+ __u32 opt_array; /* [str] Array of nul terminated fs options */
+ __u32 opt_sec_num; /* Number of security options */
+ __u32 opt_sec_array; /* [str] Array of nul terminated security options */
+ __u32 mnt_uidmap_num; /* Number of uid mappings */
+ __u32 mnt_uidmap; /* [str] Array of uid mappings */
+ __u32 mnt_gidmap_num; /* Number of gid mappings */
+ __u32 mnt_gidmap; /* [str] Array of gid mappings */
+ __u64 __spare2[44];
+ char str[]; /* Variable size part containing strings */
+};
+
+/*
+ * Request block handed to the statmount()/listmount() syscalls.
+ *
+ * size tells the kernel which layout is in use: MNT_ID_REQ_SIZE_VER0 (24)
+ * covers the fields up to param, MNT_ID_REQ_SIZE_VER1 (32) adds mnt_ns_id.
+ */
+struct mnt_id_req {
+ __u32 size; /* MNT_ID_REQ_SIZE_VER* */
+ __u32 spare;
+ __u64 mnt_id; /* mount to query, or LSMT_ROOT when listing */
+ __u64 param; /* statmount: request mask; listmount: last mount id */
+ __u64 mnt_ns_id; /* mount namespace id (VER1 layout only) */
+};
+
+#ifndef MNT_ID_REQ_SIZE_VER0
+#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */
+#endif
+
+#ifndef MNT_ID_REQ_SIZE_VER1
+#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */
+#endif
+
+/* Get the id for a mount namespace */
+#ifndef NS_GET_MNTNS_ID
+#define NS_GET_MNTNS_ID _IO(0xb7, 0x5)
+#endif
+
+/*
+ * Argument of the NS_MNT_GET_INFO / NS_MNT_GET_NEXT / NS_MNT_GET_PREV
+ * ioctls defined below.
+ */
+struct mnt_ns_info {
+ __u32 size; /* MNT_NS_INFO_SIZE_VER0 for this layout */
+ __u32 nr_mounts;
+ __u64 mnt_ns_id; /* id usable as mnt_id_req.mnt_ns_id */
+};
+
+#ifndef MNT_NS_INFO_SIZE_VER0
+#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
+#endif
+
+#ifndef NS_MNT_GET_INFO
+#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info)
+#endif
+
+#ifndef NS_MNT_GET_NEXT
+#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info)
+#endif
+
+#ifndef NS_MNT_GET_PREV
+#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info)
+#endif
+
+#ifndef PIDFD_GET_MNT_NAMESPACE
+#define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3)
+#endif
+
+#ifndef __NR_listmount
+#define __NR_listmount 458
+#endif
+
+#ifndef __NR_statmount
+#define __NR_statmount 457
+#endif
+
+#ifndef LSMT_ROOT
+#define LSMT_ROOT 0xffffffffffffffff /* root mount */
+#endif
+
+/* @mask bits for statmount(2) */
+#ifndef STATMOUNT_SB_BASIC
+#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */
+#endif
+
+#ifndef STATMOUNT_MNT_BASIC
+#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */
+#endif
+
+#ifndef STATMOUNT_PROPAGATE_FROM
+#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */
+#endif
+
+#ifndef STATMOUNT_MNT_ROOT
+#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */
+#endif
+
+#ifndef STATMOUNT_MNT_POINT
+#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */
+#endif
+
+#ifndef STATMOUNT_FS_TYPE
+#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */
+#endif
+
+#ifndef STATMOUNT_MNT_NS_ID
+#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */
+#endif
+
+#ifndef STATMOUNT_MNT_OPTS
+#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */
+#endif
+
+#ifndef STATMOUNT_FS_SUBTYPE
+#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */
+#endif
+
+#ifndef STATMOUNT_SB_SOURCE
+#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */
+#endif
+
+#ifndef STATMOUNT_OPT_ARRAY
+#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */
+#endif
+
+#ifndef STATMOUNT_OPT_SEC_ARRAY
+#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */
+#endif
+
+#ifndef STATX_MNT_ID_UNIQUE
+#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
+#endif
+
+#ifndef STATMOUNT_MNT_UIDMAP
+#define STATMOUNT_MNT_UIDMAP 0x00002000U /* Want/got uidmap... */
+#endif
+
+#ifndef STATMOUNT_MNT_GIDMAP
+#define STATMOUNT_MNT_GIDMAP 0x00004000U /* Want/got gidmap... */
+#endif
+
+#ifndef MOUNT_ATTR_RDONLY
+#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */
+#endif
+
+#ifndef MOUNT_ATTR_NOSUID
+#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */
+#endif
+
+#ifndef MOUNT_ATTR_NODEV
+#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */
+#endif
+
+#ifndef MOUNT_ATTR_NOEXEC
+#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */
+#endif
+
+#ifndef MOUNT_ATTR__ATIME
+#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */
+#endif
+
+#ifndef MOUNT_ATTR_RELATIME
+#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */
+#endif
+
+#ifndef MOUNT_ATTR_NOATIME
+#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */
+#endif
+
+#ifndef MOUNT_ATTR_STRICTATIME
+#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */
+#endif
+
+#ifndef MOUNT_ATTR_NODIRATIME
+#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */
+#endif
+
+#ifndef MOUNT_ATTR_IDMAP
+#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */
+#endif
+
+#ifndef MOUNT_ATTR_NOSYMFOLLOW
+#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */
+#endif
+
+#ifndef MS_RDONLY
+#define MS_RDONLY 1 /* Mount read-only */
+#endif
+
+#ifndef MS_SYNCHRONOUS
+#define MS_SYNCHRONOUS 16 /* Writes are synced at once */
+#endif
+
+#ifndef MS_MANDLOCK
+#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
+#endif
+
+#ifndef MS_DIRSYNC
+#define MS_DIRSYNC 128 /* Directory modifications are synchronous */
+#endif
+
+#ifndef MS_UNBINDABLE
+#define MS_UNBINDABLE (1<<17) /* change to unbindable */
+#endif
+
+#ifndef MS_PRIVATE
+#define MS_PRIVATE (1<<18) /* change to private */
+#endif
+
+#ifndef MS_SLAVE
+#define MS_SLAVE (1<<19) /* change to slave */
+#endif
+
+#ifndef MS_SHARED
+#define MS_SHARED (1<<20) /* change to shared */
+#endif
+
+#ifndef MS_LAZYTIME
+#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
+#endif
+
+#endif /* __SAMPLES_VFS_H */
diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c
new file mode 100644
index 000000000000..50f47b72e85f
--- /dev/null
+++ b/samples/vfs/test-fsmount.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* fd-based mount test.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <linux/mount.h>
+#include <linux/unistd.h>
+/* Evaluate x; if the result is -1, perror() the expression text and exit(1). */
+#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0)
+
+static void check_messages(int fd)
+{
+ char buf[4096];
+ int err, n;
+ /* Drain the messages readable from fd onto stderr; the caller's errno is preserved. */
+ err = errno;
+
+ for (;;) {
+ n = read(fd, buf, sizeof(buf));
+ if (n < 2) /* error, EOF, or too short to hold the 2-byte prefix: stop */
+ break;
+ n -= 2;
+ /* buf[0] selects the severity; the text starts at buf + 2 and is not NUL-terminated. */
+ switch (buf[0]) {
+ case 'e':
+ fprintf(stderr, "Error: %*.*s\n", n, n, buf + 2);
+ break;
+ case 'w':
+ fprintf(stderr, "Warning: %*.*s\n", n, n, buf + 2);
+ break;
+ case 'i':
+ fprintf(stderr, "Info: %*.*s\n", n, n, buf + 2);
+ break;
+ }
+ }
+
+ errno = err;
+}
+
+static __attribute__((noreturn))
+void mount_error(int fd, const char *s)
+{
+ check_messages(fd); /* print any queued messages from fd before the summary line */
+ fprintf(stderr, "%s: %m\n", s); /* %m expands to the text for the current errno */
+ exit(1);
+}
+
+/* Fallback numbers for headers that lack these syscalls. Hope -1 isn't a syscall */
+#ifndef __NR_fsopen
+#define __NR_fsopen -1
+#endif
+#ifndef __NR_fsmount
+#define __NR_fsmount -1
+#endif
+#ifndef __NR_fsconfig
+#define __NR_fsconfig -1
+#endif
+#ifndef __NR_move_mount
+#define __NR_move_mount -1
+#endif
+
+/* Issue the fsopen syscall directly through syscall() and return its result. */
+static inline int fsopen(const char *fs_name, unsigned int flags)
+{
+ return syscall(__NR_fsopen, fs_name, flags);
+}
+/* Issue the fsmount syscall directly through syscall() and return its result. */
+static inline int fsmount(int fsfd, unsigned int flags, unsigned int ms_flags)
+{
+ return syscall(__NR_fsmount, fsfd, flags, ms_flags);
+}
+/* Issue the fsconfig syscall directly through syscall() and return its result. */
+static inline int fsconfig(int fsfd, unsigned int cmd,
+ const char *key, const void *val, int aux)
+{
+ return syscall(__NR_fsconfig, fsfd, cmd, key, val, aux);
+}
+/* Issue the move_mount syscall directly through syscall() and return its result. */
+static inline int move_mount(int from_dfd, const char *from_pathname,
+ int to_dfd, const char *to_pathname,
+ unsigned int flags)
+{
+ return syscall(__NR_move_mount,
+ from_dfd, from_pathname,
+ to_dfd, to_pathname, flags);
+}
+/* Call fsconfig(); on -1 bail out via mount_error(), labelled with key, or "create" if key is NULL. */
+#define E_fsconfig(fd, cmd, key, val, aux) \
+ do { \
+ if (fsconfig(fd, cmd, key, val, aux) == -1) \
+ mount_error(fd, key ?: "create"); \
+ } while (0)
+
+int main(int argc, char *argv[])
+{
+ int fsfd, mfd;
+
+ /* Mount a publicly available AFS filesystem */
+ fsfd = fsopen("afs", 0);
+ if (fsfd == -1) {
+ perror("fsopen");
+ exit(1);
+ }
+ /* Configure the context: set the "source" string, then issue the create command. */
+ E_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell.", 0);
+ E_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
+ /* Obtain a mount fd from the context (MOUNT_ATTR_RDONLY); the context fd is closed afterwards. */
+ mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY);
+ if (mfd < 0)
+ mount_error(fsfd, "fsmount");
+ E(close(fsfd));
+ /* Attach the mount at /mnt; the empty source path plus F_EMPTY_PATH refers to mfd itself. */
+ if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
+ perror("move_mount");
+ exit(1);
+ }
+
+ E(close(mfd));
+ exit(0);
+}
diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c
new file mode 100644
index 000000000000..713c174626aa
--- /dev/null
+++ b/samples/vfs/test-list-all-mounts.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "../../tools/testing/selftests/pidfd/pidfd.h"
+#include "samples-vfs.h"
+
+static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask,
+ struct statmount *stmnt, size_t bufsize,
+ unsigned int flags)
+{
+ struct mnt_id_req req = {
+ .size = MNT_ID_REQ_SIZE_VER1,
+ .mnt_id = mnt_id,
+ .param = mask, /* for this call the param field carries the request mask */
+ .mnt_ns_id = mnt_ns_id,
+ };
+ /* Thin wrapper: issue the raw statmount syscall into stmnt/bufsize and return its result. */
+ return syscall(__NR_statmount, &req, stmnt, bufsize, flags);
+}
+
+static struct statmount *sys_statmount(__u64 mnt_id, __u64 mnt_ns_id,
+ __u64 mask, unsigned int flags)
+{
+ size_t bufsize = 1 << 15;
+ struct statmount *stmnt = NULL, *tmp = NULL;
+ int ret;
+ /* Returns a heap buffer the caller must free(), or NULL on failure; starts at 32 KiB. */
+ for (;;) {
+ tmp = realloc(stmnt, bufsize);
+ if (!tmp) /* stmnt is still owned here and is freed at out: */
+ goto out;
+
+ stmnt = tmp;
+ ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt, bufsize, flags);
+ if (!ret)
+ return stmnt;
+
+ if (errno != EOVERFLOW) /* only EOVERFLOW leads to a retry with a bigger buffer */
+ goto out;
+
+ bufsize <<= 1;
+ if (bufsize >= UINT_MAX / 2) /* stop doubling once the size reaches UINT_MAX / 2 */
+ goto out;
+ }
+
+out:
+ free(stmnt);
+ return NULL;
+}
+
+static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64 mnt_ns_id,
+ __u64 list[], size_t num, unsigned int flags)
+{
+ struct mnt_id_req req = {
+ .size = MNT_ID_REQ_SIZE_VER1,
+ .mnt_id = mnt_id,
+ .param = last_mnt_id, /* for this call the param field carries the last id already seen */
+ .mnt_ns_id = mnt_ns_id,
+ };
+ /* Thin wrapper: issue the raw listmount syscall into list[num] and return its result. */
+ return syscall(__NR_listmount, &req, list, num, flags);
+}
+
+int main(int argc, char *argv[])
+{
+#define LISTMNT_BUFFER 10
+ __u64 list[LISTMNT_BUFFER], last_mnt_id = 0;
+ int ret, pidfd, fd_mntns;
+ struct mnt_ns_info info = {};
+ /* Start from this process's own mount namespace, then move on with NS_MNT_GET_NEXT. */
+ pidfd = sys_pidfd_open(getpid(), 0);
+ if (pidfd < 0)
+ die_errno("pidfd_open failed");
+
+ fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0);
+ if (fd_mntns < 0)
+ die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed");
+
+ ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info);
+ if (ret < 0)
+ die_errno("ioctl(NS_MNT_GET_INFO) failed");
+
+ printf("Listing %u mounts for mount namespace %" PRIu64 "\n",
+ info.nr_mounts, (uint64_t)info.mnt_ns_id);
+ for (;;) {
+ ssize_t nr_mounts;
+next:
+ nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id,
+ info.mnt_ns_id, list, LISTMNT_BUFFER,
+ 0);
+ if (nr_mounts <= 0) {
+ int fd_mntns_next;
+ if (nr_mounts < 0) perror("listmount"); /* a failure is not a normal end of list: report it */
+ printf("Finished listing %u mounts for mount namespace %" PRIu64 "\n\n",
+ info.nr_mounts, (uint64_t)info.mnt_ns_id);
+ fd_mntns_next = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info);
+ if (fd_mntns_next < 0) {
+ if (errno == ENOENT) {
+ printf("Finished listing all mount namespaces\n");
+ exit(0);
+ }
+ die_errno("ioctl(NS_MNT_GET_NEXT) failed");
+ }
+ close(fd_mntns);
+ fd_mntns = fd_mntns_next;
+ last_mnt_id = 0;
+ printf("Listing %u mounts for mount namespace %" PRIu64 "\n",
+ info.nr_mounts, (uint64_t)info.mnt_ns_id);
+ goto next;
+ }
+ /* nr_mounts is > 0 here, so the cast below cannot change its value. */
+ for (size_t cur = 0; cur < (size_t)nr_mounts; cur++) {
+ struct statmount *stmnt;
+
+ last_mnt_id = list[cur]; /* also the resume point for the next listmount batch */
+
+ stmnt = sys_statmount(last_mnt_id, info.mnt_ns_id,
+ STATMOUNT_SB_BASIC |
+ STATMOUNT_MNT_BASIC |
+ STATMOUNT_MNT_ROOT |
+ STATMOUNT_MNT_POINT |
+ STATMOUNT_MNT_NS_ID |
+ STATMOUNT_MNT_OPTS |
+ STATMOUNT_FS_TYPE |
+ STATMOUNT_MNT_UIDMAP |
+ STATMOUNT_MNT_GIDMAP, 0);
+ if (!stmnt) {
+ printf("Failed to statmount(%" PRIu64 ") in mount namespace(%" PRIu64 ")\n",
+ (uint64_t)last_mnt_id, (uint64_t)info.mnt_ns_id);
+ continue;
+ }
+ /* String fields are offsets into stmnt->str; print "" for any the returned mask lacks. */
+ printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n",
+ (uint64_t)stmnt->mnt_id,
+ (uint64_t)stmnt->mnt_parent_id,
+ (stmnt->mask & STATMOUNT_FS_TYPE) ? stmnt->str + stmnt->fs_type : "",
+ (stmnt->mask & STATMOUNT_MNT_ROOT) ? stmnt->str + stmnt->mnt_root : "",
+ (stmnt->mask & STATMOUNT_MNT_POINT) ? stmnt->str + stmnt->mnt_point : "",
+ (stmnt->mask & STATMOUNT_MNT_OPTS) ? stmnt->str + stmnt->mnt_opts : "");
+
+ if (stmnt->mask & STATMOUNT_MNT_UIDMAP) {
+ const char *idmap = stmnt->str + stmnt->mnt_uidmap;
+ /* The entries are walked as consecutive NUL-terminated strings. */
+ for (size_t idx = 0; idx < stmnt->mnt_uidmap_num; idx++) {
+ printf("mnt_uidmap[%zu]:\t%s\n", idx, idmap);
+ idmap += strlen(idmap) + 1;
+ }
+ }
+
+ if (stmnt->mask & STATMOUNT_MNT_GIDMAP) {
+ const char *idmap = stmnt->str + stmnt->mnt_gidmap;
+ /* The entries are walked as consecutive NUL-terminated strings. */
+ for (size_t idx = 0; idx < stmnt->mnt_gidmap_num; idx++) {
+ printf("mnt_gidmap[%zu]:\t%s\n", idx, idmap);
+ idmap += strlen(idmap) + 1;
+ }
+ }
+
+ printf("\n");
+
+ free(stmnt);
+ }
+ }
+
+ exit(0);
+}
diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c
new file mode 100644
index 000000000000..424a6fa15723
--- /dev/null
+++ b/samples/vfs/test-statx.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Test the statx() system call.
+ *
+ * Note that the output of this program is intended to look like the output of
+ * /bin/stat where possible.
+ *
+ * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define _GNU_SOURCE
+#define _ATFILE_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+// Work around glibc header silliness
+#undef AT_RENAME_NOREPLACE
+#undef AT_RENAME_EXCHANGE
+#undef AT_RENAME_WHITEOUT
+
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#define statx foo
+#define statx_timestamp foo_timestamp
+struct statx;
+struct statx_timestamp;
+#include <sys/stat.h>
+#undef statx
+#undef statx_timestamp
+
+#define AT_STATX_SYNC_TYPE 0x6000
+#define AT_STATX_SYNC_AS_STAT 0x0000
+#define AT_STATX_FORCE_SYNC 0x2000
+#define AT_STATX_DONT_SYNC 0x4000
+
+#ifndef __NR_statx
+#define __NR_statx -1 /* placeholder used when the headers do not provide the number */
+#endif
+
+static __attribute__((unused))
+ssize_t statx(int dfd, const char *filename, unsigned flags,
+ unsigned int mask, struct statx *buffer)
+{
+ return syscall(__NR_statx, dfd, filename, flags, mask, buffer); /* raw syscall; result returned as-is */
+}
+
+static void print_time(const char *field, struct statx_timestamp *ts)
+{
+ struct tm tm;
+ time_t tim;
+ char buffer[100];
+ int len;
+ /* Print field, then ts as local date, time, nanoseconds and UTC offset; exit(1) on failure. */
+ tim = ts->tv_sec;
+ if (!localtime_r(&tim, &tm)) {
+ perror("localtime_r");
+ exit(1);
+ }
+ len = strftime(buffer, 100, "%F %T", &tm);
+ if (len == 0) { /* strftime() returns 0 when the result does not fit in the buffer */
+ perror("strftime");
+ exit(1);
+ }
+ printf("%s", field);
+ fwrite(buffer, 1, len, stdout);
+ printf(".%09u", ts->tv_nsec); /* nanoseconds go between the time and the zone offset */
+ len = strftime(buffer, 100, "%z", &tm);
+ if (len == 0) {
+ perror("strftime2");
+ exit(1);
+ }
+ fwrite(buffer, 1, len, stdout);
+ printf("\n");
+}
+
+static void dump_statx(struct statx *stx)
+{
+ char buffer[256], ft = '?';
+ /* Print the fields flagged in stx_mask in a /bin/stat-like layout. */
+ printf("results=%x\n", stx->stx_mask);
+
+ printf(" ");
+ if (stx->stx_mask & STATX_SIZE)
+ printf(" Size: %-15llu", (unsigned long long)stx->stx_size);
+ if (stx->stx_mask & STATX_BLOCKS)
+ printf(" Blocks: %-10llu", (unsigned long long)stx->stx_blocks);
+ printf(" IO Block: %-6llu", (unsigned long long)stx->stx_blksize);
+ if (stx->stx_mask & STATX_TYPE) {
+ switch (stx->stx_mode & S_IFMT) {
+ case S_IFIFO: printf(" FIFO\n"); ft = 'p'; break;
+ case S_IFCHR: printf(" character special file\n"); ft = 'c'; break;
+ case S_IFDIR: printf(" directory\n"); ft = 'd'; break;
+ case S_IFBLK: printf(" block special file\n"); ft = 'b'; break;
+ case S_IFREG: printf(" regular file\n"); ft = '-'; break;
+ case S_IFLNK: printf(" symbolic link\n"); ft = 'l'; break;
+ case S_IFSOCK: printf(" socket\n"); ft = 's'; break;
+ default:
+ printf(" unknown type (%o)\n", stx->stx_mode & S_IFMT);
+ break;
+ }
+ } else {
+ printf(" no type\n");
+ }
+
+ sprintf(buffer, "%02x:%02x", stx->stx_dev_major, stx->stx_dev_minor);
+ printf("Device: %-15s", buffer);
+ if (stx->stx_mask & STATX_INO)
+ printf(" Inode: %-11llu", (unsigned long long) stx->stx_ino);
+ if (stx->stx_mask & STATX_NLINK)
+ printf(" Links: %-5u", stx->stx_nlink);
+ if (stx->stx_mask & STATX_TYPE) {
+ switch (stx->stx_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ printf(" Device type: %u,%u",
+ stx->stx_rdev_major, stx->stx_rdev_minor);
+ break;
+ }
+ }
+ printf("\n");
+
+ if (stx->stx_mask & STATX_MODE)
+ printf("Access: (%04o/%c%c%c%c%c%c%c%c%c%c) ",
+ stx->stx_mode & 07777,
+ ft,
+ stx->stx_mode & S_IRUSR ? 'r' : '-',
+ stx->stx_mode & S_IWUSR ? 'w' : '-',
+ stx->stx_mode & S_IXUSR ? 'x' : '-',
+ stx->stx_mode & S_IRGRP ? 'r' : '-',
+ stx->stx_mode & S_IWGRP ? 'w' : '-',
+ stx->stx_mode & S_IXGRP ? 'x' : '-',
+ stx->stx_mode & S_IROTH ? 'r' : '-',
+ stx->stx_mode & S_IWOTH ? 'w' : '-',
+ stx->stx_mode & S_IXOTH ? 'x' : '-');
+ if (stx->stx_mask & STATX_UID)
+ printf("Uid: %5u ", stx->stx_uid); /* ids are unsigned: %u keeps large values positive */
+ if (stx->stx_mask & STATX_GID)
+ printf("Gid: %5u\n", stx->stx_gid);
+
+ if (stx->stx_mask & STATX_ATIME)
+ print_time("Access: ", &stx->stx_atime);
+ if (stx->stx_mask & STATX_MTIME)
+ print_time("Modify: ", &stx->stx_mtime);
+ if (stx->stx_mask & STATX_CTIME)
+ print_time("Change: ", &stx->stx_ctime);
+ if (stx->stx_mask & STATX_BTIME)
+ print_time(" Birth: ", &stx->stx_btime);
+
+ if (stx->stx_attributes_mask) {
+ unsigned char bits, mbits;
+ int loop, byte;
+ /* One character per attribute bit, bit 63 first; '?' is what set bits without a letter print. */
+ static char attr_representation[64 + 1] =
+ /* STATX_ATTR_ flags: */
+ "????????" /* 63-56 */
+ "????????" /* 55-48 */
+ "????????" /* 47-40 */
+ "????????" /* 39-32 */
+ "????????" /* 31-24 0x00000000-ff000000 */
+ "????????" /* 23-16 0x00000000-00ff0000 */
+ "???me???" /* 15- 8 0x00000000-0000ff00 */
+ "?dai?c??" /* 7- 0 0x00000000-000000ff */
+ ;
+
+ printf("Attributes: %016llx (",
+ (unsigned long long)stx->stx_attributes);
+ for (byte = 64 - 8; byte >= 0; byte -= 8) {
+ bits = stx->stx_attributes >> byte;
+ mbits = stx->stx_attributes_mask >> byte;
+ for (loop = 7; loop >= 0; loop--) {
+ int bit = byte + loop;
+
+ if (!(mbits & 0x80))
+ putchar('.'); /* Not supported */
+ else if (bits & 0x80)
+ putchar(attr_representation[63 - bit]);
+ else
+ putchar('-'); /* Not set */
+ bits <<= 1;
+ mbits <<= 1;
+ }
+ if (byte)
+ putchar(' ');
+ }
+ printf(")\n");
+ }
+}
+
+static void dump_hex(unsigned long long *data, int from, int to)
+{
+ unsigned offset, print_offset = 1, col = 0;
+ /* from/to are byte offsets; convert them to indices of 8-byte words (to is rounded up). */
+ from /= 8;
+ to = (to + 7) / 8;
+ /* Four words per output line, each line prefixed with its byte offset in hex. */
+ for (offset = from; offset < to; offset++) {
+ if (print_offset) {
+ printf("%04x: ", offset * 8);
+ print_offset = 0;
+ }
+ printf("%016llx", data[offset]);
+ col++;
+ if ((col & 3) == 0) {
+ printf("\n");
+ print_offset = 1;
+ } else {
+ printf(" ");
+ }
+ }
+
+ if (!print_offset) /* finish a partially filled last line */
+ printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+ struct statx stx;
+ int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW;
+ /* Defaults: AT_SYMLINK_NOFOLLOW, and a mask asking for the basic stats plus birth time. */
+ unsigned int mask = STATX_BASIC_STATS | STATX_BTIME;
+ /* Arguments are handled in order: options adjust the flags for later paths; anything else is a path. */
+ for (argv++; *argv; argv++) {
+ if (strcmp(*argv, "-F") == 0) {
+ atflag &= ~AT_STATX_SYNC_TYPE;
+ atflag |= AT_STATX_FORCE_SYNC;
+ continue;
+ }
+ if (strcmp(*argv, "-D") == 0) {
+ atflag &= ~AT_STATX_SYNC_TYPE;
+ atflag |= AT_STATX_DONT_SYNC;
+ continue;
+ }
+ if (strcmp(*argv, "-L") == 0) {
+ atflag &= ~AT_SYMLINK_NOFOLLOW;
+ continue;
+ }
+ if (strcmp(*argv, "-O") == 0) {
+ mask &= ~STATX_BASIC_STATS;
+ continue;
+ }
+ if (strcmp(*argv, "-A") == 0) {
+ atflag |= AT_NO_AUTOMOUNT;
+ continue;
+ }
+ if (strcmp(*argv, "-R") == 0) {
+ raw = 1;
+ continue;
+ }
+ /* Fill the buffer with 0xbf first; presumably so fields left unwritten stand out in the dump. */
+ memset(&stx, 0xbf, sizeof(stx));
+ ret = statx(AT_FDCWD, *argv, atflag, mask, &stx);
+ printf("statx(%s) = %d\n", *argv, ret);
+ if (ret < 0) {
+ perror(*argv);
+ exit(1);
+ }
+
+ if (raw) /* -R: hex-dump the whole structure before the decoded output */
+ dump_hex((unsigned long long *)&stx, 0, sizeof(stx));
+
+ dump_statx(&stx);
+ }
+ return 0;
+}
diff --git a/samples/watch_queue/.gitignore b/samples/watch_queue/.gitignore
new file mode 100644
index 000000000000..823b351d3db9
--- /dev/null
+++ b/samples/watch_queue/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/watch_test
diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile
new file mode 100644
index 000000000000..c0db3a6bc524
--- /dev/null
+++ b/samples/watch_queue/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+userprogs-always-y += watch_test
+
+userccflags += -I usr/include
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c
new file mode 100644
index 000000000000..24cf7d7a1972
--- /dev/null
+++ b/samples/watch_queue/watch_test.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Use watch_queue API to watch for notifications.
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define _GNU_SOURCE
+#include <stdbool.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <limits.h>
+
+// Work around glibc header silliness
+#undef AT_RENAME_NOREPLACE
+#undef AT_RENAME_EXCHANGE
+#undef AT_RENAME_WHITEOUT
+
+#include <linux/watch_queue.h>
+#include <linux/unistd.h>
+#include <linux/keyctl.h>
+
+#ifndef KEYCTL_WATCH_KEY
+#define KEYCTL_WATCH_KEY -1 /* placeholder used when the headers do not define it */
+#endif
+#ifndef __NR_keyctl
+#define __NR_keyctl -1 /* placeholder used when the headers do not define it */
+#endif
+
+#define BUF_SIZE 256 /* value passed to IOC_WATCH_QUEUE_SET_SIZE in main() */
+
+static long keyctl_watch_key(int key, int watch_fd, int watch_id)
+{
+ return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id); /* raw keyctl syscall; result returned as-is */
+}
+
+static const char *key_subtypes[256] = { /* indexed by notification subtype; unlisted slots are NULL */
+ [NOTIFY_KEY_INSTANTIATED] = "instantiated",
+ [NOTIFY_KEY_UPDATED] = "updated",
+ [NOTIFY_KEY_LINKED] = "linked",
+ [NOTIFY_KEY_UNLINKED] = "unlinked",
+ [NOTIFY_KEY_CLEARED] = "cleared",
+ [NOTIFY_KEY_REVOKED] = "revoked",
+ [NOTIFY_KEY_INVALIDATED] = "invalidated",
+ [NOTIFY_KEY_SETATTR] = "setattr",
+};
+/* Print one key notification; len must equal sizeof(struct key_notification). */
+static void saw_key_change(struct watch_notification *n, size_t len)
+{
+ struct key_notification *k = (struct key_notification *)n;
+ const char *what = key_subtypes[n->subtype]; /* NULL for subtypes the table does not name */
+ if (len != sizeof(struct key_notification)) {
+ fprintf(stderr, "Incorrect key message length\n");
+ return;
+ }
+ /* Passing NULL to %s is undefined, so unnamed subtypes print a placeholder instead. */
+ printf("KEY %08x change=%u[%s] aux=%u\n",
+ k->key_id, n->subtype, what ? what : "unknown", k->aux);
+}
+
+/*
+ * Read batches of notifications from fd and print each record, until EOF or a malformed batch.
+ */
+static void consumer(int fd)
+{
+ unsigned char buffer[433], *p, *end;
+ union {
+ struct watch_notification n;
+ unsigned char buf1[128];
+ } n;
+ ssize_t buf_len;
+
+ for (;;) {
+ buf_len = read(fd, buffer, sizeof(buffer));
+ if (buf_len == -1) {
+ perror("read");
+ exit(1);
+ }
+
+ if (buf_len == 0) {
+ printf("-- END --\n");
+ return;
+ }
+
+ if (buf_len > sizeof(buffer)) {
+ fprintf(stderr, "Read buffer overrun: %zd\n", buf_len);
+ return;
+ }
+
+ printf("read() = %zd\n", buf_len);
+ /* Walk the records packed back to back in buffer[0..buf_len). */
+ p = buffer;
+ end = buffer + buf_len;
+ while (p < end) {
+ size_t largest, len;
+
+ largest = end - p;
+ if (largest > 128) /* never copy more than the 128-byte scratch union can hold */
+ largest = 128;
+ if (largest < sizeof(struct watch_notification)) {
+ fprintf(stderr, "Short message header: %zu\n", largest);
+ return;
+ }
+ memcpy(&n, p, largest); /* copy out so the header is read from the union, not from p */
+
+ printf("NOTIFY[%03zx]: ty=%06x sy=%02x i=%08x\n",
+ p - buffer, n.n.type, n.n.subtype, n.n.info);
+ /* The record length comes from the info word and must fit in what was copied. */
+ len = n.n.info & WATCH_INFO_LENGTH;
+ if (len < sizeof(n.n) || len > largest) {
+ fprintf(stderr, "Bad message length: %zu/%zu\n", len, largest);
+ exit(1);
+ }
+
+ switch (n.n.type) {
+ case WATCH_TYPE_META:
+ switch (n.n.subtype) {
+ case WATCH_META_REMOVAL_NOTIFICATION:
+ printf("REMOVAL of watchpoint %08x\n",
+ (n.n.info & WATCH_INFO_ID) >>
+ WATCH_INFO_ID__SHIFT);
+ break;
+ case WATCH_META_LOSS_NOTIFICATION:
+ printf("-- LOSS --\n");
+ break;
+ default:
+ printf("other meta record\n");
+ break;
+ }
+ break;
+ case WATCH_TYPE_KEY_NOTIFY:
+ saw_key_change(&n.n, len);
+ break;
+ default:
+ printf("other type\n");
+ break;
+ }
+
+ p += len; /* advance to the next record in this batch */
+ }
+ }
+}
+
+static struct watch_notification_filter filter = {
+ .nr_filters = 1, /* a single entry, for key notifications */
+ .filters = {
+ [0] = {
+ .type = WATCH_TYPE_KEY_NOTIFY,
+ .subtype_filter[0] = UINT_MAX, /* every bit set in the first subtype word */
+ },
+ },
+};
+
+int main(int argc, char **argv)
+{
+ int pipefd[2], fd;
+ /* Create a notification pipe; only its read end (pipefd[0]) is used below. */
+ if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) {
+ perror("pipe2");
+ exit(1);
+ }
+ fd = pipefd[0];
+
+ if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) {
+ perror("watch_queue(size)");
+ exit(1);
+ }
+
+ if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) {
+ perror("watch_queue(filter)");
+ exit(1);
+ }
+ /* Watch the session keyring (watch id 0x01) and the user keyring (watch id 0x02). */
+ if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) {
+ perror("keyctl");
+ exit(1);
+ }
+
+ if (keyctl_watch_key(KEY_SPEC_USER_KEYRING, fd, 0x02) == -1) {
+ perror("keyctl");
+ exit(1);
+ }
+ /* Print notifications until consumer() returns. */
+ consumer(fd);
+ exit(0);
+}
diff --git a/samples/watchdog/.gitignore b/samples/watchdog/.gitignore
new file mode 100644
index 000000000000..a70a0150ed9f
--- /dev/null
+++ b/samples/watchdog/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/watchdog-simple
diff --git a/samples/watchdog/Makefile b/samples/watchdog/Makefile
new file mode 100644
index 000000000000..ab39d23dc96b
--- /dev/null
+++ b/samples/watchdog/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += watchdog-simple
diff --git a/samples/watchdog/watchdog-simple.c b/samples/watchdog/watchdog-simple.c
new file mode 100644
index 000000000000..9ce66d2ca2a9
--- /dev/null
+++ b/samples/watchdog/watchdog-simple.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+int main(void)
+{
+ int fd = open("/dev/watchdog", O_WRONLY);
+ int ret = 0;
+ if (fd == -1) {
+ perror("watchdog");
+ exit(EXIT_FAILURE);
+ }
+ while (1) { /* write one byte to the device every 10 seconds until a write fails */
+ ret = write(fd, "\0", 1);
+ if (ret != 1) {
+ ret = -1;
+ break;
+ }
+ sleep(10);
+ }
+ close(fd);
+ return ret; /* only reached after a failed write, so this is -1 */
+}